From 1041c54b73e8256cc641e0693bf9aa85f3efee61 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Tue, 9 Jun 2026 05:15:14 -0600
Subject: [PATCH 1/8] feat(bench): GEPA over the analyst/steerer prompt on the canonical stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The analyst IS the steerer (observe()'s findings → recommended_action → the
depth steer), so optimizing the analyst prompt optimizes the loop. This evolves
it with agent-eval's REAL GEPA primitives (buildReflectionPrompt +
parseReflectionResponse + paretoFrontier) — no hand-rolled optimizer; there is
no turnkey runPromptEvolution in agent-eval 0.83, only the primitives, so the
population loop is thin orchestration over them.

- observe(): + analystInstruction? override (the analyst prompt is now the GEPA
  knob); defaultAnalystInstruction exported. Firewall stays structural (input
  has no score).
- agentic.ts: AgenticOptions.analystInstruction threads into the depth steerer.
- eops-gepa.mts: FITNESS = depth-vs-breadth lift on the canonical
  Supervisor+observe gate; breadth computed ONCE per task (shared baseline,
  correct + halves cost); failing per-task lifts = the reflection gradient.
  Seeds = observe()'s PROVEN default (the +16.4pp instruction) FIRST, then the
  designer-panel population.

Smoke (N=2, 1 gen) validated the full loop: score → paretoFrontier select →
reflect → mutate → re-score → pick. Bounded real run (N=6, 2 gens) in flight.
--- bench/src/agentic.ts | 5 +- bench/src/eops-gepa.mts | 177 ++++++++++++++++++++++++++++++++++++++++ src/runtime/index.ts | 1 + src/runtime/observe.ts | 21 +++-- 4 files changed, 197 insertions(+), 7 deletions(-) create mode 100644 bench/src/eops-gepa.mts diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts index c91f5e3..69b6a48 100644 --- a/bench/src/agentic.ts +++ b/bench/src/agentic.ts @@ -89,6 +89,9 @@ export interface AgenticOptions { temperature?: number /** Turns the agent may take within ONE shot before the driver intervenes. */ innerTurns?: number + /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a + * prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */ + analystInstruction?: string } // ── The unit: one agentic shot (a bounded tool loop) over a handle ─────────────── @@ -187,7 +190,7 @@ async function analyze(task: AgenticTask, messages: Msg[], opts: AgenticOptions) const chat = createChatClient({ transport: 'router', apiKey: opts.routerKey, baseUrl: opts.routerBaseUrl, defaultModel: opts.model }) const obs = await observe( { task: task.userPrompt, output: trajectory, trace: messages, outcome: 'failed' }, - { chat, model: opts.model }, + { chat, model: opts.model, ...(opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {}) }, ) // The steer = the analyst's recommended actions for the agent. Empty ⇒ nothing left to do. const steer = obs.findings diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts new file mode 100644 index 0000000..5ce082a --- /dev/null +++ b/bench/src/eops-gepa.mts @@ -0,0 +1,177 @@ +/** + * GEPA over the ANALYST/STEERER prompt — the flywheel, on the CANONICAL loop system. + * + * The analyst IS the steerer: observe()'s system instruction turns an agent's trace into + * the recommended_action that steers the next depth shot. 
This evolves THAT instruction + * against the live EOPS gate, using agent-eval's GEPA primitives (NOT a hand-rolled loop): + * - buildReflectionPrompt / parseReflectionResponse — the reflective mutation (GEPA brain) + * - paretoFrontier — non-dominated selection over [maximize lift, minimize cost] + * + * FITNESS = the depth-vs-breadth lift on the canonical stack (Supervisor + observe()): for a + * candidate analyst instruction, run depth (steered by it) on each task and subtract the + * SHARED breadth baseline (computed ONCE per task — breadth has no analyst). The failing + * tasks (low per-task lift) are the gradient the reflection reads. + * + * docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest + * EOPS_GYM_DBS_DIR= TANGLE_API_KEY=… \ + * N=4 GENS=2 CHILDREN=2 MAXSHOTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-gepa.mts + */ +import { readFileSync, writeFileSync } from 'node:fs' +import { buildReflectionPrompt, paretoFrontier, parseReflectionResponse } from '@tangle-network/agent-eval' +import { defaultAnalystInstruction } from '@tangle-network/agent-runtime/loops' +import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic' +import { createEopsSurface, eopsTaskFromRow } from './agentic-eops' +import { type RouterConfig, routerChatWithUsage } from './router-client' + +function must(name: string): string { + const v = process.env[name] + if (!v) throw new Error(`env ${name} is required`) + return v +} + +async function loadItsmTasks(n: number): Promise { + const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent('ServiceNow-AI/EnterpriseOps-Gym')}&config=oracle&split=itsm&offset=0&length=${n}` + const res = await fetch(url) + if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}`) + const body = (await res.json()) as { rows?: Array<{ row: Parameters[0] }> } + return (body.rows ?? 
[]).slice(0, n).map(({ row }) => eopsTaskFromRow(row)) +} + +interface Candidate { + id: string + instruction: string + gen: number + lift?: number + cost?: number + perTask?: Array<{ id: string; lift: number }> +} + +const pct = (x: number) => `${(x * 100).toFixed(0)}%` +const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` + +async function main(): Promise { + const n = Number(process.env.N ?? 4) + const gens = Number(process.env.GENS ?? 2) + const childCount = Number(process.env.CHILDREN ?? 2) + const parents = Number(process.env.PARENTS ?? 2) + const maxShots = Number(process.env.MAXSHOTS ?? 3) + const width = Number(process.env.WIDTH ?? 3) + const model = process.env.WORKER_MODEL ?? 'deepseek-v4-pro' + const routerKey = must('TANGLE_API_KEY') + const routerBaseUrl = process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1' + const opts: AgenticOptions = { routerBaseUrl, routerKey, model, innerTurns: Number(process.env.INNER_TURNS ?? 4), temperature: 0.7 } + const reflectCfg: RouterConfig = { routerBaseUrl, routerKey, model: process.env.REFLECT_MODEL ?? model } + const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR')) + + const tasks = await loadItsmTasks(n) + console.error(`=== GEPA over the analyst prompt · ${tasks.length} EOPS tasks · ${model} · gens=${gens} children=${childCount} ===\n`) + + // Shared breadth baseline per task (no analyst — same for every candidate). Compute ONCE. + console.error('▶ computing shared breadth baseline (once per task)…') + const breadthByTask = new Map() + for (const task of tasks) { + let breadthScore = 0 + let cB = 0 + for (let w = 0; w < width && cB < maxShots * (opts.innerTurns ?? 
4); w += 1) { + const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: 1 }) + cB += b.completions + if (b.score > breadthScore) breadthScore = b.score + } + breadthByTask.set(task.id, { score: breadthScore, comps: cB }) + console.error(` ${task.id.slice(-12)}: breadth ${pct(breadthScore)}`) + } + + // Fitness: depth (steered by the candidate instruction) − the shared breadth baseline. + async function fitness(instruction: string): Promise<{ lift: number; cost: number; perTask: Array<{ id: string; lift: number }> }> { + let liftSum = 0 + let cost = 0 + const perTask: Array<{ id: string; lift: number }> = [] + for (const task of tasks) { + const depth = await runAgentic({ ...opts, analystInstruction: instruction, surface, task, mode: 'depth', budget: maxShots }) + const b = breadthByTask.get(task.id)?.score ?? 0 + const taskLift = depth.score - b + liftSum += taskLift + cost += depth.completions + perTask.push({ id: task.id, lift: taskLift }) + } + return { lift: liftSum / tasks.length, cost: cost / tasks.length, perTask } + } + + // Seed population: the PROVEN baseline (observe()'s default — the +16.4pp instruction) + // FIRST, so GEPA improves from known-good, then the designer-panel steerer prompts. + const popSeeds = (JSON.parse(readFileSync('steerers/eops-itsm-population.json', 'utf8')) as Array<{ id: string; systemPrompt: string }>) + .slice(0, Math.max(0, Number(process.env.SEEDS ?? 4) - 1)) + .map((s) => ({ id: `seed:${s.id}`, instruction: s.systemPrompt, gen: 0 })) + const pop: Candidate[] = [{ id: 'seed:observe-default', instruction: defaultAnalystInstruction, gen: 0 }, ...popSeeds] + + const objectives = [ + { name: 'lift', direction: 'maximize' as const, value: (c: Candidate) => c.lift ?? -1 }, + { name: 'cost', direction: 'minimize' as const, value: (c: Candidate) => c.cost ?? 
1e9 }, + ] + + for (let gen = 0; gen <= gens; gen += 1) { + console.error(`\n── generation ${gen} · scoring ${pop.filter((c) => c.lift === undefined).length} new candidate(s)`) + for (const c of pop) { + if (c.lift !== undefined) continue + const f = await fitness(c.instruction) + c.lift = f.lift + c.cost = f.cost + c.perTask = f.perTask + console.error(` ${c.id.padEnd(28)} lift ${pp(c.lift)} cost ${c.cost.toFixed(1)}`) + } + const ranked = [...pop].filter((c) => c.lift !== undefined).sort((a, b) => (b.lift ?? -1) - (a.lift ?? -1)) + console.error(` gen ${gen} best: ${ranked[0]?.id} @ ${pp(ranked[0]?.lift ?? 0)}`) + if (gen === gens) break + + // Pareto-select parents (lift up, cost down), then reflective-mutate each. + const frontier = paretoFrontier(pop.filter((c) => c.lift !== undefined), objectives).frontier.slice(0, parents) + const children: Candidate[] = [] + for (const parent of frontier) { + const sorted = [...(parent.perTask ?? [])].sort((a, b) => b.lift - a.lift) + const top = sorted.slice(0, 2).map((t) => ({ id: t.id.slice(-12), score: t.lift })) + const bottom = sorted.slice(-2).map((t) => ({ id: t.id.slice(-12), score: t.lift })) + const rp = buildReflectionPrompt({ + target: + 'The SYSTEM INSTRUCTION for a trace-analyst that reads an agent\'s tool-call trace on an unfinished IT-ops task and outputs ONE concrete corrective instruction to steer the next attempt. It must never see the grader. 
payload MUST be the full replacement instruction string.', + parentPayload: parent.instruction, + topTrials: top, + bottomTrials: bottom, + childCount, + mutationPrimitives: [ + 'Make the diagnosis more specific (name the exact record/field/target value still wrong).', + 'Add an anti-degradation rule (freeze already-correct records; do not re-touch them).', + 'Tighten the stop condition (when to declare done vs keep acting).', + 'Add a verify-before-mutate step (read current state, then change only what is wrong).', + ], + }) + const resp = await routerChatWithUsage(reflectCfg, [{ role: 'user', content: rp }], { temperature: 0.8 }) + const proposals = parseReflectionResponse(resp.content, childCount) + for (const p of proposals) { + const instruction = typeof p.payload === 'string' ? p.payload : JSON.stringify(p.payload) + if (instruction.trim().length < 40) continue // reject degenerate mutations + children.push({ id: `g${gen + 1}:${parent.id.replace(/^seed:|^g\d+:/, '')}-${children.length}`, instruction, gen: gen + 1 }) + } + } + console.error(` reflective-mutated ${frontier.length} parent(s) → ${children.length} child(ren)`) + // Elitism: carry the frontier forward (already scored) + the new children. + pop.length = 0 + pop.push(...frontier, ...children) + } + + const scored = pop.filter((c) => c.lift !== undefined).sort((a, b) => (b.lift ?? -1) - (a.lift ?? -1)) + const best = scored[0] + console.error(`\n${'='.repeat(72)}`) + console.error(`GEPA RESULT · ${tasks.length} tasks · ${model}`) + console.error('='.repeat(72)) + for (const c of scored) console.error(` ${c.id.padEnd(30)} gen${c.gen} lift ${pp(c.lift ?? 0)} cost ${(c.cost ?? 0).toFixed(1)}`) + console.error(`\n WINNER: ${best?.id} @ lift ${pp(best?.lift ?? 
0)} (gen ${best?.gen})`) + const out = { model, tasks: tasks.length, gens, best: { id: best?.id, gen: best?.gen, lift: best?.lift, instruction: best?.instruction }, all: scored.map((c) => ({ id: c.id, gen: c.gen, lift: c.lift, cost: c.cost })) } + const outPath = process.env.OUT ?? '/tmp/eops-gepa-result.json' + writeFileSync(outPath, JSON.stringify(out, null, 2)) + console.error(` best instruction + ranking → ${outPath}`) +} + +main().catch((e) => { + console.error(`eops-gepa: ${e instanceof Error ? (e.stack ?? e.message) : String(e)}`) + process.exit(1) +}) diff --git a/src/runtime/index.ts b/src/runtime/index.ts index 2e50fe7..21e779f 100644 --- a/src/runtime/index.ts +++ b/src/runtime/index.ts @@ -63,6 +63,7 @@ export { // The third-person observer: a worker's trace → trace-grounded findings, an // operator report, and durable corpus facts for the next run (the closed loop). export { + defaultAnalystInstruction, type Observation, type ObserveInput, type ObserveOptions, diff --git a/src/runtime/observe.ts b/src/runtime/observe.ts index 44aaccd..9fb31a4 100644 --- a/src/runtime/observe.ts +++ b/src/runtime/observe.ts @@ -45,8 +45,22 @@ export interface ObserveOptions { signal?: AbortSignal /** Cap the trace lines fed to the observer (keeps the call cheap). Default 80. */ maxTraceLines?: number + /** Override the analyst's system instruction — the prompt that turns a trace into + * findings + recommended_actions. The analyst IS the steerer, so this is the knob a + * prompt optimizer (GEPA) tunes. Omitted ⇒ the default observer instruction. The + * firewall (trace-only, never the verdict) is structural (input has no score), so a + * custom instruction cannot break it. */ + analystInstruction?: string } +/** The default observer instruction — exported so an optimizer can seed its population. */ +export const defaultAnalystInstruction = + 'You are a third-person OBSERVER watching an AI agent work. You see its TRACE (what it did), not its grader. 
' + + 'From the trace, name SPECIFIC, behavior-grounded findings: wasted/duplicated tool calls, thrash/retries, ' + + 'token/cost waste, missing verification, failure patterns. For each, a concrete recommended_action, and ' + + 'whether the AGENT (fix its skills/prompt/tools) or the OPERATOR (fix framing/decomposition/config) should act. ' + + 'Only claim what the trace shows. No findings if the run was clean.' + export interface Observation { findings: AnalystFinding[] /** Facts persisted to the corpus (empty when no corpus was supplied). */ @@ -131,12 +145,7 @@ export async function observe(input: ObserveInput, opts: ObserveOptions): Promis messages: [ { role: 'system', - content: - 'You are a third-person OBSERVER watching an AI agent work. You see its TRACE (what it did), not its grader. ' + - 'From the trace, name SPECIFIC, behavior-grounded findings: wasted/duplicated tool calls, thrash/retries, ' + - 'token/cost waste, missing verification, failure patterns. For each, a concrete recommended_action, and ' + - 'whether the AGENT (fix its skills/prompt/tools) or the OPERATOR (fix framing/decomposition/config) should act. ' + - 'Only claim what the trace shows. No findings if the run was clean.', + content: opts.analystInstruction ?? defaultAnalystInstruction, }, { role: 'user', From ea4845f37a317f534932043f6dc5e7a4d2055710 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 05:24:29 -0600 Subject: [PATCH 2/8] fix(bench): GEPA harness survives gym/router infra blips (skip failed tasks) The first real run died when the (long-lived) gym container wedged: breadth baselines returned 0% then runAgentic threw 'every rollout went down', killing the whole GEPA run. runAgentic is fail-loud; the GEPA loop now catches per-task: a task whose rollouts fail is SKIPPED (not fatal), both in the breadth precompute and the depth fitness. Fails loud only if <2 tasks survive (genuine infra-down). Pair with a fresh gym container + WIDTH<=2. 
--- bench/src/eops-gepa.mts | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/bench/src/eops-gepa.mts b/bench/src/eops-gepa.mts index 5ce082a..821bd59 100644 --- a/bench/src/eops-gepa.mts +++ b/bench/src/eops-gepa.mts @@ -67,34 +67,48 @@ async function main(): Promise { console.error(`=== GEPA over the analyst prompt · ${tasks.length} EOPS tasks · ${model} · gens=${gens} children=${childCount} ===\n`) // Shared breadth baseline per task (no analyst — same for every candidate). Compute ONCE. + // Resilient: a task whose rollouts all fail (transient gym/router infra) is SKIPPED, not + // fatal — runAgentic is fail-loud, so we catch + drop the task and press on. console.error('▶ computing shared breadth baseline (once per task)…') const breadthByTask = new Map() + const liveTasks: AgenticTask[] = [] for (const task of tasks) { - let breadthScore = 0 - let cB = 0 - for (let w = 0; w < width && cB < maxShots * (opts.innerTurns ?? 4); w += 1) { - const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: 1 }) - cB += b.completions - if (b.score > breadthScore) breadthScore = b.score + try { + let breadthScore = 0 + let cB = 0 + for (let w = 0; w < width && cB < maxShots * (opts.innerTurns ?? 4); w += 1) { + const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: 1 }) + cB += b.completions + if (b.score > breadthScore) breadthScore = b.score + } + breadthByTask.set(task.id, { score: breadthScore, comps: cB }) + liveTasks.push(task) + console.error(` ${task.id.slice(-12)}: breadth ${pct(breadthScore)}`) + } catch (e) { + console.error(` ${task.id.slice(-12)}: SKIP (${e instanceof Error ? 
e.message.slice(0, 70) : e})`) } - breadthByTask.set(task.id, { score: breadthScore, comps: cB }) - console.error(` ${task.id.slice(-12)}: breadth ${pct(breadthScore)}`) } + if (liveTasks.length < 2) throw new Error(`only ${liveTasks.length} task(s) survived breadth baseline — gym/router infra is down (restart the gym container)`) // Fitness: depth (steered by the candidate instruction) − the shared breadth baseline. async function fitness(instruction: string): Promise<{ lift: number; cost: number; perTask: Array<{ id: string; lift: number }> }> { let liftSum = 0 let cost = 0 const perTask: Array<{ id: string; lift: number }> = [] - for (const task of tasks) { - const depth = await runAgentic({ ...opts, analystInstruction: instruction, surface, task, mode: 'depth', budget: maxShots }) - const b = breadthByTask.get(task.id)?.score ?? 0 - const taskLift = depth.score - b - liftSum += taskLift - cost += depth.completions - perTask.push({ id: task.id, lift: taskLift }) + let scored = 0 + for (const task of liveTasks) { + try { + const depth = await runAgentic({ ...opts, analystInstruction: instruction, surface, task, mode: 'depth', budget: maxShots }) + const b = breadthByTask.get(task.id)?.score ?? 0 + liftSum += depth.score - b + cost += depth.completions + perTask.push({ id: task.id, lift: depth.score - b }) + scored += 1 + } catch (e) { + console.error(` depth SKIP ${task.id.slice(-12)} (${e instanceof Error ? e.message.slice(0, 50) : e})`) + } } - return { lift: liftSum / tasks.length, cost: cost / tasks.length, perTask } + return { lift: scored ? liftSum / scored : -1, cost: scored ? 
cost / scored : 1e9, perTask } } // Seed population: the PROVEN baseline (observe()'s default — the +16.4pp instruction) From dfca5406efbf009eef622d8edff14cb7d63f7b8f Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 05:31:32 -0600 Subject: [PATCH 3/8] =?UTF-8?q?refactor(bench):=20delete=20eops-gate.mts?= =?UTF-8?q?=20=E2=80=94=20the=20throwaway=20flat-loop=20prototype=20(?= =?UTF-8?q?=E2=88=92433=20LOC)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It was a dead-end (nothing imports it): a hand-rolled flat loop that BYPASSED the canonical Supervisor + a second copy of the gym client (6 functions duplicating gym-agent.ts's 5). Fully superseded by the canonical stack — agentic.ts (domain-blind depth/breadth/Supervisor/observe, 428 LOC, written ONCE) + the AgenticSurface seam (agentic-eops.ts, 73 LOC = the entire per-domain slot-in). The +16.4pp result and the GEPA harness run on the canonical path; this prototype only de-risked the plumbing (gym standup, router-tools worker, depth-best scoring) and is now dead weight. --- bench/src/eops-gate.mts | 433 ---------------------------------------- 1 file changed, 433 deletions(-) delete mode 100644 bench/src/eops-gate.mts diff --git a/bench/src/eops-gate.mts b/bench/src/eops-gate.mts deleted file mode 100644 index 355c18e..0000000 --- a/bench/src/eops-gate.mts +++ /dev/null @@ -1,433 +0,0 @@ -/** - * EnterpriseOps-Gym depth-vs-breadth gate — the agentic, stateful domain where - * steering is hypothesized to beat compute (the opposite regime to HumanEval, - * where breadth/resampling won). The worker is the TOOL-USING router backend - * (`routerToolLoop`): it calls the gym's live MCP tools, sees the results, and - * acts — off-box (router inference + host→gym HTTP), no sandbox. - * - * breadth@K — K independent shots, each a short agentic loop on its OWN fresh - * seeded DB; keep the best by the deployable verifier (resample). 
- * depth@K — ONE sustained agentic loop over ONE DB, ~K× the turn budget; the - * artifact (DB state) accumulates, so each action conditions the next. - * - * Equal compute = equal total inference turns (K·M). Score = the task's own SQL - * verifiers (deployable check), run on the final DB state. Per-task {0,1} resolved, - * paired 95% bootstrap CI. - * - * Stand up first: - * docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest - * # gym_dbs.zip from github.com/ServiceNow/EnterpriseOps-Gym (root), unzipped: - * export EOPS_GYM_DBS_DIR=/path/to/unzipped/dbs - * TANGLE_API_KEY=… N=20 K=3 M=5 WORKER_MODEL=gpt-4o-mini tsx src/eops-gate.mts - */ -import { readFileSync, writeFileSync } from 'node:fs' -import { join } from 'node:path' -import { type RouterConfig, type ToolSpec, routerChatWithUsage, routerToolLoop } from './router-client' -import { type PairedLift, pairedLift, pool } from './stats.mts' - -function must(name: string): string { - const v = process.env[name] - if (!v) throw new Error(`env ${name} is required`) - return v -} - -const dataset = 'ServiceNow-AI/EnterpriseOps-Gym' - -type ComparisonType = 'equals' | 'greater_than' | 'less_than' | 'contains' -interface Verifier { - verifier_type?: string - gym_name?: string - /** EOPS nests the deterministic check here; comparison_type defaults to 'equals'. */ - validation_config?: { query?: string; expected_value?: unknown; comparison_type?: ComparisonType } -} -interface GymServer { - mcp_server_url: string - seed_database_file: string - context?: Record -} -export interface EopsTask { - taskId: string - systemPrompt: string - userPrompt: string - selectedTools: string[] - servers: GymServer[] - verifiers: Verifier[] -} - -const asArray = (v: unknown): T[] => (typeof v === 'string' ? JSON.parse(v) : v) as T[] - -/** Pull itsm tasks from the HF rows server (the oracle tool-set config). Fail loud. 
*/ -export async function loadTasks(n: number, offset: number): Promise { - const url = `https://datasets-server.huggingface.co/rows?dataset=${encodeURIComponent(dataset)}&config=oracle&split=itsm&offset=${offset}&length=${n}` - const res = await fetch(url) - if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}: ${url}`) - const data = (await res.json()) as { rows?: Array<{ row: Record }> } - const rows = data.rows ?? [] - if (rows.length === 0) throw new Error('EOPS HF returned 0 rows') - return rows.map(({ row }) => ({ - taskId: String(row.task_id), - systemPrompt: String(row.system_prompt ?? ''), - userPrompt: String(row.user_prompt ?? ''), - selectedTools: asArray(row.selected_tools), - servers: asArray(row.gym_servers_config), - verifiers: asArray(row.verifiers), - })) -} - -// ── gym client (mirrors scripts/enterpriseops_gym_judge.py) ──────────────────── - -function authHeaders(server: GymServer, dbId: string): Record { - return { 'content-type': 'application/json', ...(server.context ?? {}), 'x-database-id': dbId } -} - -/** POST and parse a JSON body OR the last `data:` line of an SSE stream (/mcp streams SSE). - * Retries a THROWN fetch (transient network / connection-reset / router throttle under - * concurrency — surfaces as "fetch failed") with backoff, so a momentary blip doesn't - * drop a task. HTTP-status handling stays with the caller (seed retries on 500). */ -async function postJson(url: string, body: unknown, headers: Record): Promise<{ status: number; json: unknown }> { - let lastErr: unknown - for (let attempt = 0; attempt < 4; attempt += 1) { - try { - const r = await fetch(url, { method: 'POST', headers, body: JSON.stringify(body) }) - const text = await r.text() - const dataLines = text.split('\n').filter((l) => l.startsWith('data:')).map((l) => l.slice(5).trim()) - const payload = dataLines.length ? dataLines[dataLines.length - 1] : text - try { - return { status: r.status, json: JSON.parse(payload ?? 
'null') } - } catch { - return { status: r.status, json: text } - } - } catch (err) { - lastErr = err - await new Promise((res) => setTimeout(res, 1000 * (attempt + 1))) - } - } - throw new Error(`postJson ${url} failed after 4 attempts: ${lastErr instanceof Error ? lastErr.message : String(lastErr)}`) -} - -async function seedDb(server: GymServer, dbsDir: string): Promise { - const sql = readFileSync(join(dbsDir, server.seed_database_file), 'utf8') - const url = `${server.mcp_server_url.replace(/\/$/, '')}/api/seed-database` - // The gym's SQLite exhausts file handles under concurrency ("unable to open - // database file", HTTP 500) — TRANSIENT: it clears as sibling DBs are deleted. - // Bounded retry with backoff so a momentary limit doesn't drop the task's data. - let lastErr = '' - for (let attempt = 0; attempt < 5; attempt += 1) { - const dbId = `gate_${Math.random().toString(36).slice(2, 12)}` - const { status, json } = await postJson(url, { database_id: dbId, name: `gate_${dbId}`, description: 'gate', sql_content: sql }, { 'content-type': 'application/json' }) - if (status === 200 && (json as { success?: boolean })?.success) return dbId - lastErr = `(${status}): ${JSON.stringify(json).slice(0, 160)}` - await new Promise((r) => setTimeout(r, 1500 * (attempt + 1))) - } - throw new Error(`seed-database failed after 5 attempts ${lastErr}`) -} - -async function deleteDb(server: GymServer, dbId: string): Promise { - await fetch(`${server.mcp_server_url.replace(/\/$/, '')}/api/delete-database`, { - method: 'DELETE', - headers: { 'content-type': 'application/json' }, - body: JSON.stringify({ database_id: dbId }), - }).catch(() => {}) -} - -/** Coerce an MCP inputSchema to an OpenAI-tool-valid top-level object schema. The - * router rejects top-level oneOf/anyOf/allOf/enum/not — keep the properties (nested - * combinators are fine) but guarantee a plain `{type:'object'}` head. 
*/ -function sanitizeSchema(s: unknown): { type: 'object'; properties: Record; required?: string[] } { - const o = s && typeof s === 'object' ? (s as Record) : {} - const banned = o.oneOf || o.anyOf || o.allOf || o.not || o.enum - if (o.type === 'object' && !banned && o.properties && typeof o.properties === 'object') { - return { type: 'object', properties: o.properties as Record, ...(Array.isArray(o.required) ? { required: o.required as string[] } : {}) } - } - return { type: 'object', properties: {} } -} - -/** Build OpenAI-shape tool specs for the task's selected tools from the gym's MCP tools/list. */ -async function toolSpecs(server: GymServer, dbId: string, selected: string[]): Promise { - const url = `${server.mcp_server_url.replace(/\/$/, '')}/mcp` - const { json } = await postJson(url, { jsonrpc: '2.0', id: 1, method: 'tools/list', params: {} }, authHeaders(server, dbId)) - const all = ((json as { result?: { tools?: Array<{ name: string; description?: string; inputSchema?: unknown }> } }).result?.tools) ?? [] - const want = new Set(selected) - return all - .filter((t) => want.has(t.name)) - .map((t) => ({ type: 'function' as const, function: { name: t.name, description: (t.description ?? '').slice(0, 1000), parameters: sanitizeSchema(t.inputSchema) } })) -} - -async function callTool(server: GymServer, dbId: string, name: string, args: Record): Promise { - const url = `${server.mcp_server_url.replace(/\/$/, '')}/mcp` - const { json } = await postJson(url, { jsonrpc: '2.0', id: 2, method: 'tools/call', params: { name, arguments: args } }, authHeaders(server, dbId)) - const result = (json as { result?: { content?: Array<{ text?: string }>; isError?: boolean }; error?: unknown }) ?? {} - if (result.error) return `error: ${JSON.stringify(result.error).slice(0, 300)}` - const text = result.result?.content?.map((c) => c.text ?? '').join('\n') ?? JSON.stringify(result.result ?? 
json) - return text.slice(0, 1500) -} - -function compare(actual: unknown, expected: unknown, kind: ComparisonType): boolean { - const fa = Number(actual) - const fe = Number(expected) - const numeric = !Number.isNaN(fa) && !Number.isNaN(fe) - if (kind === 'equals') return numeric ? fa === fe : String(actual) === String(expected) - if (kind === 'greater_than') return numeric && fa > fe - if (kind === 'less_than') return numeric && fa < fe - if (kind === 'contains') return String(actual).includes(String(expected)) - throw new Error(`unsupported comparison_type ${kind}`) -} - -/** Run the task's SQL verifiers on the final DB state; resolved = all pass. */ -async function score(server: GymServer, dbId: string, verifiers: Verifier[]): Promise<{ passes: number; total: number; resolved: boolean }> { - // Only deterministic database_state verifiers are scoreable (the judge rejects others). - const dbv = verifiers.filter((v) => (v.verifier_type ?? 'database_state') === 'database_state' && v.validation_config?.query) - let passes = 0 - for (const v of dbv) { - const vc = v.validation_config as NonNullable - const url = `${server.mcp_server_url.replace(/\/$/, '')}/api/sql-runner` - const { json } = await postJson(url, { query: vc.query, database_id: dbId }, authHeaders(server, dbId)) - const out = json as { data?: Array>; rows?: Array>; error?: unknown } - if (out.error) continue - const first = (out.data ?? out.rows ?? [])[0] - const actual = first && typeof first === 'object' ? Object.values(first)[0] : first - if (compare(actual, vc.expected_value, vc.comparison_type ?? 
'equals')) passes += 1 - } - return { passes, total: dbv.length, resolved: dbv.length > 0 && passes === dbv.length } -} - -// ── one agentic shot: the tool-using worker acts on a (seeded) DB ────────────── - -function shotPrompt(task: EopsTask, steer?: string): string { - return [ - task.userPrompt, - '', - 'Use the available tools to investigate the current state, then take the actions needed to complete the task.', - 'Inspect before you mutate. When you are confident the task is complete, give a one-line summary and stop calling tools.', - ...(steer ? ['', `CORRECTION FROM YOUR PRIOR ATTEMPT: ${steer}`] : []), - ].join('\n') -} - -type ToolTrace = Array<{ name: string; args: string; result: string }> - -async function runShot(cfg: RouterConfig, task: EopsTask, server: GymServer, dbId: string, tools: ToolSpec[], maxTurns: number, steer?: string): Promise<{ toolCalls: number; toolTrace: ToolTrace }> { - const r = await routerToolLoop( - cfg, - task.systemPrompt || 'You are an IT service-management operations agent.', - shotPrompt(task, steer), - tools, - async (name, args) => callTool(server, dbId, name, args as Record), - { maxTurns, temperature: 0.3 }, - ) - return { toolCalls: r.toolCalls, toolTrace: r.toolTrace } -} - -type Score = { passes: number; total: number; resolved: boolean } -const scoreRatio = (x: Score) => x.passes / Math.max(x.total, 1) - -const genericNudge = - 'Re-inspect the current state with the read tools, identify what the task still requires, and complete it. Do not stop until every required change is verified in place.' - -/** A depth steerer under test. No template ⇒ the fixed generic nudge. With a template - * ⇒ an LLM steerer: {task}/{trace} are substituted; it reads BEHAVIOR only (firewalled). 
*/ -export interface Steerer { - id: string - systemPrompt?: string - userTemplate?: string -} - -function traceSummary(trace: ToolTrace): string { - return trace.map((t) => `${t.name}(${t.args.slice(0, 140)}) -> ${t.result.slice(0, 180)}`).join('\n').slice(-4000) || '(no tool calls yet)' -} - -/** The next-shot instruction for a steerer. FIREWALLED: trace only, never verifiers. */ -async function steerInstruction(steerCfg: RouterConfig, steerer: Steerer, task: EopsTask, trace: ToolTrace): Promise { - if (!steerer.userTemplate) return genericNudge - const user = steerer.userTemplate.replaceAll('{task}', task.userPrompt).replaceAll('{trace}', traceSummary(trace)) - const r = await routerChatWithUsage( - steerCfg, - [ - { role: 'system', content: steerer.systemPrompt ?? 'You are an ITSM operations reviewer. Output one concrete corrective instruction.' }, - { role: 'user', content: user }, - ], - { temperature: 0.2 }, - ) - return r.content.trim() -} - -/** One depth arm: K sequential steered shots over ONE persistent DB, scored after each - * shot. Returns the best checkpoint (deployable, symmetric with breadth's best-of-K), - * the final state, the trajectory, and tool-call count. */ -async function runDepthArm( - cfg: RouterConfig, - steerCfg: RouterConfig, - steerer: Steerer, - task: EopsTask, - server: GymServer, - dbsDir: string, - k: number, - m: number, -): Promise<{ best: Score; final: Score; traj: string; toolCalls: number }> { - const dbId = await seedDb(server, dbsDir) - let toolCalls = 0 - try { - const tools = await toolSpecs(server, dbId, task.selectedTools) - const trace: ToolTrace = [] - const shots: Score[] = [] - for (let s = 0; s < k; s += 1) { - const steer = s === 0 ? 
undefined : await steerInstruction(steerCfg, steerer, task, trace) - const sr = await runShot(cfg, task, server, dbId, tools, m, steer) - toolCalls += sr.toolCalls - trace.push(...sr.toolTrace) - shots.push(await score(server, dbId, task.verifiers)) - } - const final = shots[shots.length - 1] ?? { passes: 0, total: 1, resolved: false } - const best = shots.reduce((a, b) => (scoreRatio(b) > scoreRatio(a) ? b : a), shots[0] ?? final) - return { best, final, traj: shots.map((x) => `${x.passes}/${x.total}`).join('→'), toolCalls } - } finally { - await deleteDb(server, dbId) - } -} - -/** The built-in inline analyst (STEER=analyst back-compat / the S1 baseline). */ -const inlineAnalyst: Steerer = { - id: 'analyst', - systemPrompt: - "You are a senior ITSM operations reviewer. You are shown an agent's tool-call trace on a task it has NOT completed. Diagnose precisely what the task still requires and issue ONE concrete corrective instruction — name the specific records, fields, and target values to set. Do not restate the task, do not praise, do not summarize the trace. Output only the single next instruction.", - userTemplate: 'TASK:\n{task}\n\nAGENT TRACE SO FAR:\n{trace}\n\nThe single most important still-missing or incorrect step, as one concrete instruction:', -} - -const pct = (x: number) => `${(x * 100).toFixed(1)}%` -const pp = (x: number) => `${x >= 0 ? 
'+' : ''}${(x * 100).toFixed(1)}pp` - -export interface SteererRank { - id: string - bestRate: number - lift: PairedLift - liftRes: PairedLift - degradation: number -} -export interface SteererLoss { - steererId: string - userPrompt: string - breadth: number - depthBest: number - traj: string -} -export interface EvalResult { - ok: Array<{ taskId: string; userPrompt: string; breadthR: number; breadthRes: number; perSteerer: Record }> - excluded: number - ranked: SteererRank[] - losses: SteererLoss[] -} - -/** The fitness function — runs every steerer as a depth arm against ONE shared breadth - * baseline per task, returns ranked lift + the per-steerer LOSSES (tasks where depth-best - * < breadth, with the trajectory). The losses are GEPA's reflection fuel. */ -export async function evaluateSteerers(args: { - cfg: RouterConfig - steerCfg: RouterConfig - steerers: Steerer[] - tasks: EopsTask[] - dbsDir: string - k: number - m: number - concurrency: number -}): Promise { - const { cfg, steerCfg, steerers, tasks, dbsDir, k, m, concurrency } = args - const rows = await pool(tasks, concurrency, async (task, i) => { - const server = task.servers[0] - if (!server) return null - try { - const breadthScores: Score[] = [] - for (let s = 0; s < k; s += 1) { - const dbId = await seedDb(server, dbsDir) - try { - const tools = await toolSpecs(server, dbId, task.selectedTools) - await runShot(cfg, task, server, dbId, tools, m) - breadthScores.push(await score(server, dbId, task.verifiers)) - } finally { - await deleteDb(server, dbId) - } - } - const breadthBest = breadthScores.reduce((a, b) => (scoreRatio(b) > scoreRatio(a) ? b : a), breadthScores[0] ?? { passes: 0, total: 1, resolved: false }) - const perSteerer: Record = {} - const tags: string[] = [] - for (const st of steerers) { - const arm = await runDepthArm(cfg, steerCfg, st, task, server, dbsDir, k, m) - perSteerer[st.id] = { bestR: scoreRatio(arm.best), finalR: scoreRatio(arm.final), bestRes: arm.best.resolved ? 
1 : 0, traj: arm.traj } - tags.push(`${st.id}=${arm.best.passes}/${arm.best.total}`) - } - process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: breadth=${breadthBest.passes}/${breadthBest.total} | ${tags.join(' ')}\n`) - return { taskId: task.taskId, userPrompt: task.userPrompt, breadthR: scoreRatio(breadthBest), breadthRes: breadthBest.resolved ? 1 : 0, perSteerer } - } catch (err) { - process.stderr.write(` [${i + 1}/${tasks.length}] ${task.taskId.slice(-12)}: SKIP (${err instanceof Error ? err.message.slice(0, 90) : String(err)})\n`) - return null - } - }) - const ok = rows.filter((r): r is NonNullable => r !== null) - const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1) - const breadthR = ok.map((r) => r.breadthR) - const breadthRes = ok.map((r) => r.breadthRes) - const ranked = steerers - .map((st) => { - const bestR = ok.map((r) => r.perSteerer[st.id]?.bestR ?? 0) - const finalR = ok.map((r) => r.perSteerer[st.id]?.finalR ?? 0) - const bestRes = ok.map((r) => r.perSteerer[st.id]?.bestRes ?? 0) - return { id: st.id, bestRate: rate(bestR), lift: pairedLift(breadthR, bestR), liftRes: pairedLift(breadthRes, bestRes), degradation: rate(bestR) - rate(finalR) } - }) - .sort((a, b) => b.lift.point - a.lift.point) - // Losses = the reflection fuel: tasks where a steerer's depth-best lost to breadth. - const losses: SteererLoss[] = [] - for (const r of ok) { - for (const st of steerers) { - const ps = r.perSteerer[st.id] - if (ps && ps.bestR < r.breadthR) losses.push({ steererId: st.id, userPrompt: r.userPrompt, breadth: r.breadthR, depthBest: ps.bestR, traj: ps.traj }) - } - } - return { ok, excluded: rows.length - ok.length, ranked, losses } -} - -async function main(): Promise { - const n = Number(process.env.N ?? 20) - const k = Number(process.env.K ?? 3) - const m = Number(process.env.M ?? 5) - const offset = Number(process.env.OFFSET ?? 0) - const model = process.env.WORKER_MODEL ?? 
'gpt-4o-mini' - const dbsDir = must('EOPS_GYM_DBS_DIR') - const cfg: RouterConfig = { routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1', routerKey: must('TANGLE_API_KEY'), model } - const concurrency = Number(process.env.CONCURRENCY ?? 4) - const steerCfg: RouterConfig = { ...cfg, model: process.env.STEER_MODEL ?? model } - // The steerer population under test: generic (control) + either a STEERERS_FILE - // (JSON array of {id,systemPrompt,userTemplate}) or the built-in inline analyst. - const steerers: Steerer[] = [{ id: 'generic' }] - if (process.env.STEERERS_FILE) steerers.push(...(JSON.parse(readFileSync(process.env.STEERERS_FILE, 'utf8')) as Steerer[])) - else if (process.env.STEER === 'analyst') steerers.push(inlineAnalyst) - - console.log(`=== EOPS steerer sweep · worker=${model} · steerer=${steerCfg.model} · N=${n} K=${k} M=${m} ===`) - console.log(` steerers (depth arms, all vs ONE shared breadth baseline): ${steerers.map((s) => s.id).join(', ')}`) - const tasks = await loadTasks(n, offset) - console.log(` loaded ${tasks.length} itsm task(s); each scored depth-BEST (checkpoint) vs breadth best-of-K, conc=${concurrency}\n`) - - const { ok, excluded, ranked } = await evaluateSteerers({ cfg, steerCfg, steerers, tasks, dbsDir, k, m, concurrency }) - const breadthR = ok.map((r) => r.breadthR) - const breadthRes = ok.map((r) => r.breadthRes) - const rate = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / Math.max(xs.length, 1) - const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 
'SIGNIF -' : 'n.s.') - - console.log(`\n${'='.repeat(86)}`) - console.log(`RESULTS · EOPS steerer sweep · n=${ok.length} (excluded ${excluded}) · K=${k} M=${m} · worker=${model} · steerer=${steerCfg.model}`) - console.log('='.repeat(86)) - console.log(` breadth@${k} (best-of-K, shared baseline): score ${pct(rate(breadthR))} resolved ${pct(rate(breadthRes))}\n`) - console.log(` ${'steerer'.padEnd(22)} ${'depth-best'.padStart(10)} ${'− breadth'.padStart(10)} ${'95% CI'.padStart(18)} ${'resolved Δ'.padStart(11)} ${'degrade'.padStart(8)}`) - console.log(` ${'-'.repeat(84)}`) - for (const r of ranked) { - console.log( - ` ${r.id.padEnd(22)} ${pct(r.bestRate).padStart(10)} ${pp(r.lift.point).padStart(10)} ${`[${pp(r.lift.low)},${pp(r.lift.high)}]`.padStart(18)} ${pp(r.liftRes.point).padStart(11)} ${pp(r.degradation).padStart(8)} ${sig(r.lift)}`, - ) - } - const best = ranked[0] - console.log(`\n WINNER: ${best?.id} — depth-best beats breadth ${best ? pp(best.lift.point) : 'n/a'} (${best ? sig(best.lift) : ''}). Degradation across steerers shows how much keep-best recovers.`) -} - -if (import.meta.url === `file://${process.argv[1]}`) { - main().catch((err) => { - console.error(`eops-gate: ${err instanceof Error ? (err.stack ?? err.message) : String(err)}`) - process.exit(1) - }) -} From 6f3c15deaedcd7096c2696b7d8d4c0d30780f381 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 05:44:27 -0600 Subject: [PATCH 4/8] feat(bench): package the optimization suite (runBenchmark) + clarify naming + onboarding fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pieces existed (Supervisor + observe + the depth/breadth strategies) but weren't wrapped as a usable suite, and the vocabulary was opaque. 
runBenchmark is the packaged front door: runBenchmark({ environment, tasks, worker, strategies: ['sample','refine'], budget }) → runs each strategy, scores by the environment's own deployable check, returns the per-strategy means + the paired-bootstrap lift of refine over sample. printBenchmarkReport gives the verdict. Resilient to transient per-task infra (skip, don't crash). Naming, made legible (public API; maps to internal depth/breadth — zero churn to the running internals): a task domain is an `Environment` (the AgenticSurface seam under the RL/gym-standard name); the strategies are `sample` (best-of-N / resample) and `refine` (attempt → critic reads trace → steer → repeat), named by what they DO, not the search tree's shape. Juniors call runBenchmark; seniors customize the hooks (worker.analystInstruction = the critic, Environment.score = the check) or drop to runAgentic for new strategies. Onboarding: deleted the orphaned empty examples/define-loop/ (defineLoop removed #194); fixed the dead examples/model-resolution link in docs/concepts.md. --- bench/src/run-benchmark.mts | 98 +++++++++++++++++++++++++++++++++++++ docs/concepts.md | 5 +- 2 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 bench/src/run-benchmark.mts diff --git a/bench/src/run-benchmark.mts b/bench/src/run-benchmark.mts new file mode 100644 index 0000000..17d6c8d --- /dev/null +++ b/bench/src/run-benchmark.mts @@ -0,0 +1,98 @@ +/** + * runBenchmark — the packaged optimization suite. Define a domain by implementing an + * Environment (open / tools / call / score / close); get the optimization strategies + * compared, scored by your own deployable check, with a paired-bootstrap report — FREE. + * + * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. An + * optimization STRATEGY is how you spend the budget to beat the check. Two primitives: + * + * sample — N independent attempts, keep the best-verifying one. 
("best-of-N" / resample) + * refine — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback) + * + * Both run at equal budget; the headline is the paired lift of refine over sample. + * (Internally `sample`→breadth, `refine`→depth on the canonical Supervisor+observe loop.) + * + * Juniors call runBenchmark and read the report. Seniors customize the HOOKS: the critic + * (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the + * worker (the model), and can drop to runAgentic / the Supervisor for new strategies. + */ +import { type AgenticOptions, type AgenticSurface, type AgenticTask, runAgentic } from './agentic' +import { type PairedLift, pairedLift, pool } from './stats.mts' + +/** A checkable task domain — implement these 5 hooks and the suite does the rest. The + * same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */ +export type Environment = AgenticSurface + +/** How to spend the compute budget to beat the Environment's check. */ +export type Strategy = 'sample' | 'refine' +const modeForStrategy = { sample: 'breadth', refine: 'depth' } as const + +export interface BenchmarkConfig { + /** The task domain (5 hooks). */ + environment: Environment + /** The tasks to score across. */ + tasks: AgenticTask[] + /** The worker: model + router + (optional) the critic's instruction (the steerer knob). */ + worker: AgenticOptions + /** Which strategies to compare. Default: both. */ + strategies?: Strategy[] + /** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */ + budget?: number + /** Tasks scored in parallel. Default 3. */ + concurrency?: number +} + +export interface BenchmarkReport { + n: number + excluded: number + /** Mean verifier score per strategy (0..1). */ + perStrategy: Partial> + /** The headline: paired lift of refine over sample (present when both ran). 
*/ + refineVsSample?: PairedLift +} + +/** Run the requested strategies over the tasks, scored by the Environment's own check, + * and return the per-strategy means + the paired-bootstrap lift of refine over sample. + * Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */ +export async function runBenchmark(cfg: BenchmarkConfig): Promise { + const strategies = cfg.strategies ?? ['sample', 'refine'] + const budget = cfg.budget ?? 3 + const concurrency = cfg.concurrency ?? 3 + + const rows = await pool(cfg.tasks, concurrency, async (task) => { + const scores: Partial> = {} + try { + for (const s of strategies) { + const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, mode: modeForStrategy[s], budget }) + scores[s] = r.score + } + return scores + } catch { + return null // transient infra on this task — exclude it + } + }) + + const ok = rows.filter((r): r is Partial> => r !== null) + const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0) + const perStrategy: Partial> = {} + for (const s of strategies) perStrategy[s] = mean(ok.map((r) => r[s] ?? 0)) + + const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy } + if (strategies.includes('refine') && strategies.includes('sample')) { + report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0)) + } + return report +} + +/** Pretty-print a report — the "free optimization" verdict. */ +export function printBenchmarkReport(report: BenchmarkReport): void { + const pct = (x: number) => `${(x * 100).toFixed(1)}%` + const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp` + console.log(`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`) + for (const [s, v] of Object.entries(report.perStrategy)) console.log(` ${s.padEnd(8)} ${pct(v ?? 
0)}`) + const l = report.refineVsSample + if (l) { + const sig = l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.' + console.log(` refine − sample: ${pp(l.point)} CI [${pp(l.low)}, ${pp(l.high)}] (${sig})`) + } +} diff --git a/docs/concepts.md b/docs/concepts.md index 4fb338d..3117b77 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -140,6 +140,5 @@ agents because nothing in this list is baked into it. 2. `examples/sandbox-stream-backend/` — what streaming looks like. 3. `examples/chat-handler/` — `handleChatTurn` — the centerpiece chat handler. 4. `examples/runtime-run/` — the production-run row + cost ledger. -5. `examples/model-resolution/` — pick + validate a model. -6. `examples/agent-into-reviewer/` — pipe one runtime's stream into a reviewer agent. -7. The `README.md` entry-point table — every other primitive, one row each. +5. `examples/agent-into-reviewer/` — pipe one runtime's stream into a reviewer agent. +6. The `README.md` entry-point table — model resolution + every other primitive, one row each. From e52b8d5e02a608bd3cbf273ccc719de0c527ead3 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 05:53:05 -0600 Subject: [PATCH 5/8] feat(bench): make Strategy a first-class, OPEN abstraction (author your own) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The question: when we collapse to "refine", can a dev create their OWN strategy? Before: no — runAgentic took mode:'depth'|'breadth', a CLOSED enum. The capability existed (a strategy is an Agent) but the door wasn't cut. Now: `Strategy` is an exported interface — `{ name, driver(surface, task, opts, budget) => Agent }`. A strategy builds the driver Agent the Supervisor runs; author your own by returning an Agent whose act() spawns shots/analysts via scope.spawn/next/send. `refine` and `sample` ship as instances AND the reference driver implementations (depthDriver/ breadthDriver) are exported to copy. 
runAgentic accepts a `strategy` (mode kept for back-compat); runBenchmark takes `Strategy[]` — pass the built-ins or your own. What's under the words: sample = K independent attempts, keep the best-verifying (best-of-N / resample) refine = attempt → observe() reads the trace → steer the next → repeat (iterate) A multi-agent "team" is just a Strategy whose driver spawns several different agents — same recursive Agent atom, coordinated over the Scope. --- bench/src/agentic.ts | 47 ++++++++++++++++++++++++++++--------- bench/src/run-benchmark.mts | 34 ++++++++++++++------------- 2 files changed, 54 insertions(+), 27 deletions(-) diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts index 69b6a48..b46a72c 100644 --- a/bench/src/agentic.ts +++ b/bench/src/agentic.ts @@ -320,7 +320,7 @@ export interface AgenticRunResult { const perChild = (innerTurns: number): Budget => ({ maxIterations: innerTurns + 1, maxTokens: 1_000_000 }) /** DEPTH: one persistent artifact, carried across analyst-steered shots. */ -function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { maxShots: number }): Agent> { +export function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { maxShots: number }): Agent> { const innerTurns = opts.innerTurns ?? 4 let pendingSteer: string | undefined // analyst-derived steer carried between shots return { @@ -366,7 +366,7 @@ function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOp } /** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */ -function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { width: number }): Agent> { +export function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: { width: number }): Agent> { const innerTurns = opts.innerTurns ?? 
4 return { name: 'breadth', @@ -395,26 +395,51 @@ function breadthDriver(surface: AgenticSurface, task: AgenticTask, opts: Agentic } } +/** + * A Strategy is HOW you spend the compute budget to beat the Environment's check — it + * builds the driver `Agent` the Supervisor runs. This is the OPEN extension point: a dev + * authors their own by implementing `driver()` to return an Agent whose `act()` spawns + * shots/analysts via `scope.spawn` / `scope.next` / `scope.send`. The two built-ins are + * the reference implementations to copy: + * sample — K INDEPENDENT attempts, keep the best-verifying (best-of-N / resample). + * refine — attempt → observe() reads the trace → steer the next → repeat (iterate). + * (A multi-agent "team" is just a Strategy whose driver spawns several different agents.) + */ +export interface Strategy { + readonly name: string + driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent> +} + +export const sample: Strategy = { + name: 'sample', + driver: (surface, task, opts, budget) => breadthDriver(surface, task, opts, { width: budget }), +} +export const refine: Strategy = { + name: 'refine', + driver: (surface, task, opts, budget) => depthDriver(surface, task, opts, { maxShots: budget }), +} + export interface RunAgenticOptions extends AgenticOptions { surface: AgenticSurface task: AgenticTask - mode: 'depth' | 'breadth' - /** depth: max shots; breadth: rollout width. */ + /** A Strategy (the open way) — author/pass your own. Overrides `mode` when present. */ + strategy?: Strategy + /** Built-in shorthand: 'depth'→refine, 'breadth'→sample. Default 'depth'. */ + mode?: 'depth' | 'breadth' + /** budget: refine→max shots; sample→rollout width. */ budget: number rootBudget?: Budget } -/** Run the chosen driver through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */ +/** Run a Strategy through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. 
*/ export async function runAgentic(opts: RunAgenticOptions): Promise { - const driver = - opts.mode === 'depth' - ? depthDriver(opts.surface, opts.task, opts, { maxShots: opts.budget }) - : breadthDriver(opts.surface, opts.task, opts, { width: opts.budget }) + const strategy: Strategy = opts.strategy ?? (opts.mode === 'breadth' ? sample : refine) + const driver = strategy.driver(opts.surface, opts.task, opts, opts.budget) const supervisor = createSupervisor>() const root: Budget = opts.rootBudget ?? { maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2), maxTokens: 1_000_000_000 } const result = await supervisor.run(driver, undefined, { budget: root, - runId: `agentic:${opts.mode}:${opts.task.id}`, + runId: `agentic:${strategy.name}:${opts.task.id}`, journal: new InMemorySpawnJournal(), blobs: new InMemoryResultBlobStore(), executors: agenticRegistry(opts.surface, opts), @@ -422,7 +447,7 @@ export async function runAgentic(opts: RunAgenticOptions): Promise> - /** The headline: paired lift of refine over sample (present when both ran). */ + /** Mean verifier score per strategy (keyed by strategy.name, 0..1). */ + perStrategy: Record + /** The headline when exactly `refine` + `sample` ran: paired lift of refine over sample. */ refineVsSample?: PairedLift } @@ -55,16 +56,16 @@ export interface BenchmarkReport { * and return the per-strategy means + the paired-bootstrap lift of refine over sample. * Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */ export async function runBenchmark(cfg: BenchmarkConfig): Promise { - const strategies = cfg.strategies ?? ['sample', 'refine'] + const strategies = cfg.strategies ?? [sample, refine] const budget = cfg.budget ?? 3 const concurrency = cfg.concurrency ?? 
3 const rows = await pool(cfg.tasks, concurrency, async (task) => { - const scores: Partial> = {} + const scores: Record = {} try { for (const s of strategies) { - const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, mode: modeForStrategy[s], budget }) - scores[s] = r.score + const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, strategy: s, budget }) + scores[s.name] = r.score } return scores } catch { @@ -72,13 +73,14 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise> => r !== null) + const ok = rows.filter((r): r is Record => r !== null) const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0) - const perStrategy: Partial> = {} - for (const s of strategies) perStrategy[s] = mean(ok.map((r) => r[s] ?? 0)) + const perStrategy: Record = {} + for (const s of strategies) perStrategy[s.name] = mean(ok.map((r) => r[s.name] ?? 0)) const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy } - if (strategies.includes('refine') && strategies.includes('sample')) { + const names = strategies.map((s) => s.name) + if (names.includes('refine') && names.includes('sample')) { report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0)) } return report From 1dfbfd6746daf4b4a72e9dae2a9ba3a6a27a2466 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 06:03:22 -0600 Subject: [PATCH 6/8] =?UTF-8?q?feat(bench):=20defineStrategy=20+=20composa?= =?UTF-8?q?ble=20steps=20=E2=80=94=20author=20a=20loop=20in=20~15=20lines?= =?UTF-8?q?=20(skillifiable)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original goal: loops compact enough to skillify, so agents author them. A 70-line Supervisor driver isn't that. This adds the composable LEGO: defineStrategy(name, async ({ shot, critique, surface, budget }) => { ...compose... 
}) A strategy body gets two steps — shot() (one worker attempt over an artifact) and critique() (the firewalled analyst reads the trace → a steer) — with ZERO Supervisor/ Scope/spawn/leaf/drainOne ceremony (all of it lives inside defineStrategy now). That is the unit an agent or a skill can emit. Proof: adaptiveRefine — a NEW strategy (refine, but ABANDON-and-restart when a steered shot fails to improve = branch-when-stuck, the widen/MCTS idea the depth-stuck failure motivated), authored entirely from the steps, scored keep-best. ~22 lines of pure strategy logic, no plumbing. Behavior-preserving: the proven refine/sample drivers (depthDriver/breadthDriver) are UNTOUCHED — the +16.4pp result + GEPA stay valid. The steps replicate their exact spawn/drain pattern, so a step-authored strategy behaves identically. Typecheck-verified; adaptiveRefine live-smoke pending the gym (GEPA has it). --- bench/src/agentic.ts | 121 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/bench/src/agentic.ts b/bench/src/agentic.ts index b46a72c..cb8eb7e 100644 --- a/bench/src/agentic.ts +++ b/bench/src/agentic.ts @@ -308,7 +308,8 @@ async function drainOne(scope: Scope>): Promise depthDriver(surface, task, opts, { maxShots: budget }), } +// ── The composable LEGO: author a strategy in ~15 lines from two steps ─────────── +// +// A strategy body gets `shot()` (run one worker attempt over an artifact) and +// `critique()` (the firewalled analyst reads the trace → a steer). Compose them — no +// Supervisor/Scope ceremony. This is the skillifiable unit: an agent can emit a +// `defineStrategy(name, body)` of a few step-calls; it can't reliably emit a 70-line +// driver. (depthDriver/breadthDriver are the hand-written reference impls; refine/sample +// stay on them — proven — while NEW strategies are authored compactly here.) 
+ +export interface ShotSpec { + /** present ⇒ continue this artifact (depth); absent ⇒ the shot opens a fresh one (sample/restart). */ + handle?: ArtifactHandle + messages?: Msg[] + steer?: string +} +export interface StrategyResult { + score: number + resolved: boolean + completions: number + progression: number[] + shots: number +} +/** What a strategy body composes with: the domain surface, the budget, and the two steps. */ +export interface StrategyCtx { + readonly surface: AgenticSurface + readonly task: AgenticTask + readonly opts: AgenticOptions + readonly budget: number + readonly scope: Scope> + /** Run ONE worker shot; its scored result, or null if it went down. */ + shot(spec?: ShotSpec): Promise + /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */ + critique(messages: Msg[]): Promise +} + +/** Author a Strategy from the composable steps — the open, compact way. */ +export function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise): Strategy { + return { + name, + driver: (surface, task, opts, budget) => ({ + name, + async act(_t, scope): Promise> { + let seq = 0 + const innerTurns = opts.innerTurns ?? 4 + const ctx: StrategyCtx = { + surface, + task, + opts, + budget, + scope, + async shot(spec) { + const child = leaf(`shot:${seq}`, 'shot') + seq += 1 + const res = scope.spawn(child, { task, handle: spec?.handle, messages: spec?.messages, steer: spec?.steer } as ShotTask, { budget: perChild(innerTurns), label: child.name }) + if (!res.ok) return null + const settled = await drainOne(scope) + return settled.kind === 'down' ? 
null : (settled.out as unknown as ShotResult) + }, + async critique(messages) { + const child = leaf(`analyst:${seq}`, 'analyst') + seq += 1 + const res = scope.spawn(child, { task, messages }, { budget: perChild(1), label: child.name }) + if (!res.ok) return null + const settled = await drainOne(scope) + if (settled.kind === 'down') return null + const findings = settled.out as unknown as string + return /^\s*COMPLETE\b/i.test(findings) ? null : findings + }, + } + const r = await run(ctx) + return { kind: 'done', deliverable: { mode: name, ...r } } + }, + }), + } +} + +/** A NEW strategy, authored from the steps (~20 lines): refine, but when a steered shot + * fails to improve the score it ABANDONS that line and restarts fresh (branch-when-stuck) + * — the widen/MCTS idea the depth-stuck failure motivated. Scored keep-best (the best + * checkpoint across all lines), the deployable metric. This is the "experts build BETTER + * optimizations" path: a new technique, compact, with zero Supervisor ceremony. */ +export const adaptiveRefine = defineStrategy('adaptiveRefine', async ({ surface, task, budget, shot, critique }) => { + let handle = await surface.open(task) + const progression: number[] = [] + let messages: Msg[] | undefined + let steer: string | undefined + let completions = 0 + let best = -1 + let shots = 0 + try { + for (shots = 0; shots < budget; shots += 1) { + const out = await shot({ handle, messages, steer }) + if (!out) break + completions += out.completions + progression.push(out.score) + if (out.score >= 1) break + if (out.score <= best) { + // Stuck: steering isn't improving this line — abandon it, restart fresh. 
+ await surface.close(handle) + handle = await surface.open(task) + messages = undefined + steer = undefined + continue + } + best = out.score + messages = out.messages + const findings = await critique(out.messages) + completions += 1 + if (!findings) break + steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.` + } + const score = progression.length ? Math.max(...progression) : 0 + return { score, resolved: score >= 1, completions, progression, shots } + } finally { + await surface.close(handle) + } +}) + export interface RunAgenticOptions extends AgenticOptions { surface: AgenticSurface task: AgenticTask From ab137984f9422104d98433a268be7707f7931988 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 9 Jun 2026 06:15:59 -0600 Subject: [PATCH 7/8] =?UTF-8?q?docs(bench):=20strategy-demo=20example=20?= =?UTF-8?q?=E2=80=94=20the=20optimization=20suite=20in=203=20layers=20(gym?= =?UTF-8?q?-free,=20runnable)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The missing onboarding piece: a runnable demo of the whole suite on a toy "counter" Environment (needs only a router key — no dataset, no sandbox). Shows all three layers: 1. runBenchmark(env, …) — default strategies compared, free. 2. strategies: [sample, refine, adaptiveRefine] — pick, named by behavior. 3. defineStrategy('doubleCheck', body) — author your own in ~10 lines from shot()+critique(), zero Supervisor ceremony. The skillifiable unit. Verified: runs end-to-end through the canonical Supervisor; all 4 strategies execute and score via the Environment's own check. README documents the model + the customization hooks. 
--- bench/src/examples/README.md | 46 ++++++++++ bench/src/examples/strategy-demo.mts | 120 +++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 bench/src/examples/README.md create mode 100644 bench/src/examples/strategy-demo.mts diff --git a/bench/src/examples/README.md b/bench/src/examples/README.md new file mode 100644 index 0000000..edf735e --- /dev/null +++ b/bench/src/examples/README.md @@ -0,0 +1,46 @@ +# Strategy demo — the optimization suite in three layers + +`strategy-demo.mts` is the smallest end-to-end demonstration of the optimization suite. +It runs on a toy "counter" `Environment` so it needs only a router key — no benchmark +dataset, no sandbox, no gym. + +``` +dotenvx run -f …/.env.keys -- env WORKER_MODEL=gpt-4o-mini tsx src/examples/strategy-demo.mts +``` + +## The model + +You have a **task**, a deployable **check**, and a compute **budget**. A *strategy* is +**how you spend the budget to beat the check**. You implement an `Environment` (5 hooks) +and get the strategies compared, scored by your own check, for free. + +## The three layers (each is a few lines in the demo) + +1. **Just run it** — `runBenchmark({ environment, tasks, worker })` compares the default + strategies and reports the paired lift. Black box; no vocabulary needed. + +2. **Pick strategies** — pass `strategies: [sample, refine, adaptiveRefine]`. Named by + what they *do*: + - **`sample`** — N independent attempts, keep the best-verifying (best-of-N / resample). + - **`refine`** — attempt → a critic reads the trace → steer the next → repeat (iterate). + - **`adaptiveRefine`** — refine, but abandon-and-restart a line that stops improving + (branch-when-stuck). + +3. **Author your own** — `defineStrategy(name, body)`. A strategy body composes two steps + — `shot()` (one worker attempt over an artifact) and `critique()` (the firewalled + analyst reads the trace → a steer) — with **zero** Supervisor/Scope ceremony. 
The demo + authors `doubleCheck` inline in ~10 lines. This is the unit a skill (or an agent) emits. + +## The hooks you customize (world-class-DX surface) + +- **the check / verifier** → `Environment.score` (your deployable success criterion) +- **the critic / steerer** → `worker.analystInstruction` (the analyst prompt; GEPA tunes this) +- **the worker** → the model (`worker.model`) +- **the strategy** → `defineStrategy` (or drop to `runAgentic` / the Supervisor for novel topologies) + +## Where the real results live + +On a trivial task all strategies tie. The differences (e.g. refine/adaptiveRefine beating +sample on stateful agentic work, +16.4pp on EnterpriseOps-Gym) show on real domains — see +`bench/HARNESS.md` and `bench/src/agentic-run.mts` (the EOPS Environment), and +`bench/src/eops-gepa.mts` (GEPA evolving the analyst/critic prompt against the check). diff --git a/bench/src/examples/strategy-demo.mts b/bench/src/examples/strategy-demo.mts new file mode 100644 index 0000000..63edcb0 --- /dev/null +++ b/bench/src/examples/strategy-demo.mts @@ -0,0 +1,120 @@ +/** + * Strategy demo — the optimization suite in three layers, on a toy Environment (no gym). + * + * The whole idea in one file: you implement an `Environment` (5 hooks: open/tools/call/ + * score/close), and you get optimization STRATEGIES — sample (best-of-N), refine + * (iterate-with-feedback), and any you author — compared and scored by your own check, + * for free. This uses a trivial "counter" environment so it runs with just a router key + * (no benchmark dataset, no sandbox). + * + * dotenvx run -f …/.env.keys -- env WORKER_MODEL=gpt-4o-mini tsx src/examples/strategy-demo.mts + * + * The three layers shown below: + * 1. just run it — runBenchmark(env, …) compares the default strategies, free. + * 2. pick strategies — pass [sample, refine, adaptiveRefine]. + * 3. author your own — defineStrategy(name, body) in ~10 lines, no Supervisor ceremony. 
import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, refine, sample } from '../agentic'
import { type Environment, printBenchmarkReport, runBenchmark } from '../run-benchmark.mts'

// ── 1. Implement an Environment (the only thing a new domain writes) ──────────────
// A toy: the agent must drive a counter to exactly the target using the increment tool.
// score = how close it got. This is the seam every real benchmark (EOPS, a coding repo,
// a browser task) implements the same way — open a checkable artifact, expose tools,
// score it. Here the "artifact" is just an in-memory counter.

const target = 5
/** Live counters, keyed by the artifact handle id handed out by `open`. */
const counters = new Map<string, { count: number }>()

const counterEnv: Environment = {
  name: 'counter',
  /** Create a fresh counter at 0 and return a handle that identifies it. */
  async open(_task) {
    const id = `counter-${Math.random().toString(36).slice(2, 8)}`
    counters.set(id, { count: 0 })
    return { id, surface: 'counter' } satisfies ArtifactHandle
  },
  /** The tool schema the worker sees: `increment` and `read_count`, both argument-free. */
  async tools() {
    return [
      { type: 'function', function: { name: 'increment', description: 'Add 1 to the counter.', parameters: { type: 'object', properties: {} } } },
      { type: 'function', function: { name: 'read_count', description: 'Read the current counter value.', parameters: { type: 'object', properties: {} } } },
    ]
  },
  /** Execute one tool call against the handle's counter; unknown handles/tools return an `ERROR:` string. */
  async call(handle, name) {
    const c = counters.get(handle.id)
    if (!c) return 'ERROR: no such counter'
    if (name === 'increment') {
      c.count += 1
      return `count is now ${c.count}`
    }
    if (name === 'read_count') return `count is ${c.count}`
    return `ERROR: unknown tool ${name}`
  },
  // The deployable CHECK: closeness to the target. score = passes/total.
  // The task demands EXACTLY `target`, so an overshoot loses one pass per step past the
  // target (floored at 0) instead of being clamped to a perfect score.
  async score(_task, handle) {
    const count = counters.get(handle.id)?.count ?? 0
    const passes = count <= target ? count : Math.max(0, 2 * target - count)
    return { passes, total: target, errored: 0 }
  },
  /** Drop the counter so the map does not grow across runs. */
  async close(handle) {
    counters.delete(handle.id)
  },
}

const task: AgenticTask = {
  id: 'counter-to-5',
  systemPrompt: 'You operate a counter with tools.',
  userPrompt: `Use the increment tool to bring the counter to exactly ${target}. Use read_count to verify before you finish. Reply DONE when the count equals ${target}.`,
}

// ── 3. Author your OWN strategy in ~10 lines — the lego (no Supervisor ceremony) ──
// "doubleCheck": up to `budget` shots on ONE artifact; after every unresolved shot the
// analyst critiques the trace once and its findings steer the next shot. Stops early when
// the check passes or the analyst has nothing left to flag.
// A strategy body composes two steps: shot() (one worker attempt) + critique() (the
// firewalled analyst → a steer). That's it. This is the skillifiable unit.
const doubleCheck = defineStrategy('doubleCheck', async ({ surface, task: t, budget, shot, critique }) => {
  const handle = await surface.open(t)
  const progression: number[] = []
  // Typed from shot()'s own parameter so it always matches what shot() accepts.
  let messages: Parameters<typeof shot>[0]['messages']
  let steer: string | undefined
  let completions = 0
  try {
    for (let i = 0; i < budget; i += 1) {
      const out = await shot({ handle, messages, steer })
      if (!out) break
      completions += out.completions
      progression.push(out.score)
      if (out.score >= 1) break
      messages = out.messages
      const findings = await critique(out.messages)
      // The critique call is counted as one completion.
      completions += 1
      if (!findings) break
      steer = `Not done yet. ${findings}`
    }
    const score = progression.length ? Math.max(...progression) : 0
    return { score, resolved: score >= 1, completions, progression, shots: progression.length }
  } finally {
    await surface.close(handle)
  }
})

/** Run the demo: layer 1 (default strategies), then layers 2+3 (built-ins plus `doubleCheck`). */
async function main(): Promise<void> {
  const worker = {
    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
    routerKey: process.env.TANGLE_API_KEY ?? '',
    model: process.env.WORKER_MODEL ?? 'gpt-4o-mini',
    innerTurns: 6,
  }
  if (!worker.routerKey) throw new Error('set TANGLE_API_KEY (the worker calls the router)')

  console.log('Layer 1 — just run it (default strategies, scored by the env\'s own check):')
  printBenchmarkReport(await runBenchmark({ environment: counterEnv, tasks: [task], worker, budget: 3 }))

  console.log('\nLayer 2+3 — pick the built-ins AND your own authored strategy:')
  printBenchmarkReport(
    await runBenchmark({ environment: counterEnv, tasks: [task], worker, budget: 3, strategies: [sample, refine, adaptiveRefine, doubleCheck] }),
  )
}

main().catch((e) => {
  console.error(e instanceof Error ? (e.stack ?? e.message) : String(e))
  process.exit(1)
})
--- docs/concepts.md | 2 +- examples/README.md | 8 ++++---- examples/agents-of-all-shapes/README.md | 2 +- examples/agents-of-all-shapes/shapes.ts | 2 +- .../README.md | 4 ++-- .../intelligence-export.ts} | 2 +- .../README.md | 4 ++-- .../knowledge-gating.ts} | 2 +- .../{agent-into-reviewer => pipe-into-reviewer}/README.md | 2 +- .../pipe-into-reviewer.ts} | 2 +- 10 files changed, 15 insertions(+), 15 deletions(-) rename examples/{with-intelligence-export => intelligence-export}/README.md (87%) rename examples/{with-intelligence-export/with-intelligence-export.ts => intelligence-export/intelligence-export.ts} (97%) rename examples/{with-knowledge-readiness => knowledge-gating}/README.md (91%) rename examples/{with-knowledge-readiness/with-knowledge-readiness.ts => knowledge-gating/knowledge-gating.ts} (96%) rename examples/{agent-into-reviewer => pipe-into-reviewer}/README.md (93%) rename examples/{agent-into-reviewer/agent-into-reviewer.ts => pipe-into-reviewer/pipe-into-reviewer.ts} (98%) diff --git a/docs/concepts.md b/docs/concepts.md index 3117b77..00ae5ec 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -140,5 +140,5 @@ agents because nothing in this list is baked into it. 2. `examples/sandbox-stream-backend/` — what streaming looks like. 3. `examples/chat-handler/` — `handleChatTurn` — the centerpiece chat handler. 4. `examples/runtime-run/` — the production-run row + cost ledger. -5. `examples/agent-into-reviewer/` — pipe one runtime's stream into a reviewer agent. +5. `examples/pipe-into-reviewer/` — pipe one runtime's stream into a reviewer agent. 6. The `README.md` entry-point table — model resolution + every other primitive, one row each. 
diff --git a/examples/README.md b/examples/README.md index aa3e1bd..59c777d 100644 --- a/examples/README.md +++ b/examples/README.md @@ -9,7 +9,7 @@ Every example imports from `@tangle-network/agent-runtime` (the same surface con | # | Example | One sentence | |---|---|---| | 1 | [`chat-handler/`](./chat-handler/) | `handleChatTurn` — the production chat turn lifecycle every product runs | -| 2 | [`with-knowledge-readiness/`](./with-knowledge-readiness/) | Same chat handler + `requiredKnowledge` + `decideKnowledgeReadiness` gating | +| 2 | [`knowledge-gating/`](./knowledge-gating/) | Same chat handler + `requiredKnowledge` + `decideKnowledgeReadiness` gating | | 3 | [`sanitized-telemetry-streaming/`](./sanitized-telemetry-streaming/) | Same chat handler + redaction-by-default telemetry collector | | 4 | [`runtime-run/`](./runtime-run/) | Same chat handler + `startRuntimeRun` + cost ledger persistence | @@ -38,8 +38,8 @@ These were standalone examples in an earlier release. The patterns are now folde - [`openai-stream-backend/`](./openai-stream-backend/) — `createOpenAICompatibleBackend` - [`sse-stream/`](./sse-stream/) — SSE helpers for browser routes - [`sanitized-telemetry/`](./sanitized-telemetry/) — non-streaming counterpart to `sanitized-telemetry-streaming` -- [`agent-into-reviewer/`](./agent-into-reviewer/) — pipe one runtime's stream into a reviewer agent (advanced 2-runtime topology) -- [`with-intelligence-export/`](./with-intelligence-export/) — ship loop traces to Tangle Intelligence (`createOtelExporter` + raw OTLP) for failure-correlation + quality insights +- [`pipe-into-reviewer/`](./pipe-into-reviewer/) — pipe one runtime's stream into a reviewer agent (advanced 2-runtime topology) +- [`intelligence-export/`](./intelligence-export/) — ship loop traces to Tangle Intelligence (`createOtelExporter` + raw OTLP) for failure-correlation + quality insights ## Conventions @@ -54,7 +54,7 @@ From the agent-runtime repo root, in the suggested learning order: 
```bash # Start here pnpm tsx examples/chat-handler/chat-handler.ts -pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts +pnpm tsx examples/knowledge-gating/knowledge-gating.ts pnpm tsx examples/sanitized-telemetry-streaming/sanitized-telemetry-streaming.ts pnpm tsx examples/runtime-run/runtime-run.ts diff --git a/examples/agents-of-all-shapes/README.md b/examples/agents-of-all-shapes/README.md index cfdbc36..29631de 100644 --- a/examples/agents-of-all-shapes/README.md +++ b/examples/agents-of-all-shapes/README.md @@ -49,7 +49,7 @@ frameworks already emit them; you add `score`. | Shape | File | Live wiring | |---|---|---| -| **Tangle runtime / router (tcloud)** | `shapes.ts` → `tangleRuntimeRuns` | `createOtelExporter` + `loopEventToOtelSpan` (see `examples/with-intelligence-export`) | +| **Tangle runtime / router (tcloud)** | `shapes.ts` → `tangleRuntimeRuns` | `createOtelExporter` + `loopEventToOtelSpan` (see `examples/intelligence-export`) | | **OpenAI-compatible** (tcloud / OpenRouter / OpenAI / vLLM) | `shapes.ts` → `openAiCompatibleRuns` | any OpenAI client at the router's `baseURL`; emit a GenAI span per call | | **Mastra** | `shapes.ts` → `mastraRuns` | Mastra's native OTLP exporter → `${INTELLIGENCE_BASE}/v1/otlp/v1/traces` | | **Claude Agent SDK** | `shapes.ts` → `claudeAgentSdkRuns` | wrap `query()`, one GenAI span per turn from `msg.usage` | diff --git a/examples/agents-of-all-shapes/shapes.ts b/examples/agents-of-all-shapes/shapes.ts index 369da2c..4a7bd92 100644 --- a/examples/agents-of-all-shapes/shapes.ts +++ b/examples/agents-of-all-shapes/shapes.ts @@ -59,7 +59,7 @@ function batch(spec: BatchSpec): AgentRun[] { * 1. Tangle agent-runtime / router (tcloud). 
* * LIVE: agent-runtime already emits every loop event; ship them with the - * built-in exporter (see `examples/with-intelligence-export`): + * built-in exporter (see `examples/intelligence-export`): * const exporter = createOtelExporter({ endpoint, headers }) * for await (const e of runAgentTaskStream({ task, backend })) { * exporter.exportSpan(loopEventToOtelSpan({ kind: e.type, runId, ... }, traceId)) diff --git a/examples/with-intelligence-export/README.md b/examples/intelligence-export/README.md similarity index 87% rename from examples/with-intelligence-export/README.md rename to examples/intelligence-export/README.md index b0ddf49..8dbd044 100644 --- a/examples/with-intelligence-export/README.md +++ b/examples/intelligence-export/README.md @@ -1,4 +1,4 @@ -# with-intelligence-export +# intelligence-export Ship agent-runtime traces to **Tangle Intelligence** and get back insights: failure correlations (relative risk + p-value), latency percentiles, and an @@ -18,5 +18,5 @@ The tenant is resolved from the Bearer key, never the payload. Read insights back from the dashboard or `GET /v1/insights/outputs?kind=report`. ```bash -TANGLE_API_KEY=sk-tan-... npx tsx examples/with-intelligence-export/with-intelligence-export.ts +TANGLE_API_KEY=sk-tan-... npx tsx examples/intelligence-export/intelligence-export.ts ``` diff --git a/examples/with-intelligence-export/with-intelligence-export.ts b/examples/intelligence-export/intelligence-export.ts similarity index 97% rename from examples/with-intelligence-export/with-intelligence-export.ts rename to examples/intelligence-export/intelligence-export.ts index 4848c4f..5ea363d 100644 --- a/examples/with-intelligence-export/with-intelligence-export.ts +++ b/examples/intelligence-export/intelligence-export.ts @@ -18,7 +18,7 @@ * Read insights back from the dashboard or `GET /v1/insights/outputs` with * the same key. Tenant resolves from the Bearer key, never the payload. * - * Run: TANGLE_API_KEY=sk-tan-... 
npx tsx examples/with-intelligence-export/with-intelligence-export.ts + * Run: TANGLE_API_KEY=sk-tan-... npx tsx examples/intelligence-export/intelligence-export.ts */ import { type AgentBackendInput, diff --git a/examples/with-knowledge-readiness/README.md b/examples/knowledge-gating/README.md similarity index 91% rename from examples/with-knowledge-readiness/README.md rename to examples/knowledge-gating/README.md index 74a2ce3..3f434e9 100644 --- a/examples/with-knowledge-readiness/README.md +++ b/examples/knowledge-gating/README.md @@ -1,4 +1,4 @@ -# with-knowledge-readiness +# knowledge-gating A task that declares required knowledge. The runtime scores readiness before running the control loop and stops if a blocking requirement is @@ -9,7 +9,7 @@ instead of failing the run. ## Run ```bash -pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts +pnpm tsx examples/knowledge-gating/knowledge-gating.ts ``` ## What it shows diff --git a/examples/with-knowledge-readiness/with-knowledge-readiness.ts b/examples/knowledge-gating/knowledge-gating.ts similarity index 96% rename from examples/with-knowledge-readiness/with-knowledge-readiness.ts rename to examples/knowledge-gating/knowledge-gating.ts index 7414ccf..1762897 100644 --- a/examples/with-knowledge-readiness/with-knowledge-readiness.ts +++ b/examples/knowledge-gating/knowledge-gating.ts @@ -3,7 +3,7 @@ * control loop runs and gates the task on readiness. 
* * Run with: - * pnpm tsx examples/with-knowledge-readiness/with-knowledge-readiness.ts + * pnpm tsx examples/knowledge-gating/knowledge-gating.ts */ import type { KnowledgeRequirement } from '@tangle-network/agent-eval' diff --git a/examples/agent-into-reviewer/README.md b/examples/pipe-into-reviewer/README.md similarity index 93% rename from examples/agent-into-reviewer/README.md rename to examples/pipe-into-reviewer/README.md index 9e34505..e4c48d1 100644 --- a/examples/agent-into-reviewer/README.md +++ b/examples/pipe-into-reviewer/README.md @@ -19,5 +19,5 @@ task, backend, input })` with the same shape — an async iterable of events. The reviewer adapter is unchanged. ```bash -pnpm tsx examples/agent-into-reviewer/agent-into-reviewer.ts +pnpm tsx examples/pipe-into-reviewer/pipe-into-reviewer.ts ``` diff --git a/examples/agent-into-reviewer/agent-into-reviewer.ts b/examples/pipe-into-reviewer/pipe-into-reviewer.ts similarity index 98% rename from examples/agent-into-reviewer/agent-into-reviewer.ts rename to examples/pipe-into-reviewer/pipe-into-reviewer.ts index 3e0ec53..2afa8c1 100644 --- a/examples/agent-into-reviewer/agent-into-reviewer.ts +++ b/examples/pipe-into-reviewer/pipe-into-reviewer.ts @@ -10,7 +10,7 @@ * other agent-runtime call uses. * * Run with: - * pnpm tsx examples/agent-into-reviewer/agent-into-reviewer.ts + * pnpm tsx examples/pipe-into-reviewer/pipe-into-reviewer.ts */ import type { AgentAdapter } from '@tangle-network/agent-runtime'