Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bench/src/agentic-eops.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* AppWorld, terminal-bench) ships its own file like this one — the drivers in agentic.ts never change.
*/

import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from './agentic'
import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from '@tangle-network/agent-runtime/loops'
import { callTool, deleteDb, type GymServer, type GymVerifier, loadTools, runVerifiers, seed } from './gym-agent'

interface EopsMeta {
Expand Down
2 changes: 1 addition & 1 deletion bench/src/agentic-run.mts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* TASKS=4 MAX_SHOTS=5 WIDTH=5 INNER_TURNS=4 WORKER_MODEL=gpt-4.1 tsx src/agentic-run.mts
*/

import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
import { type AgenticOptions, type AgenticTask, runAgentic } from '@tangle-network/agent-runtime/loops'
import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'

const must = (k: string): string => {
Expand Down
149 changes: 149 additions & 0 deletions bench/src/eops-corpus-ab.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/**
* The corpus flywheel A/B — primed-vs-cold at equal compute. THE across-run experiment
* (docs/research/layer-across-run.md): does run N+1 improve because the system learned
* from run N?
*
* Two arms over the SAME task stream, same order, canonical depth (Supervisor + observe):
* cold — every run fresh (the baseline as measured at +16.4pp vs breadth).
* primed — before each run, query the corpus (trace-derived facts accumulated by the
* analyst's observe() pass on PRIOR runs) and fold the top-k into the task's
* systemPrompt; after each run, observe() appends new facts. Zero extra LLM
* calls (the analyst already runs); priming is prompt text — equal compute.
*
* Reported: per-position scores, paired lift, the SLOPE (first-half vs second-half lift —
* the flywheel signature is a GROWING advantage), fact uptake counts, and a frozen
* HOLDOUT: a disjoint slice run primed-from-the-accumulated-corpus vs cold (does the
* learned knowledge transfer to fresh tasks?).
*
* Falsifiers designed in (layer-across-run.md): context pollution (cap k, report dose),
* stale/instance facts (the gym DB resets per task — only PROCEDURAL facts can help),
* judge leakage (observe() is structurally trace-only), worker disregard (uptake column).
*
* docker run -d --rm --name eops -p 8006:8005 shivakrishnareddyma225/enterpriseops-gym-mcp-itsm:latest
* EOPS_GYM_DBS_DIR=… N=16 HOLDOUT=4 K_FACTS=3 WORKER_MODEL=deepseek-v4-pro tsx src/eops-corpus-ab.mts
*/
import { type AgenticOptions, type AgenticTask, FileCorpus, runAgentic } from '@tangle-network/agent-runtime/loops'
import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
import { type PairedLift, pairedLift } from './stats.mts'

/** Return the value of a required environment variable; throws when it is unset or empty. */
function must(name: string): string {
  const value = process.env[name]
  if (value) return value
  throw new Error(`env ${name} is required`)
}

/**
 * Fetch up to `n` rows of the EnterpriseOps-Gym `itsm` split (config `oracle`) from the
 * Hugging Face datasets server, starting at `offset`, and convert each row with
 * `eopsTaskFromRow`.
 *
 * @throws Error when the server answers with a non-OK HTTP status.
 * @returns The converted tasks; an empty list when the response carries no `rows`.
 */
async function loadItsmTasks(n: number, offset = 0): Promise<AgenticTask[]> {
  type EopsRow = Parameters<typeof eopsTaskFromRow>[0]
  const dataset = encodeURIComponent('ServiceNow-AI/EnterpriseOps-Gym')
  const url = `https://datasets-server.huggingface.co/rows?dataset=${dataset}&config=oracle&split=itsm&offset=${offset}&length=${n}`
  const res = await fetch(url)
  if (!res.ok) throw new Error(`EOPS HF rows HTTP ${res.status}`)
  const payload = (await res.json()) as { rows?: Array<{ row: EopsRow }> }
  const tasks: AgenticTask[] = []
  // Defensive cap: never hand back more than the caller asked for.
  for (const entry of (payload.rows ?? []).slice(0, n)) tasks.push(eopsTaskFromRow(entry.row))
  return tasks
}

/** Tags passed as `corpusTags` on the primed arm's runs. */
const tags = ['eops', 'itsm', 'corpus-ab']
/** Format a 0..1 fraction as a whole-number percentage, e.g. 0.5 → "50%". */
const pct = (x: number) => (x * 100).toFixed(0) + '%'
/** Format a fraction as a signed percentage-point delta, e.g. 0.1 → "+10.0pp". */
const pp = (x: number) => {
  const sign = x >= 0 ? '+' : ''
  return sign + (x * 100).toFixed(1) + 'pp'
}

/**
 * Run the primed-vs-cold corpus A/B and print the report to stderr.
 *
 * Flow: load the task stream → per task run the cold arm, then the primed arm (which reads
 * from and writes to the corpus) → compute the paired lift and first-half/second-half slope →
 * optionally run a disjoint holdout primed from the accumulated corpus without writing to it.
 *
 * Environment:
 *   required — TANGLE_API_KEY, EOPS_GYM_DBS_DIR
 *   optional — N (16), HOLDOUT (4), K_FACTS (3), MAX_SHOTS (3; legacy spelling MAXSHOTS is
 *              still honoured), INNER_TURNS (4), WORKER_MODEL, CORPUS, ROUTER_BASE
 *
 * @throws Error when a required variable is missing, a numeric variable is not an integer in
 *   range, or the task fetch fails.
 */
async function main(): Promise<void> {
  /**
   * Read an integer from the first of `names` that is set (non-empty); fall back when none is.
   * Rejects non-integers and values below `min` so a typo fails loudly instead of flowing on as NaN.
   */
  const envInt = (names: readonly string[], fallback: number, min: number): number => {
    for (const name of names) {
      const raw = process.env[name]
      if (raw === undefined || raw.trim() === '') continue
      const value = Number(raw)
      if (!Number.isInteger(value) || value < min) {
        throw new Error(`env ${name} must be an integer >= ${min} (got "${raw}")`)
      }
      return value
    }
    return fallback
  }

  const n = envInt(['N'], 16, 0)
  const holdoutN = envInt(['HOLDOUT'], 4, 0)
  const kFacts = envInt(['K_FACTS'], 3, 0)
  // MAX_SHOTS matches the underscore convention of the other variables; MAXSHOTS is the
  // spelling this script originally read and stays supported.
  const maxShots = envInt(['MAX_SHOTS', 'MAXSHOTS'], 3, 1)
  const model = process.env.WORKER_MODEL ?? 'deepseek-v4-pro'
  const corpusPath = process.env.CORPUS ?? `/tmp/eops-corpus-ab-${Date.now()}.jsonl`
  const opts: AgenticOptions = {
    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
    routerKey: must('TANGLE_API_KEY'),
    model,
    innerTurns: envInt(['INNER_TURNS'], 4, 1),
    temperature: 0.7,
  }
  const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR'))
  const corpus = new FileCorpus(corpusPath)

  const stream = await loadItsmTasks(n)
  console.error(`=== corpus A/B · primed-vs-cold · stream n=${stream.length} + holdout ${holdoutN} · ${model} · k=${kFacts} facts ===`)
  console.error(` corpus: ${corpusPath}\n`)

  /** Top-k trace-derived facts → a prime block for the worker's system prompt. */
  async function primeBlock(): Promise<{ text: string; count: number }> {
    const facts = await corpus.query({ tags: ['audience:agent'], limit: kFacts })
    if (facts.length === 0) return { text: '', count: 0 }
    const lines = facts.map((f) => `- ${f.claim}${f.rationale ? ` (${f.rationale.slice(0, 120)})` : ''}`)
    return {
      text: `\n\nLEARNINGS FROM PRIOR RUNS (apply where relevant):\n${lines.join('\n')}`,
      count: facts.length,
    }
  }

  /**
   * Run one arm on one task. Returns null (after logging a SKIP line) when the run throws,
   * so a single failing task never aborts the experiment.
   *
   * @param primed  inject the prime block into the task's system prompt.
   * @param persist primed only — also hand the corpus to the run so it can append facts;
   *                false gives read-only priming (used by the holdout).
   */
  async function runArm(
    task: AgenticTask,
    primed: boolean,
    persist = true,
  ): Promise<{ score: number; facts: number } | null> {
    try {
      if (!primed) {
        const r = await runAgentic({ ...opts, surface, task, mode: 'depth', budget: maxShots })
        return { score: r.score, facts: 0 }
      }
      const prime = await primeBlock()
      const primedTask: AgenticTask = { ...task, systemPrompt: `${task.systemPrompt}${prime.text}` }
      const corpusOpts = persist ? { corpus, corpusTags: tags } : {}
      const r = await runAgentic({ ...opts, ...corpusOpts, surface, task: primedTask, mode: 'depth', budget: maxShots })
      return { score: r.score, facts: prime.count }
    } catch (e) {
      console.error(` SKIP ${task.id.slice(-12)} (${e instanceof Error ? e.message.slice(0, 60) : String(e)})`)
      return null
    }
  }

  // The stream: per task, cold first (no corpus contact), then primed (reads + writes).
  const rows: Array<{ cold: number; primed: number; facts: number }> = []
  for (let i = 0; i < stream.length; i += 1) {
    const task = stream[i] as AgenticTask
    const cold = await runArm(task, false)
    const primed = await runArm(task, true)
    if (!cold || !primed) continue
    rows.push({ cold: cold.score, primed: primed.score, facts: primed.facts })
    console.error(` [${i + 1}/${stream.length}] ${task.id.slice(-12)}: cold ${pct(cold.score)} primed ${pct(primed.score)} (facts injected: ${primed.facts})`)
  }

  const mean = (xs: number[]) => (xs.length ? xs.reduce((s, x) => s + x, 0) / xs.length : 0)
  // Same >=2-pairs guard the holdout uses: a paired interval needs at least two pairs.
  const lift: PairedLift | undefined =
    rows.length >= 2 ? pairedLift(rows.map((r) => r.cold), rows.map((r) => r.primed)) : undefined
  const half = Math.floor(rows.length / 2)
  const firstHalf = mean(rows.slice(0, half).map((r) => r.primed - r.cold))
  const secondHalf = mean(rows.slice(half).map((r) => r.primed - r.cold))

  // Frozen holdout: fresh tasks, primed from the ACCUMULATED corpus (read-only) vs cold.
  let holdout: { lift: PairedLift; n: number } | undefined
  if (holdoutN > 0) {
    console.error(`\n▶ holdout (${holdoutN} disjoint tasks, corpus read-only)…`)
    const htasks = await loadItsmTasks(holdoutN, stream.length)
    const hrows: Array<{ cold: number; primed: number }> = []
    for (const task of htasks) {
      const cold = await runArm(task, false)
      if (!cold) continue
      // Read-only priming: query + inject, but do NOT pass the corpus (no writes).
      // A failure is logged by runArm rather than swallowed silently.
      const primed = await runArm(task, true, false)
      if (!primed) continue
      hrows.push({ cold: cold.score, primed: primed.score })
      console.error(` ${task.id.slice(-12)}: cold ${pct(cold.score)} primed ${pct(primed.score)}`)
    }
    if (hrows.length >= 2) holdout = { lift: pairedLift(hrows.map((r) => r.cold), hrows.map((r) => r.primed)), n: hrows.length }
  }

  const sig = (l: PairedLift) => (l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.')
  console.error(`\n${'='.repeat(74)}`)
  console.error(`CORPUS A/B RESULT · stream n=${rows.length} · ${model} · k=${kFacts}`)
  console.error('='.repeat(74))
  console.error(` cold ${pct(mean(rows.map((r) => r.cold)))} primed ${pct(mean(rows.map((r) => r.primed)))}`)
  if (lift) {
    console.error(` primed − cold (paired, B=10000) ${pp(lift.point)} CI [${pp(lift.low)}, ${pp(lift.high)}] disc=${lift.discordant} ${sig(lift)}`)
  } else {
    console.error(` primed − cold (paired) n/a — need at least 2 completed pairs, got ${rows.length}`)
  }
  console.error(` SLOPE: first-half lift ${pp(firstHalf)} → second-half lift ${pp(secondHalf)} ${secondHalf > firstHalf ? '(growing — the flywheel signature)' : '(not growing)'}`)
  if (holdout) console.error(` HOLDOUT (${holdout.n} fresh tasks, accumulated corpus): ${pp(holdout.lift.point)} CI [${pp(holdout.lift.low)}, ${pp(holdout.lift.high)}] ${sig(holdout.lift)}`)
  console.error(` corpus facts accumulated: see ${corpusPath}`)
}

// Entry point: any unhandled failure is reported on stderr (stack when available) and the
// process exits with status 1.
main().catch((err: unknown) => {
  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err)
  console.error(`eops-corpus-ab: ${detail}`)
  process.exit(1)
})
3 changes: 1 addition & 2 deletions bench/src/eops-gepa.mts
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
*/
import { readFileSync, writeFileSync } from 'node:fs'
import { buildReflectionPrompt, paretoFrontier, parseReflectionResponse } from '@tangle-network/agent-eval'
import { defaultAnalystInstruction } from '@tangle-network/agent-runtime/loops'
import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
import { type AgenticOptions, type AgenticTask, defaultAnalystInstruction, runAgentic } from '@tangle-network/agent-runtime/loops'
import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
import { type RouterConfig, routerChatWithUsage } from './router-client'

Expand Down
3 changes: 1 addition & 2 deletions bench/src/examples/strategy-demo.mts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
* 2. pick strategies — pass [sample, refine, adaptiveRefine].
* 3. author your own — defineStrategy(name, body) in ~10 lines, no Supervisor ceremony.
*/
import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, refine, sample } from '../agentic'
import { type Environment, printBenchmarkReport, runBenchmark } from '../run-benchmark.mts'
import { adaptiveRefine, type AgenticTask, type ArtifactHandle, defineStrategy, type Environment, printBenchmarkReport, refine, runBenchmark, sample } from '@tangle-network/agent-runtime/loops'

// ── 1. Implement an Environment (the only thing a new domain writes) ──────────────
// A toy: the agent must drive a counter to exactly the target using the increment tool.
Expand Down
33 changes: 33 additions & 0 deletions src/runtime/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,14 @@ export type {
WidenSpec,
} from './personify/wave-types'
export { reportLoopUsage, type UsageSink } from './report-usage'
export {
type BenchmarkConfig,
type BenchmarkLift,
type BenchmarkReport,
type Environment,
printBenchmarkReport,
runBenchmark,
} from './run-benchmark'
export type { RunLoopOptions } from './run-loop'
export { createSandboxForSpec, defaultSelectWinner, runLoop } from './run-loop'
export { acquireSandbox } from './sandbox-acquire'
Expand All @@ -176,6 +184,31 @@ export {
type SandboxRun,
type TurnResult,
} from './sandbox-run'
// The optimization suite: a domain = an Environment (5 hooks); a Strategy = how the
// budget is spent to beat its check. Built-ins `sample`/`refine`; author your own with
// `defineStrategy` (compose shot() + critique(), zero Supervisor ceremony); compare
// with runBenchmark. The depth/breadth drivers are the reference implementations.
export {
type AgenticOptions,
type AgenticRunResult,
type AgenticSurface,
type AgenticTask,
type AgenticTool,
type ArtifactHandle,
adaptiveRefine,
breadthDriver,
defineStrategy,
depthDriver,
type RunAgenticOptions,
refine,
runAgentic,
type ShotSpec,
type Strategy,
type StrategyCtx,
type StrategyResult,
type SurfaceScore,
sample,
} from './strategy'
export {
type BudgetPool,
type BudgetReadout,
Expand Down
108 changes: 74 additions & 34 deletions bench/src/run-benchmark.mts → src/runtime/run-benchmark.ts
Original file line number Diff line number Diff line change
@@ -1,32 +1,33 @@
/**
* runBenchmark — the packaged optimization suite. Define a domain by implementing an
* Environment (open / tools / call / score / close); get the optimization strategies
* compared, scored by your own deployable check, with a paired-bootstrap report — FREE.
* `Environment` (open / tools / call / score / close); get the optimization strategies
* compared, scored by your own deployable check, with a paired-bootstrap report — free.
*
* The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. An
* optimization STRATEGY is how you spend the budget to beat the check. Two primitives:
* The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
* is how you spend the budget to beat the check. Two built-ins:
*
* sample — N independent attempts, keep the best-verifying one. ("best-of-N" / resample)
* sample — N independent attempts, keep the best-verifying one. (best-of-N / resample)
* refine — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
*
* Both run at equal budget; the headline is the paired lift of refine over sample.
* (Internally `sample`→breadth, `refine`→depth on the canonical Supervisor+observe loop.)
*
* Juniors call runBenchmark and read the report. Seniors customize the HOOKS: the critic
* (worker.analystInstruction — observe()'s prompt), the check (Environment.score), the
* worker (the model), and can drop to runAgentic / the Supervisor for new strategies.
* Both run at equal budget through the Supervisor's conserved pool; the headline is the
* paired lift of refine over sample. Author your own strategy with `defineStrategy`.
*/
import { type AgenticOptions, type AgenticSurface, type AgenticTask, refine, runAgentic, sample, type Strategy } from './agentic'
import { type PairedLift, pairedLift, pool } from './stats.mts'

import { pairedBootstrap } from '@tangle-network/agent-eval'
import {
type AgenticOptions,
type AgenticSurface,
type AgenticTask,
refine,
runAgentic,
type Strategy,
sample,
} from './strategy'

/** A checkable task domain — implement these 5 hooks and the suite does the rest. The
* same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
export type Environment = AgenticSurface

// Strategy is the OPEN extension point (re-exported from agentic): pass the built-ins or
// author your own (implement Strategy.driver returning an Agent). See `refine`/`sample`.
export { refine, sample, type Strategy } from './agentic'

export interface BenchmarkConfig {
/** The task domain (5 hooks). */
environment: Environment
Expand All @@ -43,17 +44,47 @@ export interface BenchmarkConfig {
concurrency?: number
}

/** Summary of the refine − sample comparison, copied field-by-field from the `pairedBootstrap` result. */
export interface BenchmarkLift {
/** Mean of paired deltas (refine − sample). */
mean: number
/** Lower bound of the interval printed as `CI [low, high]`; the report labels `low > 0` as `SIGNIF +`. */
low: number
/** Upper bound of the interval printed as `CI [low, high]`; the report labels `high < 0` as `SIGNIF -`. */
high: number
/** `n` as returned by `pairedBootstrap` — presumably the number of paired tasks; confirm against agent-eval. */
n: number
}

export interface BenchmarkReport {
n: number
excluded: number
/** Mean verifier score per strategy (keyed by strategy.name, 0..1). */
perStrategy: Record<string, number>
/** The headline when exactly `refine` + `sample` ran: paired lift of refine over sample. */
refineVsSample?: PairedLift
/** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */
refineVsSample?: BenchmarkLift
}

/** Run the requested strategies over the tasks, scored by the Environment's own check,
* and return the per-strategy means + the paired-bootstrap lift of refine over sample.
/**
 * Bounded-concurrency map preserving order; a worker that throws resolves its slot to null.
 *
 * @param items input items; `out[i]` always corresponds to `items[i]`.
 * @param limit maximum number of `fn` calls in flight. Values below 1 — and NaN — are
 *   treated as 1; values above `items.length` are capped to it.
 * @param fn    async mapper; a rejection is deliberately converted to `null` for that slot so
 *   one failing item never aborts the rest.
 * @returns one entry per input item, in input order.
 */
async function pool<T, R>(
  items: readonly T[],
  limit: number,
  fn: (item: T, i: number) => Promise<R | null>,
): Promise<Array<R | null>> {
  const out: Array<R | null> = new Array(items.length).fill(null)
  // NaN survives Math.min/Math.max and Array.from({ length: NaN }) yields zero workers,
  // which would leave every slot null without ever calling fn — fall back to one worker.
  const requested = Number.isNaN(limit) ? 1 : limit
  const width = Math.max(1, Math.min(requested, items.length))
  // Shared cursor: each worker claims the next index synchronously before awaiting.
  let next = 0
  const workers = Array.from({ length: width }, async () => {
    while (next < items.length) {
      const i = next
      next += 1
      try {
        out[i] = await fn(items[i] as T, i)
      } catch {
        out[i] = null
      }
    }
  })
  await Promise.all(workers)
  return out
}

/** Run the requested strategies over the tasks, scored by the Environment's own check.
* Resilient: a task whose rollouts fail (transient infra) is excluded, not fatal. */
export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport> {
const strategies = cfg.strategies ?? [sample, refine]
Expand All @@ -62,15 +93,17 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor

const rows = await pool(cfg.tasks, concurrency, async (task) => {
const scores: Record<string, number> = {}
try {
for (const s of strategies) {
const r = await runAgentic({ ...cfg.worker, surface: cfg.environment, task, strategy: s, budget })
scores[s.name] = r.score
}
return scores
} catch {
return null // transient infra on this task — exclude it
for (const s of strategies) {
const r = await runAgentic({
...cfg.worker,
surface: cfg.environment,
task,
strategy: s,
budget,
})
scores[s.name] = r.score
}
return scores
})

const ok = rows.filter((r): r is Record<string, number> => r !== null)
Expand All @@ -80,8 +113,12 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor

const report: BenchmarkReport = { n: ok.length, excluded: rows.length - ok.length, perStrategy }
const names = strategies.map((s) => s.name)
if (names.includes('refine') && names.includes('sample')) {
report.refineVsSample = pairedLift(ok.map((r) => r.sample ?? 0), ok.map((r) => r.refine ?? 0))
if (names.includes('refine') && names.includes('sample') && ok.length >= 2) {
const b = pairedBootstrap(
ok.map((r) => r.sample ?? 0),
ok.map((r) => r.refine ?? 0),
)
report.refineVsSample = { mean: b.mean, low: b.low, high: b.high, n: b.n }
}
return report
}
Expand All @@ -90,11 +127,14 @@ export async function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkRepor
export function printBenchmarkReport(report: BenchmarkReport): void {
const pct = (x: number) => `${(x * 100).toFixed(1)}%`
const pp = (x: number) => `${x >= 0 ? '+' : ''}${(x * 100).toFixed(1)}pp`
console.log(`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`)
for (const [s, v] of Object.entries(report.perStrategy)) console.log(` ${s.padEnd(8)} ${pct(v ?? 0)}`)
console.log(
`\n=== benchmark · n=${report.n}${report.excluded ? ` (excluded ${report.excluded})` : ''} ===`,
)
for (const [s, v] of Object.entries(report.perStrategy))
console.log(` ${s.padEnd(8)} ${pct(v ?? 0)}`)
const l = report.refineVsSample
if (l) {
const sig = l.low > 0 ? 'SIGNIF +' : l.high < 0 ? 'SIGNIF -' : 'n.s.'
console.log(` refine − sample: ${pp(l.point)} CI [${pp(l.low)}, ${pp(l.high)}] (${sig})`)
console.log(` refine − sample: ${pp(l.mean)} CI [${pp(l.low)}, ${pp(l.high)}] (${sig})`)
}
}
Loading
Loading