Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions bench/src/agentic-eops.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
* EnterpriseOps as an `AgenticSurface` — the EOPS instance of the general agentic primitive.
*
* This is ALL the EOPS-specific code: open = seed an isolated gym DB; tools = the gym's MCP tool
* schemas; call = an MCP tools/call; score = the deterministic SQL verifiers; close = delete the DB.
* Reuses the primitives proven against the live container in gym-agent.ts. A new domain (Commit0,
* AppWorld, terminal-bench) ships its own file like this one — the drivers in agentic.ts never change.
*/

import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from './agentic'
import { callTool, deleteDb, type GymServer, type GymVerifier, loadTools, runVerifiers, seed } from './gym-agent'

/** EOPS-specific payload stored in `AgenticTask.meta`; `metaOf` reads it back and validates it. */
interface EopsMeta {
/** Gym server configs for the task; `open` clones and seeds only the first entry. */
servers: GymServer[]
/** Verifiers handed to `runVerifiers` when the task is scored. */
verifiers: GymVerifier[]
/** Tool names handed to `loadTools` when the surface lists the task's tools. */
selectedTools: string[]
}

/**
 * Read the EOPS payload off a task and validate the parts the surface depends on.
 *
 * @param task - Task whose `meta` is expected to hold an `EopsMeta`.
 * @returns The task's `meta`, typed as `EopsMeta`.
 * @throws Error when `meta` is absent, `servers` is not a non-empty array, or `verifiers` is
 *   not an array. The message carries the task id.
 */
function metaOf(task: AgenticTask): EopsMeta {
  const m = task.meta as EopsMeta | undefined
  // `open` seeds servers[0], so an empty list is as unusable as a missing one: reject it here
  // with the task id rather than letting an empty server config reach the gym.
  const hasServer = !!m && Array.isArray(m.servers) && m.servers.length > 0
  if (!m || !hasServer || !Array.isArray(m.verifiers)) {
    throw new Error(`eops surface: task ${task.id} missing meta.servers/verifiers`)
  }
  return m
}

/**
 * Convert one EnterpriseOps HF dataset row into an `AgenticTask`.
 *
 * The prompt fields map onto the task directly; everything EOPS-shaped (server configs,
 * verifiers, selected tool names) is packed into `meta`. `gym_servers_config` and `verifiers`
 * may arrive either as JSON text or as already-decoded arrays.
 *
 * @param row - The dataset row.
 * @returns The task built from the row.
 * @throws SyntaxError (from `JSON.parse`) when a string-encoded field is not valid JSON.
 */
export function eopsTaskFromRow(row: {
  task_id: string
  system_prompt: string
  user_prompt: string
  selected_tools: string[]
  gym_servers_config: string | GymServer[]
  verifiers: string | GymVerifier[]
}): AgenticTask {
  // Strings are decoded as JSON; arrays are passed through untouched (same reference).
  function decodeList<T>(value: string | T[]): T[] {
    if (typeof value === 'string') {
      return JSON.parse(value) as T[]
    }
    return value
  }

  const meta = {
    servers: decodeList<GymServer>(row.gym_servers_config),
    verifiers: decodeList<GymVerifier>(row.verifiers),
    selectedTools: row.selected_tools,
  } satisfies EopsMeta

  return {
    id: row.task_id,
    systemPrompt: row.system_prompt,
    userPrompt: row.user_prompt,
    meta,
  }
}

/**
 * Build the EnterpriseOps `AgenticSurface`.
 *
 * Each method delegates to a gym-agent primitive: `open` → `seed`, `tools` → `loadTools`,
 * `call` → `callTool`, `score` → `runVerifiers`, `close` → `deleteDb`. The server config that
 * `open` seeds travels with the handle in `ctx`, and the other methods read it back from there.
 *
 * @param gymDbsDir - The unzipped gym_dbs.zip (resolves seed_database_file).
 * @returns The surface, named `'eops'`.
 */
export function createEopsSurface(gymDbsDir: string): AgenticSurface {
  // Handles produced by `open` carry their GymServer in `ctx`.
  const serverOf = (handle: ArtifactHandle): GymServer => handle.ctx as GymServer

  const surface: AgenticSurface = {
    name: 'eops',

    async open(task: AgenticTask): Promise<ArtifactHandle> {
      // Work on a shallow copy of the first server config: seed() stamps `_database_id` on the
      // object it is given, and that must not leak back into the task's own meta.
      const { servers } = metaOf(task)
      const seeded: GymServer = { ...servers[0]! }
      await seed(seeded, gymDbsDir)
      // Fall back to the task id when seed() left no database id on the config.
      const id = seeded._database_id ?? task.id
      return { id, surface: 'eops', ctx: seeded }
    },

    async tools(task: AgenticTask, handle: ArtifactHandle): Promise<AgenticTool[]> {
      const { selectedTools } = metaOf(task)
      return loadTools(serverOf(handle), selectedTools)
    },

    call(handle: ArtifactHandle, name: string, args: Record<string, unknown>): Promise<string> {
      return callTool(serverOf(handle), name, args)
    },

    async score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore> {
      const server = serverOf(handle)
      const { verifiers } = metaOf(task)
      // Verify against the seeded server from the handle, not the task's original config.
      return runVerifiers({ servers: [server], verifiers } as never)
    },

    close(handle: ArtifactHandle): Promise<void> {
      return deleteDb(serverOf(handle))
    },
  }

  return surface
}
84 changes: 84 additions & 0 deletions bench/src/agentic-run.mts
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/**
* Run the general agentic primitive over EnterpriseOps — depth (sequential, same artifact) and
* breadth (parallel), both through the keystone Supervisor. Reports the depth progress-over-shots
* curve and a depth-vs-breadth comparison at matched compute.
*
* export TANGLE_API_KEY=… EOPS_GYM_DBS_DIR=<unzipped gym_dbs.zip> # itsm gym on :8006
* TASKS=4 MAX_SHOTS=5 WIDTH=5 INNER_TURNS=4 WORKER_MODEL=gpt-4.1 tsx src/agentic-run.mts
*/

import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'

/**
 * Read a required environment variable.
 *
 * @param k - Variable name.
 * @returns The variable's value.
 * @throws Error when the variable is unset or set to the empty string.
 */
const must = (k: string): string => {
  const value = process.env[k]
  if (value) return value
  throw new Error(`env ${k} is required`)
}

/**
 * Load the first `n` tasks of the EnterpriseOps-Gym `itsm` split (config `oracle`) from the
 * Hugging Face datasets-server and convert each row with `eopsTaskFromRow`.
 *
 * The `/rows` endpoint caps `length` at 100 per request, so the rows are fetched in pages of at
 * most 100. For `n <= 100` this is a single request with the same URL as before.
 *
 * @param n - Number of tasks wanted; fractional values are floored.
 * @returns Up to `n` tasks — fewer when the split has fewer rows, and none when `n` is not a
 *   positive number.
 * @throws Error `HF rows <status>` when any page request is not OK.
 */
async function loadItsmTasks(n: number): Promise<AgenticTask[]> {
  const pageCap = 100
  const base =
    'https://datasets-server.huggingface.co/rows?dataset=ServiceNow-AI%2FEnterpriseOps-Gym' +
    '&config=oracle&split=itsm'
  const wanted = Math.floor(n)
  const tasks: AgenticTask[] = []
  // A NaN `wanted` fails this comparison, so the loop is skipped and no request is made.
  while (tasks.length < wanted) {
    const length = Math.min(pageCap, wanted - tasks.length)
    // Every fetched row becomes exactly one task, so tasks.length doubles as the row offset.
    const res = await fetch(`${base}&offset=${tasks.length}&length=${length}`)
    if (!res.ok) throw new Error(`HF rows ${res.status}`)
    const body = (await res.json()) as { rows?: Array<{ row: Parameters<typeof eopsTaskFromRow>[0] }> }
    const page = (body.rows ?? []).slice(0, length)
    for (const { row } of page) tasks.push(eopsTaskFromRow(row))
    // A short (or empty) page means the split is exhausted.
    if (page.length < length) break
  }
  return tasks
}

/**
 * Run depth vs breadth over the loaded itsm tasks and print a per-task line plus a summary.
 *
 * Configuration comes from the environment: TASKS, MAX_SHOTS, WIDTH, ROUTER_BASE,
 * TANGLE_API_KEY (required), WORKER_MODEL, TEMPERATURE, INNER_TURNS, EOPS_GYM_DBS_DIR (required).
 *
 * For each task, one depth run is made with budget MAX_SHOTS. Breadth runs are then repeated,
 * keeping the best score, until their cumulative completions reach the depth run's or the total
 * breadth width reaches 3×WIDTH. A task whose depth OR breadth run throws is logged and left out
 * of the summary, so one failing call cannot discard the results already collected.
 *
 * @throws Error `no scoreable tasks` when every task failed (or none were loaded), and whatever
 *   `must` / `loadItsmTasks` throw during setup.
 */
async function main(): Promise<void> {
  const nTasks = Number(process.env.TASKS ?? 4)
  const maxShots = Number(process.env.MAX_SHOTS ?? 5)
  const width = Number(process.env.WIDTH ?? 5)
  const opts: AgenticOptions = {
    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
    routerKey: must('TANGLE_API_KEY'),
    model: process.env.WORKER_MODEL ?? 'gpt-4.1',
    temperature: Number(process.env.TEMPERATURE ?? 0.7),
    innerTurns: Number(process.env.INNER_TURNS ?? 4),
  }
  const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR'))
  const tasks = await loadItsmTasks(nTasks)
  console.log(`agentic primitive over EOPS: ${tasks.length} itsm tasks, ${opts.model}, maxShots=${maxShots}, width=${width}, innerTurns=${opts.innerTurns}\n`)

  const rows: Array<{ depth: number; breadth: number; cD: number; cB: number; prog: number[] }> = []
  for (const [i, task] of tasks.entries()) {
    let depth: Awaited<ReturnType<typeof runAgentic>>
    try {
      depth = await runAgentic({ ...opts, surface, task, mode: 'depth', budget: maxShots })
    } catch (e) {
      console.log(` task ${i}: DEPTH failed — ${e instanceof Error ? e.message.slice(0, 110) : e}`)
      continue
    }
    // Breadth at matched compute: widen until cumulative completions ≥ depth's.
    let breadthScore = 0
    let cB = 0
    let w = 0
    try {
      // `w` grows by at least 1 per pass, so the 3×width cap bounds the loop even when breadth
      // runs report few completions.
      while (cB < depth.completions && w < width * 3) {
        const step = Math.max(1, Math.min(width, Math.ceil((depth.completions - cB) / (opts.innerTurns ?? 4))))
        const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: step })
        cB += b.completions
        w += step
        if (b.score > breadthScore) breadthScore = b.score
      }
    } catch (e) {
      // Same policy as a depth failure: skip the task rather than abort the whole run. A partial
      // breadth result is dropped because it would not be at matched compute.
      console.log(` task ${i}: BREADTH failed — ${e instanceof Error ? e.message.slice(0, 110) : e}`)
      continue
    }
    rows.push({ depth: depth.score, breadth: breadthScore, cD: depth.completions, cB, prog: depth.progression })
    console.log(
      ` task ${i} ${task.id.slice(0, 24)}: DEPTH ${(depth.score * 100).toFixed(0)}% [progress ${depth.progression.map((s) => (s * 100).toFixed(0)).join('→')}] ` +
        `${depth.completions} comp vs BREADTH ${(breadthScore * 100).toFixed(0)}% (best-of-${w}, ${cB} comp)`,
    )
  }

  if (rows.length === 0) throw new Error('no scoreable tasks')
  const mean = (f: (r: (typeof rows)[number]) => number) => rows.reduce((a, r) => a + f(r), 0) / rows.length
  const d = mean((r) => r.depth)
  const b = mean((r) => r.breadth)
  console.log(`\n=== n=${rows.length}, equal compute (~${mean((r) => r.cD).toFixed(0)} vs ${mean((r) => r.cB).toFixed(0)} comp) ===`)
  console.log(`DEPTH (continue, same artifact): ${(d * 100).toFixed(1)}%`)
  console.log(`BREADTH (parallel best-of): ${(b * 100).toFixed(1)}%`)
  // Report the margin as a magnitude and call a tie a tie: the direction is already carried by
  // the wording, so "loses to … by -5.0pp" and "BEATS … by 0.0pp" were both misleading.
  const marginPp = (Math.abs(d - b) * 100).toFixed(1)
  let verdict: string
  if (d > b) verdict = `depth BEATS breadth by ${marginPp}pp`
  else if (d < b) verdict = `depth loses to breadth by ${marginPp}pp`
  else verdict = 'depth TIES breadth'
  console.log(`VERDICT: ${verdict} at equal compute`)
}

// Script entry point: run main(), and on rejection print the stack when there is one (else the
// message, else the stringified value) and exit with status 1.
main().catch((err: unknown) => {
  let report: string
  if (err instanceof Error) {
    report = err.stack ?? err.message
  } else {
    report = String(err)
  }
  console.error(report)
  process.exit(1)
})
Loading
Loading