tangle-network · drewstone · Jun 9, 2026 · Jun 9, 2026
diff --git a/bench/src/agentic-eops.ts b/bench/src/agentic-eops.ts
@@ -0,0 +1,73 @@
+/**
+ * EnterpriseOps as an `AgenticSurface` — the EOPS instance of the general agentic primitive.
+ *
+ * This is ALL the EOPS-specific code: open = seed an isolated gym DB; tools = the gym's MCP tool
+ * schemas; call = an MCP tools/call; score = the deterministic SQL verifiers; close = delete the DB.
+ * Reuses the primitives proven against the live container in gym-agent.ts. A new domain (Commit0,
+ * AppWorld, terminal-bench) ships its own file like this one — the drivers in agentic.ts never change.
+ */
+
+import type { AgenticSurface, AgenticTask, AgenticTool, ArtifactHandle, SurfaceScore } from './agentic'
+import { callTool, deleteDb, type GymServer, type GymVerifier, loadTools, runVerifiers, seed } from './gym-agent'
+
+/** EOPS-specific payload carried in `AgenticTask.meta`. */
+interface EopsMeta {
+  /** Tool names handed to `loadTools` when the surface lists its tools. */
+  selectedTools: string[]
+  /** Gym server configs; the surface's `open` hook seeds a copy of the first entry. */
+  servers: GymServer[]
+  /** Verifier specs passed to `runVerifiers` by the surface's `score` hook. */
+  verifiers: GymVerifier[]
+}
+
+/**
+ * Pull the EOPS payload out of `task.meta`.
+ *
+ * Only the presence of the `servers` and `verifiers` arrays is checked; their contents
+ * (and `selectedTools`) are not inspected.
+ *
+ * @throws Error when `meta` is absent or either array is missing.
+ */
+function metaOf(task: AgenticTask): EopsMeta {
+  const payload = task.meta as EopsMeta | undefined
+  const hasArrays = Array.isArray(payload?.servers) && Array.isArray(payload?.verifiers)
+  if (!payload || !hasArrays) {
+    throw new Error(`eops surface: task ${task.id} missing meta.servers/verifiers`)
+  }
+  return payload
+}
+
+/**
+ * Convert one EnterpriseOps HF dataset row into an `AgenticTask`.
+ *
+ * `gym_servers_config` and `verifiers` may arrive either as arrays or as JSON-encoded strings;
+ * strings are `JSON.parse`d with no further shape validation. The EOPS-shaped fields go in `meta`.
+ */
+export function eopsTaskFromRow(row: {
+  task_id: string
+  system_prompt: string
+  user_prompt: string
+  selected_tools: string[]
+  gym_servers_config: string | GymServer[]
+  verifiers: string | GymVerifier[]
+}): AgenticTask {
+  // Arrays pass through untouched; strings are treated as JSON-encoded arrays.
+  function decode<T>(value: string | T[]): T[] {
+    if (typeof value !== 'string') return value
+    return JSON.parse(value) as T[]
+  }
+
+  const { task_id, system_prompt, user_prompt, selected_tools, gym_servers_config, verifiers } = row
+  const meta = {
+    servers: decode<GymServer>(gym_servers_config),
+    verifiers: decode<GymVerifier>(verifiers),
+    selectedTools: selected_tools,
+  } satisfies EopsMeta
+  return { id: task_id, systemPrompt: system_prompt, userPrompt: user_prompt, meta }
+}
+
+/**
+ * The EnterpriseOps surface. `gymDbsDir` = the unzipped gym_dbs.zip (resolves seed_database_file).
+ *
+ * Each hook delegates to a gym-agent primitive: `open` → `seed`, `tools` → `loadTools`,
+ * `call` → `callTool`, `score` → `runVerifiers`, `close` → `deleteDb`. The seeded server config
+ * travels between hooks as `handle.ctx`.
+ *
+ * @throws Error from `open`/`tools`/`score` when the task's `meta` is malformed, and from `open`
+ *   when `meta.servers` is empty.
+ */
+export function createEopsSurface(gymDbsDir: string): AgenticSurface {
+  return {
+    name: 'eops',
+    async open(task: AgenticTask): Promise<ArtifactHandle> {
+      const [template] = metaOf(task).servers
+      // An empty `servers` array passes metaOf's shape check; fail loudly here rather than
+      // spreading `undefined` into `{}` and seeding a config-less server.
+      if (!template) {
+        throw new Error(`eops surface: task ${task.id} has no gym server in meta.servers`)
+      }
+      // Clone the server config and seed a fresh isolated DB (seed() stamps server._database_id).
+      const server: GymServer = { ...template }
+      await seed(server, gymDbsDir)
+      return { id: server._database_id ?? task.id, surface: 'eops', ctx: server }
+    },
+    async tools(task: AgenticTask, handle: ArtifactHandle): Promise<AgenticTool[]> {
+      return loadTools(handle.ctx as GymServer, metaOf(task).selectedTools)
+    },
+    call(handle: ArtifactHandle, name: string, args: Record<string, unknown>): Promise<string> {
+      return callTool(handle.ctx as GymServer, name, args)
+    },
+    async score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore> {
+      const server = handle.ctx as GymServer
+      // Verify against the seeded clone from `open`, not the original config in task.meta.
+      // NOTE(review): `as never` sidesteps runVerifiers' declared parameter type — confirm that
+      // `servers` and `verifiers` are the only fields it reads.
+      return runVerifiers({ servers: [server], verifiers: metaOf(task).verifiers } as never)
+    },
+    close(handle: ArtifactHandle): Promise<void> {
+      return deleteDb(handle.ctx as GymServer)
+    },
+  }
+}
diff --git a/bench/src/agentic-run.mts b/bench/src/agentic-run.mts
@@ -0,0 +1,84 @@
+/**
+ * Run the general agentic primitive over EnterpriseOps — depth (sequential, same artifact) and
+ * breadth (parallel), both through the keystone Supervisor. Reports the depth progress-over-shots
+ * curve and a depth-vs-breadth comparison at matched compute.
+ *
+ *   export TANGLE_API_KEY=… EOPS_GYM_DBS_DIR=<unzipped gym_dbs.zip>   # itsm gym on :8006
+ *   TASKS=4 MAX_SHOTS=5 WIDTH=5 INNER_TURNS=4 WORKER_MODEL=gpt-4.1 tsx src/agentic-run.mts
+ */
+
+import { type AgenticOptions, type AgenticTask, runAgentic } from './agentic'
+import { createEopsSurface, eopsTaskFromRow } from './agentic-eops'
+
+/** Read a required environment variable; throws when it is unset or empty. */
+const must = (k: string): string => {
+  const value = process.env[k]
+  if (value === undefined || value === '') {
+    throw new Error(`env ${k} is required`)
+  }
+  return value
+}
+
+/**
+ * Fetch the first `n` rows of the EnterpriseOps-Gym `itsm` split (config `oracle`) from the
+ * Hugging Face datasets-server and convert each row with `eopsTaskFromRow`.
+ *
+ * The `/rows` endpoint limits `length` per request, so rows are fetched in pages; fewer than
+ * `n` tasks are returned when the split runs out of rows.
+ *
+ * @throws Error when `n` is not a non-negative integer or a request returns a non-OK status.
+ */
+async function loadItsmTasks(n: number): Promise<AgenticTask[]> {
+  if (!Number.isInteger(n) || n < 0) {
+    throw new Error(`task count must be a non-negative integer, got ${n}`)
+  }
+  // datasets-server rejects `length` above 100 on a single /rows request.
+  const pageMax = 100
+  const base =
+    'https://datasets-server.huggingface.co/rows?dataset=ServiceNow-AI%2FEnterpriseOps-Gym' +
+    '&config=oracle&split=itsm'
+  const tasks: AgenticTask[] = []
+  while (tasks.length < n) {
+    const want = Math.min(pageMax, n - tasks.length)
+    // Every fetched row becomes a task, so tasks.length doubles as the next row offset.
+    const res = await fetch(`${base}&offset=${tasks.length}&length=${want}`)
+    if (!res.ok) throw new Error(`HF rows ${res.status}`)
+    const body = (await res.json()) as { rows?: Array<{ row: Parameters<typeof eopsTaskFromRow>[0] }> }
+    const page = (body.rows ?? []).slice(0, want)
+    for (const { row } of page) tasks.push(eopsTaskFromRow(row))
+    // A short (or empty) page means the split has no more rows.
+    if (page.length < want) break
+  }
+  return tasks
+}
+
+/** Parse a numeric env var (`fallback` when unset); rejects values that are not finite numbers. */
+function numEnv(name: string, fallback: number): number {
+  const raw = process.env[name]
+  const parsed = Number(raw ?? fallback)
+  if (!Number.isFinite(parsed)) throw new Error(`env ${name} must be a finite number, got "${raw}"`)
+  return parsed
+}
+
+/**
+ * Run depth and matched-compute breadth over the first TASKS itsm tasks, printing one line per
+ * task followed by the mean scores and a verdict.
+ *
+ * Env: TASKS, MAX_SHOTS, WIDTH, TEMPERATURE, INNER_TURNS (numeric; defaults 4, 5, 5, 0.7, 4),
+ * ROUTER_BASE, WORKER_MODEL, and the required TANGLE_API_KEY and EOPS_GYM_DBS_DIR.
+ *
+ * A task whose depth or breadth run throws is logged and left out of the averages.
+ *
+ * @throws Error when an env var is missing/invalid, the task fetch fails, or no task was scored.
+ */
+async function main(): Promise<void> {
+  const nTasks = numEnv('TASKS', 4)
+  const maxShots = numEnv('MAX_SHOTS', 5)
+  const width = numEnv('WIDTH', 5)
+  const opts: AgenticOptions = {
+    routerBaseUrl: process.env.ROUTER_BASE ?? 'https://router.tangle.tools/v1',
+    routerKey: must('TANGLE_API_KEY'),
+    model: process.env.WORKER_MODEL ?? 'gpt-4.1',
+    temperature: numEnv('TEMPERATURE', 0.7),
+    innerTurns: numEnv('INNER_TURNS', 4),
+  }
+  const surface = createEopsSurface(must('EOPS_GYM_DBS_DIR'))
+  const tasks = await loadItsmTasks(nTasks)
+  console.log(`agentic primitive over EOPS: ${tasks.length} itsm tasks, ${opts.model}, maxShots=${maxShots}, width=${width}, innerTurns=${opts.innerTurns}\n`)
+
+  // Failure lines stay short: only the first 110 characters of an Error's message are shown.
+  const brief = (e: unknown): string => (e instanceof Error ? e.message.slice(0, 110) : String(e))
+
+  const rows: Array<{ depth: number; breadth: number; cD: number; cB: number; prog: number[] }> = []
+  for (const [i, task] of tasks.entries()) {
+    let depth: Awaited<ReturnType<typeof runAgentic>>
+    try {
+      depth = await runAgentic({ ...opts, surface, task, mode: 'depth', budget: maxShots })
+    } catch (e) {
+      console.log(`  task ${i}: DEPTH failed — ${brief(e)}`)
+      continue
+    }
+    // Breadth at matched compute: widen until cumulative completions ≥ depth's.
+    let breadthScore = 0
+    let cB = 0
+    let w = 0
+    try {
+      // `w < width * 3` bounds the total breadth budget so the loop ends even when runs report
+      // few or no completions.
+      while (cB < depth.completions && w < width * 3) {
+        // Size the next batch from the remaining completion gap, clamped to [1, width].
+        const step = Math.max(1, Math.min(width, Math.ceil((depth.completions - cB) / (opts.innerTurns ?? 4))))
+        const b = await runAgentic({ ...opts, surface, task, mode: 'breadth', budget: step })
+        cB += b.completions
+        w += step
+        if (b.score > breadthScore) breadthScore = b.score
+      }
+    } catch (e) {
+      // Treat a breadth failure like a depth failure: skip this task rather than abort the
+      // run and lose every row collected so far.
+      console.log(`  task ${i}: BREADTH failed — ${brief(e)}`)
+      continue
+    }
+    rows.push({ depth: depth.score, breadth: breadthScore, cD: depth.completions, cB, prog: depth.progression })
+    console.log(
+      `  task ${i} ${task.id.slice(0, 24)}: DEPTH ${(depth.score * 100).toFixed(0)}% [progress ${depth.progression.map((s) => (s * 100).toFixed(0)).join('→')}] ` +
+        `${depth.completions} comp  vs  BREADTH ${(breadthScore * 100).toFixed(0)}% (best-of-${w}, ${cB} comp)`,
+    )
+  }
+
+  if (rows.length === 0) throw new Error('no scoreable tasks')
+  const mean = (f: (r: (typeof rows)[number]) => number) => rows.reduce((a, r) => a + f(r), 0) / rows.length
+  const d = mean((r) => r.depth)
+  const b = mean((r) => r.breadth)
+  console.log(`\n=== n=${rows.length}, equal compute (~${mean((r) => r.cD).toFixed(0)} vs ${mean((r) => r.cB).toFixed(0)} comp) ===`)
+  console.log(`DEPTH (continue, same artifact): ${(d * 100).toFixed(1)}%`)
+  console.log(`BREADTH (parallel best-of):      ${(b * 100).toFixed(1)}%`)
+  // Report the gap as a magnitude and call a tie a tie, so the wording never contradicts the sign.
+  const verdict =
+    d === b
+      ? 'depth TIES breadth'
+      : `depth ${d > b ? 'BEATS' : 'loses to'} breadth by ${(Math.abs(d - b) * 100).toFixed(1)}pp`
+  console.log(`VERDICT: ${verdict} at equal compute`)
+}
+
+// Entry point: print the failure (stack trace when available) and exit with a non-zero status.
+main().catch((err: unknown) => {
+  const detail = err instanceof Error ? (err.stack ?? err.message) : String(err)
+  console.error(detail)
+  process.exit(1)
+})