// File: src/commands/manifest/bazel/bazel-workspace-walk.mts (new file)

/**
 * Walk the directory tree rooted at `cwd` and return every directory that
 * looks like a Bazel workspace root — i.e. contains `MODULE.bazel`,
 * `WORKSPACE`, or `WORKSPACE.bazel`. Real monorepos host multiple roots
 * (e.g. `envoy/mobile/MODULE.bazel`, rules_kotlin's per-example
 * `examples/<name>/MODULE.bazel`); the per-workspace algorithm in the
 * orchestrator runs once per discovered root.
 *
 * The walker is dependency-injected with the directory-prune policy:
 * callers pass the set of basenames and basename prefixes the walk must
 * refuse to descend into. This module intentionally hardcodes none of
 * the "common" prunes (`.git`, `node_modules`, …) — Bazel callers compose
 * the codebase-wide `IGNORED_DIRS` list (`src/utils/glob.mts`) with the
 * Bazel-specific bits (`bazel-*` output_base symlinks,
 * `.socket-auto-manifest`, build-output `dist*`).
 */

import { readdirSync } from 'node:fs'
import path from 'node:path'

import { logger } from '@socketsecurity/registry/lib/logger'

import type { Dirent } from 'node:fs'

// Hard ceiling on number of workspace roots we will surface. Real monorepos
// have well under 50; this cap is a guard against pathological inputs.
const MAX_WORKSPACE_ROOTS = 256
// Hard ceiling on directory walk depth. Deepest workspace marker observed
// across the OSS corpus surveyed is 9 (bazel-self test fixtures); deepest
// in realistic application code is 7 (checkmk's thirdparty layout). Cap
// is set to 8 — one level of headroom over the realistic max, while still
// bounding the walk on pathologically deep trees. Note that symlinked
// directories are never descended into (`Dirent.isDirectory()` reports the
// link itself, not its target), so the cap is a cost bound rather than the
// only defence against symlink loops.
const MAX_WALK_DEPTH = 8
// Files whose presence promotes a directory to a workspace root.
const WORKSPACE_MARKER_FILES = new Set([
  'MODULE.bazel',
  'WORKSPACE',
  'WORKSPACE.bazel',
])

export type FindWorkspaceRootsOptions = {
  // Directory the walk starts from. Resolved against the process cwd when
  // relative, so returned paths are always absolute.
  cwd: string
  // Directory basenames to skip outright (exact match). Pass the union of
  // the codebase-wide ignore set (`IGNORED_DIRS` in `src/utils/glob.mts`)
  // and any caller-specific additions (e.g. `.socket-auto-manifest`).
  ignoreDirNames?: ReadonlySet<string>
  // Directory basename prefixes to skip. Bazel callers pass `['bazel-',
  // 'dist']` so the walk never descends into Bazel's output_base symlinks
  // or build-output directories.
  ignoreDirPrefixes?: readonly string[]
  // When true, log truncation and skipped-directory diagnostics.
  verbose?: boolean
}

const EMPTY_SET: ReadonlySet<string> = new Set<string>()
const EMPTY_ARRAY: readonly string[] = []

// True when `entries` contains a regular file named like a workspace marker.
// Symlinked marker files do not count: `Dirent.isFile()` is false for links.
function hasWorkspaceMarker(entries: readonly Dirent[]): boolean {
  return entries.some(
    entry => entry.isFile() && WORKSPACE_MARKER_FILES.has(entry.name),
  )
}

// True when the caller-supplied prune policy forbids descending into a
// directory with basename `name` (exact-name match or prefix match).
function isPrunedDirName(
  name: string,
  ignoreDirNames: ReadonlySet<string>,
  ignoreDirPrefixes: readonly string[],
): boolean {
  return (
    ignoreDirNames.has(name) ||
    ignoreDirPrefixes.some(prefix => name.startsWith(prefix))
  )
}

/**
 * Walks the tree rooted at `opts.cwd` and returns absolute paths to every
 * directory that contains at least one workspace marker file.
 *
 * - Directories that cannot be read (missing, permission denied, …) are
 *   skipped; the reason is logged when `opts.verbose` is set.
 * - Descent stops below `MAX_WALK_DEPTH` levels under `cwd`, and the walk is
 *   truncated once `MAX_WORKSPACE_ROOTS` roots have been collected.
 * - Symlinked directories are not followed.
 *
 * @param opts Walk root, prune policy, and verbosity.
 * @returns Absolute workspace-root paths, sorted for determinism.
 */
export function findWorkspaceRoots(opts: FindWorkspaceRootsOptions): string[] {
  const { verbose } = opts
  // Resolve up front so the "absolute paths" contract holds even when the
  // caller hands us a relative cwd.
  const cwd = path.resolve(opts.cwd)
  const ignoreDirNames = opts.ignoreDirNames ?? EMPTY_SET
  const ignoreDirPrefixes = opts.ignoreDirPrefixes ?? EMPTY_ARRAY
  const out: string[] = []
  // Tuple stack: [absolute dir, depth from cwd].
  const stack: Array<[string, number]> = [[cwd, 0]]
  while (stack.length) {
    if (out.length >= MAX_WORKSPACE_ROOTS) {
      if (verbose) {
        logger.log(
          `[VERBOSE] workspace walker: hit MAX_WORKSPACE_ROOTS cap (${MAX_WORKSPACE_ROOTS}); truncating walk`,
        )
      }
      break
    }
    const next = stack.pop()
    if (!next) {
      break
    }
    const [dir, depth] = next
    let entries: Dirent[]
    try {
      entries = readdirSync(dir, { withFileTypes: true })
    } catch (e) {
      // Best-effort walk: an unreadable directory must not abort discovery,
      // but surface the reason when the caller asked for diagnostics.
      if (verbose) {
        const reason = e instanceof Error ? e.message : String(e)
        logger.log(
          `[VERBOSE] workspace walker: skipping unreadable directory ${dir}: ${reason}`,
        )
      }
      continue
    }
    // First pass: detect whether this dir is itself a workspace root.
    if (hasWorkspaceMarker(entries)) {
      out.push(dir)
    }
    // Second pass: schedule descents. We descend regardless of whether the
    // current dir is itself a root — nested workspaces are common in
    // monorepos (root MODULE.bazel + examples/*/MODULE.bazel).
    if (depth + 1 > MAX_WALK_DEPTH) {
      continue
    }
    for (const entry of entries) {
      if (!entry.isDirectory()) {
        continue
      }
      const name = entry.name
      if (isPrunedDirName(name, ignoreDirNames, ignoreDirPrefixes)) {
        continue
      }
      stack.push([path.join(dir, name), depth + 1])
    }
  }
  return out.sort()
}

// File: src/commands/manifest/bazel/bazel-workspace-walk.test.mts (new file)

import {
  mkdirSync,
  mkdtempSync,
  rmSync,
  symlinkSync,
  writeFileSync,
} from 'node:fs'
import os from 'node:os'
import path from 'node:path'

import { afterEach, beforeEach, describe, expect, it } from 'vitest'

import { findWorkspaceRoots } from './bazel-workspace-walk.mts'

// Creates an empty file at `file`, creating parent directories as needed.
function touch(file: string): void {
  mkdirSync(path.dirname(file), { recursive: true })
  writeFileSync(file, '')
}

// Standard prune set Bazel callers pass: the codebase-wide IGNORED_DIRS
// (.git, node_modules, etc.) plus the walker's own output dir, plus
// `bazel-*` output_base symlinks and `dist*` build outputs. Replicated
// inline here so the test stays decoupled from `src/utils/glob.mts`.
const BAZEL_IGNORE_NAMES: ReadonlySet<string> = new Set([
  '.git',
  '.hg',
  '.idea',
  '.pnpm-store',
  '.socket-auto-manifest',
  '.svn',
  '.vscode',
  'node_modules',
])
const BAZEL_IGNORE_PREFIXES: readonly string[] = ['bazel-', 'dist']

describe('bazel-workspace-walk', () => {
  // Fresh scratch directory per test; removed in afterEach.
  let tmp: string

  beforeEach(() => {
    tmp = mkdtempSync(path.join(os.tmpdir(), 'sock-bazel-walk-'))
  })

  afterEach(() => {
    rmSync(tmp, { recursive: true, force: true })
  })

  describe('findWorkspaceRoots', () => {
    it('returns the root when only the root has MODULE.bazel', () => {
      touch(path.join(tmp, 'MODULE.bazel'))
      expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp])
    })

    it('detects WORKSPACE and WORKSPACE.bazel as root markers', () => {
      touch(path.join(tmp, 'WORKSPACE'))
      expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp])
      rmSync(path.join(tmp, 'WORKSPACE'))
      touch(path.join(tmp, 'WORKSPACE.bazel'))
      expect(findWorkspaceRoots({ cwd: tmp })).toEqual([tmp])
    })

    it('finds nested workspaces at arbitrary depth', () => {
      touch(path.join(tmp, 'MODULE.bazel'))
      touch(path.join(tmp, 'examples', 'dagger', 'MODULE.bazel'))
      touch(path.join(tmp, 'examples', 'android', 'nested', 'WORKSPACE.bazel'))
      const found = findWorkspaceRoots({ cwd: tmp }).map(p =>
        path.relative(tmp, p),
      )
      // Build expectations with path.join so the assertion does not depend
      // on the platform's path separator.
      expect(found).toEqual([
        '',
        path.join('examples', 'android', 'nested'),
        path.join('examples', 'dagger'),
      ])
    })

    it('returns [] when there is no workspace root', () => {
      writeFileSync(path.join(tmp, 'README.md'), '')
      expect(findWorkspaceRoots({ cwd: tmp })).toEqual([])
    })

    it('does NOT prune by default — pruning policy is caller-supplied', () => {
      // No ignoreDirNames / ignoreDirPrefixes passed in: any sub-directory
      // containing a workspace marker gets surfaced, even node_modules.
      touch(path.join(tmp, 'MODULE.bazel'))
      touch(path.join(tmp, 'node_modules', 'MODULE.bazel'))
      const found = findWorkspaceRoots({ cwd: tmp }).map(p =>
        path.relative(tmp, p),
      )
      expect(found).toEqual(['', 'node_modules'])
    })

    it('prunes injected ignoreDirNames', () => {
      touch(path.join(tmp, 'MODULE.bazel'))
      for (const dir of ['node_modules', '.git', '.socket-auto-manifest']) {
        touch(path.join(tmp, dir, 'sub', 'MODULE.bazel'))
      }
      const found = findWorkspaceRoots({
        cwd: tmp,
        ignoreDirNames: BAZEL_IGNORE_NAMES,
      }).map(p => path.relative(tmp, p))
      expect(found).toEqual([''])
    })

    it('prunes injected ignoreDirPrefixes (bazel-* symlinks)', () => {
      // Simulate `bazel-out` pointing at a directory that contains a copy of
      // MODULE.bazel. With the `bazel-` prefix injected, the walk must skip
      // it; otherwise the walker would surface workspaces from <output_base>.
      const fakeOutputBase = mkdtempSync(
        path.join(os.tmpdir(), 'sock-fake-outbase-'),
      )
      try {
        mkdirSync(path.join(fakeOutputBase, 'external', 'maven'), {
          recursive: true,
        })
        touch(path.join(fakeOutputBase, 'external', 'maven', 'MODULE.bazel'))
        symlinkSync(fakeOutputBase, path.join(tmp, 'bazel-out'))
        // The walker never descends into symlinks (Dirent.isDirectory() is
        // false for them), so the symlink alone cannot prove the prefix
        // prune works. Add a real `bazel-*` directory that WOULD be walked
        // if the prefix were not honoured.
        touch(path.join(tmp, 'bazel-bin', 'external', 'maven', 'MODULE.bazel'))
        touch(path.join(tmp, 'MODULE.bazel'))
        const found = findWorkspaceRoots({
          cwd: tmp,
          ignoreDirPrefixes: BAZEL_IGNORE_PREFIXES,
        }).map(p => path.relative(tmp, p))
        expect(found).toEqual([''])
      } finally {
        rmSync(fakeOutputBase, { recursive: true, force: true })
      }
    })

    it('prunes injected dist* prefix', () => {
      touch(path.join(tmp, 'MODULE.bazel'))
      touch(path.join(tmp, 'dist', 'MODULE.bazel'))
      touch(path.join(tmp, 'distribution', 'MODULE.bazel'))
      const found = findWorkspaceRoots({
        cwd: tmp,
        ignoreDirPrefixes: BAZEL_IGNORE_PREFIXES,
      }).map(p => path.relative(tmp, p))
      expect(found).toEqual([''])
    })

    it('stops descending past the maximum walk depth', () => {
      // Mirrors the walker's depth cap of 8: a marker in a directory 8
      // levels below cwd is found, one 9 levels below is not. Update this
      // test if the cap changes.
      const segments = ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9']
      const atCap = segments.slice(0, 8)
      touch(path.join(tmp, ...atCap, 'MODULE.bazel'))
      touch(path.join(tmp, ...segments, 'MODULE.bazel'))
      const found = findWorkspaceRoots({ cwd: tmp }).map(p =>
        path.relative(tmp, p),
      )
      expect(found).toEqual([path.join(...atCap)])
    })

    it('returns absolute, sorted paths', () => {
      touch(path.join(tmp, 'z', 'MODULE.bazel'))
      touch(path.join(tmp, 'a', 'MODULE.bazel'))
      touch(path.join(tmp, 'm', 'MODULE.bazel'))
      const found = findWorkspaceRoots({ cwd: tmp })
      expect(found).toEqual([
        path.join(tmp, 'a'),
        path.join(tmp, 'm'),
        path.join(tmp, 'z'),
      ])
      for (const p of found) {
        expect(path.isAbsolute(p)).toBe(true)
      }
    })

    it('handles an unreadable directory by skipping it (no throw)', () => {
      // The "unreadable" directory here is a cwd that does not exist:
      // readdirSync throws ENOENT and the walker must swallow it.
      touch(path.join(tmp, 'MODULE.bazel'))
      expect(findWorkspaceRoots({ cwd: path.join(tmp, 'nope') })).toEqual([])
    })
  })
})