From 0ba6e8626bf5873ac7e3f9eb04e14fcac5a1f459 Mon Sep 17 00:00:00 2001 From: Connor McDonald Date: Wed, 1 Jul 2026 18:59:51 +0200 Subject: [PATCH] feat(okf): OKF-conformant hubs + freshness provenance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make Surface hubs a superset of an Open Knowledge Format (OKF) concept, so every hub is a conformant OKF document that any OKF consumer can read, while Surface adds the freshness OKF deliberately omits. - Frontmatter carries OKF `type` (defaulted to `concept`, existing hubs byte-unchanged), `title`/`tags`/`timestamp`, and preserves any other key verbatim in `extra` (the OKF preserve-unknown-keys rule). Relaxed `deny_unknown_fields` on the frontmatter only; anchor items stay strict. - `surf verify` records the freshness OKF lacks: a stable per-claim `id` (written once, never regenerated) plus `verified_at`/`verified_commit`, stamped only when the hash changes so a no-op verify stays byte-identical. The "who" is intentionally not stored (git blame has it) — no author emails in tracked files. Additive `id` on the JSON `Divergence`. - `bundles` glob governs an OKF bundle directory tree; reserved files (index.md/log.md) are classified and skipped, never blocking the gate. - `surf lint` warns (never blocks) on typo'd frontmatter keys, missing headlines, and dangling OKF cross-links. - Docs: new docs/guides/okf.md; README/index/configuration/CHANGELOG. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 30 ++++ README.md | 10 ++ docs/guides/okf.md | 104 ++++++++++++++ docs/index.md | 12 +- docs/reference/configuration.md | 36 +++++ hubs/cli-check.md | 39 ++++-- hubs/cli-workspace.md | 15 +- hubs/hub-format.md | 65 ++++++--- surf-cli/src/check.rs | 66 +++++++++ surf-cli/src/git.rs | 15 ++ surf-cli/src/lint.rs | 236 ++++++++++++++++++++++++++++++++ surf-cli/src/new.rs | 2 + surf-cli/src/verify.rs | 166 +++++++++++++++++++++- surf-cli/src/workspace.rs | 66 +++++++-- surf-core/src/config.rs | 19 ++- surf-core/src/hub.rs | 189 ++++++++++++++++++++++--- surf-core/src/lib.rs | 5 +- surf-core/src/report.rs | 4 + 18 files changed, 1009 insertions(+), 70 deletions(-) create mode 100644 docs/guides/okf.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 21c7a3b..0ff5a98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,36 @@ project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- **Hubs are now conformant Open Knowledge Format (OKF) concepts — "Surface = OKF + freshness."** A + hub's frontmatter is a superset of an [OKF](docs/guides/okf.md) concept: it carries OKF's `type` + (defaulted to `concept` so existing hubs are byte-unchanged and keep working), `title`, `tags`, + and `timestamp`, and **preserves any other frontmatter key verbatim** (OKF's `description`/ + `resource`, a doc system's `author`/`created`/`pinned`) in an `extra` map — the OKF rule that + consumers must not drop unknown keys. So a hub drops into any OKF consumer (Google's Knowledge + Catalog, the OKF visualizer, Obsidian, git-backed editors), which read the prose and ignore the + `anchors:` Surface governs. `surf new` scaffolds `type: concept`. 
+- **Freshness provenance and stable claim IDs.** `surf verify` now records the freshness OKF omits: + a stable per-claim `id` (written once, never regenerated, so it survives prose/anchor edits — the + substrate for claim timelines) plus `verified_at` and `verified_commit`. The *who* is deliberately + not stored (git blame on the hub records it) — keeping author emails out of tracked files. + Provenance is written **only when the hash actually changes**, so a no-op re-verify stays + byte-identical. The `--format json` `Divergence` gains an additive `id` field (no report-version bump). +- **OKF bundle layout.** `surf.toml` gains a `bundles` glob: each root is a directory *tree* of + concept files, expanded as `<root>/**/*.md`. OKF reserved files (`index.md`, `log.md`) are + recognized and skipped for governance — they hold no claims and never block the gate. +- **`surf lint` OKF advisories.** A `Warn` (never a block) for an unknown frontmatter key that looks + like a typo of a known one (recovering the fail-closed signal below), a `Warn` when an anchored + hub has no headline (`summary`/`title`/`description`), and a `Warn` for a dangling OKF cross-link + in a hub body (OKF tolerates broken links, so this never blocks). + +### Changed +- **Frontmatter no longer rejects unknown keys** (OKF requires consumers to preserve them). This is + a deliberate, narrow relaxation of the gate's fail-closed posture: a mistyped *frontmatter* key + that previously hard-blocked now parses and earns a `surf lint` warning instead. Unknown keys + **inside an anchor item** still fail closed — anchor items are Surface's own data, not OKF's. +- `summary` is now optional (an OKF concept may carry only `description`). + ## [0.7.0] - 2026-06-29 ### Added diff --git a/README.md b/README.md index 07530ca..6192ade 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,15 @@ Surface's JSON output. The core never depends on it. 
More in [What Surface does NOT do](docs/index.md#what-surface-does-not-do) and [Is Surface for you?](docs/index.md#is-surface-for-you). +## Speaks OKF + +A hub is a conformant [Open Knowledge Format](docs/guides/okf.md) concept — Google's vendor-neutral +standard for knowledge as markdown + frontmatter. OKF standardizes how knowledge is written down but +deliberately omits freshness; that's exactly what Surface adds. **Surface = OKF + the freshness OKF +leaves out.** Your hubs drop into any OKF consumer (Knowledge Catalog, the OKF visualizer, Obsidian, +git-backed doc editors), which read the prose and ignore the `anchors:` Surface governs. See +[Surface and OKF](docs/guides/okf.md). + ## Install Most repos never install the binary — they run the GitHub Action: @@ -163,6 +172,7 @@ Full docs at **[surface.gradientdev.xyz](https://surface.gradientdev.xyz)** (sou - [Quickstart](docs/getting-started/quickstart.md) · [Install](docs/getting-started/install.md) - [Authoring hubs](docs/guides/authoring-hubs.md) — claims, anchor grammar, granularity, the verify loop. +- [Surface and OKF](docs/guides/okf.md) — hubs as conformant Open Knowledge Format concepts. - [CI integration](docs/guides/ci-integration.md) — the Action, the pre-commit hook, scoping a PR. - [Examples](docs/examples.md) — a minimal hub in each supported language. - Reference: [Commands](docs/reference/commands.md) · [Configuration](docs/reference/configuration.md) · [How the gate works](docs/reference/how-it-works.md) · [FAQ](docs/reference/faq.md) diff --git a/docs/guides/okf.md b/docs/guides/okf.md new file mode 100644 index 0000000..551d213 --- /dev/null +++ b/docs/guides/okf.md @@ -0,0 +1,104 @@ +--- +title: Surface and the Open Knowledge Format +description: A Surface hub is a conformant OKF concept; Surface adds the freshness OKF deliberately leaves out. Produce, consume, and govern OKF bundles. 
+--- + +Google's [Open Knowledge Format](https://github.com/GoogleCloudPlatform/knowledge-catalog/tree/main/okf) +(OKF) is a vendor-neutral standard for representing knowledge as a directory of markdown files with +YAML frontmatter. It standardizes *how knowledge is written down* so a wiki produced by one tool can +be consumed by another without translation. + +OKF is deliberately minimal. Its spec requires exactly one frontmatter field (`type`), recommends a +few more (`title`, `description`, `resource`, `tags`, `timestamp`), and mandates that consumers +**preserve unknown keys** and **never reject** a document for extra fields, unknown types, or broken +links. And it explicitly leaves several things undefined — most importantly, **freshness**: OKF has +no notion of "has the thing this describes changed?", no verification status, no anchoring to code, +no drift detection. + +That gap is exactly what Surface fills. + +## Surface = OKF + freshness + +> A Surface hub is a **conformant OKF concept**. Surface adds the freshness layer OKF omits. 
+ +Because OKF ignores keys it doesn't recognize, Surface's governance metadata rides along inside an +otherwise-normal OKF concept: + +```markdown +--- +type: BigQuery Table # OKF: the one required field +title: Orders # OKF: display name +description: One row per order # OKF: preserved (Surface keeps it in `extra`) +tags: [sales, revenue] # OKF +timestamp: 2026-05-28T14:30:00Z # OKF: last *modified* +anchors: # Surface extension — OKF readers ignore it + - claim: an order is immutable once `status = shipped` + at: src/orders/model.ts > Order > freeze + hash: 2:9b1c33ade8f1 + id: c_18be… # stable identity (claim timelines) + verified_at: 2026-06-30T09:12:00Z # last *attested* against the code + verified_commit: 4d5e6f2 # who is recovered from git blame, not stored +--- + +# Orders + +Prose a human or agent reads to understand this table… +``` + +- **An OKF consumer** — Google's Knowledge Catalog, the OKF visualizer, [Nansidian](#doc-systems), + Obsidian — reads this as a normal concept and silently ignores `anchors`. +- **Surface** reads `anchors` and governs freshness: when `src/orders/model.ts > Order > freeze` + changes, [`surf check`](../reference/commands.md) fails until a human re-confirms the claim. +- The two timestamps are different on purpose: OKF's `timestamp` is *last modified*; Surface's + per-claim `verified_at` is *last attested against the code* — the freshness OKF can't express. + +## What conformance means here + +A hub is a conformant OKF concept when it carries a `type`. Surface makes this cheap: + +- `surf new` scaffolds hubs with `type: concept` already set. +- Hubs written before OKF (no `type`) still parse — Surface treats a missing `type` as `concept` + in memory. They are byte-unchanged on disk; run a future `surf migrate` (or add `type:` by hand) + to make them OKF-conformant *on disk*. 
+- Any extra frontmatter key (OKF's `description`/`resource`, a doc system's `author`/`created`/ + `pinned`) is preserved verbatim on round-trip — Surface never drops what it doesn't recognize. + +The one boundary worth stating plainly: + +> **Surface only fact-checks concepts that describe code.** A concept anchored to a code symbol is +> governed. A concept with no `anchors` (a BigQuery table's business meaning, an RFC, a playbook) +> is a valid, rendered, **ungoverned** OKF concept — it passes the gate untouched. Verifying +> non-code resources (e.g. a table's schema against the live warehouse) is future work; today the +> deterministic gate is scoped to code. + +## Bundles + +OKF ships knowledge as a **bundle**: a directory tree where each file is a concept, the path is its +identity, and two filenames are reserved — `index.md` (a directory listing for progressive +disclosure) and `log.md` (a change history). Point Surface at a bundle with `bundles` in +`surf.toml`: + +```toml +# Govern a flat hubs/ dir, an in-repo OKF bundle, or both. +hubs = ["hubs/*.md"] +bundles = ["knowledge/sales"] # expands to knowledge/sales/**/*.md +``` + +Reserved files are recognized and skipped for governance (they hold no claims), so a bundle's +`index.md`/`log.md` never trip the gate. `surf lint` additionally checks OKF cross-links in a hub's +body and **warns** (never blocks — OKF tolerates broken links) on a dangling `.md` target. + +## Doc systems + +Because an OKF bundle is just markdown in a directory, it drops into any doc system that reads +markdown — [Obsidian](https://obsidian.md/) vaults, Notion imports, and git-backed editors. The +intended integration is a **CI gate on the git repos those systems sync**: run +[`surf check`](../guides/ci-integration.md) as a GitHub Action over the doc repo, so the freshness +gate guards the code-anchored subset of the knowledge base wherever the docs are edited. 
+ +## See also + +- [Authoring hubs](./authoring-hubs.md) — a hub is an onboarding doc, not a claim-log (the same + shape OKF's prose-first concepts encourage). +- [Configuration](../reference/configuration.md) — the `bundles` glob and frontmatter fields. +- [How the gate works](../reference/how-it-works.md) — what Surface hashes and why. diff --git a/docs/index.md b/docs/index.md index 4fe4b10..82ad265 100644 --- a/docs/index.md +++ b/docs/index.md @@ -91,6 +91,16 @@ If you want the fuzzy "is this claim still true" judgment, that lives in an **op plugin that reads Surface's JSON output. The core never depends on it. Pull every plugin out and the gate blocks and passes exactly the same. +## Interoperable: Surface speaks OKF + +A hub is a **conformant [Open Knowledge Format](./guides/okf.md) concept** — Google's vendor-neutral +standard for knowledge as markdown + frontmatter. OKF standardizes *how knowledge is written down* +but deliberately omits **freshness**: it has no notion of whether the thing a document describes has +changed. That omission is precisely what Surface is. So the relationship is clean: **Surface = OKF + +the freshness OKF leaves out.** Your hubs drop into any OKF consumer (Google's Knowledge Catalog, +the OKF visualizer, Obsidian, git-backed doc editors), which read the knowledge and ignore the +`anchors:` Surface governs. See [Surface and OKF](./guides/okf.md). + ## Is Surface for you? Honestly? Maybe not. Roughly, it earns its keep when @@ -116,5 +126,5 @@ Agents are a multiplier, not the foundation. 
## Next - [Install](./getting-started/install.md) · [Quickstart](./getting-started/quickstart.md) -- [Authoring hubs](./guides/authoring-hubs.md) · [CI integration](./guides/ci-integration.md) · [Examples](./examples.md) +- [Authoring hubs](./guides/authoring-hubs.md) · [CI integration](./guides/ci-integration.md) · [Surface and OKF](./guides/okf.md) · [Examples](./examples.md) - Reference: [Commands](./reference/commands.md) · [Configuration](./reference/configuration.md) · [How the gate works](./reference/how-it-works.md) · [Hash recipes](./reference/hash-recipes.md) · [FAQ](./reference/faq.md) diff --git a/docs/reference/configuration.md b/docs/reference/configuration.md index f52111f..82cf500 100644 --- a/docs/reference/configuration.md +++ b/docs/reference/configuration.md @@ -30,6 +30,42 @@ layout and the trade-offs). > derived from the first pattern (e.g. `docs/hubs/*.md` → `docs/hubs/`); the other patterns are > still linted and verified normally, they're just not where `new` writes. +## OKF bundles + +To govern an [Open Knowledge Format](../guides/okf.md) bundle — a directory *tree* of concept files +rather than a flat folder — list its root(s) under `bundles`. Each root expands to `<root>/**/*.md`: + +```toml +hubs = ["hubs/*.md"] # optional; flat layout +bundles = ["knowledge/sales"] # an in-repo OKF bundle (recursively) +``` + +`hubs` and `bundles` are unioned. OKF reserved files (`index.md`, `log.md`) swept up by a bundle +glob are recognized and **skipped for governance** — they hold no claims, so a missing frontmatter +fence in them never blocks the gate. + +## Frontmatter fields + +A hub is a **superset of an OKF concept**, so its frontmatter carries both OKF fields and Surface's +governance fields: + +| Field | Source | Notes | +| --- | --- | --- | +| `type` | OKF | The one field OKF requires. Defaults to `concept` when absent (existing hubs keep working). | +| `title` | OKF | Display name. | +| `summary` | Surface | The onboarding one-liner (optional). 
Distinct from OKF `description`. | +| `tags` | OKF | Cross-cutting tags. | +| `timestamp` | OKF | Last *modified* (ISO 8601). Distinct from a claim's `verified_at` (last *attested*). | +| `anchors` | Surface | The claims — the governed part. See [Authoring hubs](../guides/authoring-hubs.md). | +| `refs` / `covers` | Surface | Composition edges and advisory coverage globs. | +| *(any other key)* | OKF / tools | Preserved verbatim (e.g. OKF `description`/`resource`, a doc system's `author`/`created`). | + +Unknown *frontmatter* keys are preserved, not rejected (the OKF rule); a key that looks like a typo +of a known one earns a `surf lint` warning. Unknown keys **inside an anchor item** still fail +closed. Each claim also gains freshness provenance the first time `surf verify` stamps it: a stable +`id`, plus `verified_at` and `verified_commit` (the *who* is left to git blame, so no author email +lands in tracked files). + ## Languages TypeScript (`.ts`, `.tsx`, `.mts`, `.cts`), JavaScript/JSX (`.js`, `.jsx`, `.mjs`, `.cjs`), Rust diff --git a/hubs/cli-check.md b/hubs/cli-check.md index 4bcfcb1..1ed77f0 100644 --- a/hubs/cli-check.md +++ b/hubs/cli-check.md @@ -8,7 +8,10 @@ anchors: a mismatch → Changed; a clean match is tagged with whether the stamp was still v1. The verdict is deterministic and needs no git. at: surf-cli/src/check.rs > check_claim - hash: 2:66e7b4149d60 + hash: 2:7e984c24d736 + id: c_18be38a62d95d1d80001 + verified_at: 2026-07-01T16:53:08Z + verified_commit: 7c5aabe74da3b56ff680044aeb3b20747b606479 - claim: > Scoping is opt-in and intersective: with neither --base nor --files every claim is checked. 
A claim is in scope when any of its anchored files matches each active filter — the --base @@ -19,17 +22,22 @@ anchors: at: surf-cli/src/check.rs > Scope > includes hash: 2:64277175938c - claim: > - The gate fails closed: a hub whose frontmatter won't parse yields an Unresolvable - divergence (blocking the run) rather than being silently skipped, so a frontmatter typo - can't pass as clean. After the per-claim walk it propagates refs one hop — a hub that - directly references a stale hub (or a stale claim within one) inherits a ReferencedStale - divergence, built only from base divergences so a chain stops at the first hop. Alongside - the divergences it returns the --files patterns that matched no anchored file (run warns on - stderr for each and exits non-zero when every pattern matched nothing, so a typo'd --files - can't read as a clean run) and a count of clean anchors still stamped under v1, so run can - nudge the one-time `surf verify` upgrade. + The gate fails closed for concepts: a concept hub whose frontmatter won't parse yields an + Unresolvable divergence (blocking the run) rather than being silently skipped, so a + frontmatter typo can't pass as clean. OKF reserved files (index.md/log.md) are the exception + — they carry no claims, so they are skipped entirely and never block even without frontmatter. + After the per-claim walk it propagates refs one hop — a hub that directly references a stale + hub (or a stale claim within one) inherits a ReferencedStale divergence, built only from base + divergences so a chain stops at the first hop. Alongside the divergences it returns the + --files patterns that matched no anchored file (run warns on stderr for each and exits + non-zero when every pattern matched nothing, so a typo'd --files can't read as a clean run) + and a count of clean anchors still stamped under v1, so run can nudge the one-time + `surf verify` upgrade. 
at: surf-cli/src/check.rs > check_workspace - hash: 2:b7b7fd55206e + hash: 2:bed6c18b3cc1 + id: c_18be38a62f7582a00002 + verified_at: 2026-07-01T16:53:08Z + verified_commit: 7c5aabe74da3b56ff680044aeb3b20747b606479 refs: - ./cli-git.md - ./cli-verify.md @@ -44,10 +52,11 @@ produces the same answer; the git helpers in [`cli-git.md`](./cli-git.md) only f `check_claim` is the per-claim verdict; `check_workspace` walks every hub, and `Scope` narrows which claims it evaluates when `--base` or `--files` is given — opt-in and intersective, falling -back to a full check rather than checking nothing. Any divergence (including a hub whose -frontmatter won't parse — the gate fails closed) makes `run` exit non-zero. A hub also fails when a -hub it [`refs`](./hub-format.md) is stale: composition propagates one hop (#4), so the gate that -flags a dependency flags everything built on it. +back to a full check rather than checking nothing. Any divergence (including a *concept* hub whose +frontmatter won't parse — the gate fails closed) makes `run` exit non-zero. OKF reserved files +(`index.md`/`log.md`) hold no claims, so they are skipped rather than governed. A hub also fails +when a hub it [`refs`](./hub-format.md) is stale: composition propagates one hop (#4), so the gate +that flags a dependency flags everything built on it. **Boundary:** green means "nothing anchored changed since last sign-off," not "the prose is true"; that confirmation is [`surf verify`](./cli-verify.md)'s job, not the gate's. diff --git a/hubs/cli-workspace.md b/hubs/cli-workspace.md index 4eb0716..e32b266 100644 --- a/hubs/cli-workspace.md +++ b/hubs/cli-workspace.md @@ -7,10 +7,13 @@ anchors: at: surf-cli/src/workspace.rs > Workspace > discover hash: 2:7d57c89fcc0d - claim: > - hub_paths globs the config's hub patterns relative to the discovered root, sorted and - deduped. 
+ hub_paths globs the config's hub patterns relative to the discovered root, then expands each + OKF bundle root as `<root>/**/*.md`, returning the combined set sorted and deduped. at: surf-cli/src/workspace.rs > Workspace > hub_paths - hash: 2:c69c8264bcfd + hash: 2:0e986d323b98 + id: c_18be38a6357318480003 + verified_at: 2026-07-01T16:53:08Z + verified_commit: 7c5aabe74da3b56ff680044aeb3b20747b606479 refs: - ./cli-check.md - ./cli-lint.md @@ -26,8 +29,10 @@ resolved root rather than the caller's current directory. `discover` is what makes `surf` runnable from any subdirectory — it walks up to the nearest `surf.toml` (the same root-finding git and ruff use) and errors if none is found, so a stray invocation outside a project fails loudly instead of silently governing nothing. The resolved root -is the base every anchor path is joined against, and `hub_paths` globs the configured patterns -relative to it (sorted and deduped) to enumerate the hubs. +is the base every anchor path is joined against, and `hub_paths` enumerates the hubs by globbing the +configured `hubs` patterns and expanding any OKF `bundles` roots (each as `<root>/**/*.md`), sorted +and deduped. Reserved OKF files swept up this way are classified on `LoadedHub` and skipped by the +governing commands. **Boundary:** discovery and enumeration only — it parses no hub bodies and resolves no anchors; that is [`lint`](./cli-lint.md)/[`check`](./cli-check.md)'s job over the files this hands back. diff --git a/hubs/hub-format.md b/hubs/hub-format.md index e359391..849f797 100644 --- a/hubs/hub-format.md +++ b/hubs/hub-format.md @@ -1,18 +1,34 @@ --- -summary: The hub document format and the minimal-diff frontmatter editor used by verify. +summary: The hub document format (an OKF concept superset) and the minimal-diff frontmatter editor used by verify. 
anchors: - claim: > - A hub is a `---`-fenced YAML frontmatter block followed by a markdown body; `at:` is a - scalar or a list, hash is optional until verified, and unknown fields are rejected — while - `refs`/`covers` are accepted and stored verbatim, parse_hub resolving neither (acting on them - is lint/check's job). - at: surf-core/src/hub.rs > parse_hub - hash: 2:c510c6032ba7 + A hub is a `---`-fenced YAML frontmatter block followed by a markdown body, and its + frontmatter is a superset of an OKF concept: `type` (defaulted to `concept`, so pre-OKF hubs + stay valid), `title`, `tags`, `timestamp` sit alongside Surface's `anchors`/`refs`/`covers`, + and every other key (OKF `description`/`resource`, a doc system's `author`/`created`) is + preserved verbatim in `extra` — unknown *frontmatter* keys are kept, not rejected, per OKF. + Inside an anchor item `at:` is a scalar or list, `hash` is optional until verified, and + unknown keys there ARE still rejected (a per-anchor typo fails closed). parse_hub resolves + neither refs nor covers — acting on them is lint/check's job. + at: + - surf-core/src/hub.rs > parse_hub + - surf-core/src/hub.rs > Frontmatter + - surf-core/src/hub.rs > Claim + hash: 2:6f2be9c95177 + id: c_18be38a6388e79780004 + verified_at: 2026-07-01T17:29:13Z + verified_commit: e9e86af7ce662b0f9b26eb379e952d09d9685c05 - claim: > - verify writes hashes back surgically: set_anchor_hash locates the Nth anchor item and - replaces/inserts only its hash line, so an unchanged hash is byte-identical. - at: surf-core/src/hub.rs > set_anchor_hash - hash: 2:29805baa85ea + verify writes fields back surgically: set_anchor_field (which set_anchor_hash wraps) locates + the Nth anchor item and replaces/inserts only that one key's line, so an unchanged write is + byte-identical — the same primitive stamps hash, id, and verified_* provenance. 
+ at: + - surf-core/src/hub.rs > set_anchor_field + - surf-core/src/hub.rs > set_anchor_hash + hash: 2:592b1c643978 + id: c_18be38a639b1f2a80005 + verified_at: 2026-07-01T16:53:09Z + verified_commit: 7c5aabe74da3b56ff680044aeb3b20747b606479 refs: - ./cli-lint.md - ./cli-check.md @@ -22,17 +38,26 @@ covers: # Hub format -A hub is the unit every command reads and writes: a `---`-fenced YAML frontmatter block (the -machine-checkable `anchors`) followed by a markdown body (the prose a human or agent reads). -`parse_hub` is the contract everything else binds to — its shape is why `at:` can be a scalar or a -list, why `hash` is optional until verified, and why unknown fields are rejected (so a typo can't -masquerade as a new field) while `refs`/`covers` are accepted and lint-validated — `covers` never -gates, but a stale `refs` target now propagates into the [`check`](./cli-check.md) verdict (#4). +A hub is the unit every command reads and writes: a `---`-fenced YAML frontmatter block (OKF +concept fields plus Surface's machine-checkable `anchors`) followed by a markdown body (the prose a +human or agent reads). `parse_hub` is the contract everything else binds to. + +**A hub is an OKF concept, plus freshness.** The frontmatter is a *superset* of an +[Open Knowledge Format](../docs/guides/okf.md) concept: it carries OKF's `type`/`title`/`tags`/ +`timestamp` (and preserves any other key in `extra`, since OKF requires consumers to keep unknown +fields), so a hub is a conformant OKF concept that any OKF reader can consume — while Surface's +`anchors` add the freshness OKF omits. That is why `deny_unknown_fields` is *off* for the +frontmatter (a typo'd key is caught by a `surf lint` warning instead of a hard error) but stays +*on* for each anchor item, where an unknown key is a genuine mistake that should fail closed. +`covers` never gates; a stale `refs` target propagates into the [`check`](./cli-check.md) verdict (#4). 
**The distinction that drives the design:** a human reviews every write, so edits must be -*surgical*. Writes go through the line-level editor (`set_anchor_hash` / `set_anchor_at`) rather -than re-serializing the frontmatter — re-serializing would reorder keys and reflow scalars, burying -the one changed line in a noisy diff. An unchanged hash rewrite is therefore byte-identical. +*surgical*. Writes go through the line-level editor (`set_anchor_field`, which `set_anchor_hash` +wraps, plus `set_anchor_at`) rather than re-serializing the frontmatter — re-serializing would +reorder keys, reflow scalars, and drop the preserved `extra` ordering, burying the one changed line +in a noisy diff. An unchanged write is therefore byte-identical, which is what keeps a no-op +`surf verify` from churning the file (and what lets it stamp `id`/`verified_*` provenance only when +the hash actually changed). **Boundary:** this module is pure parsing and text editing — it resolves no anchors and computes no hashes; it only produces the structure [`lint`](./cli-lint.md)/[`check`](./cli-check.md) act on. diff --git a/surf-cli/src/check.rs b/surf-cli/src/check.rs index f4735f0..6ec73dc 100644 --- a/surf-cli/src/check.rs +++ b/surf-cli/src/check.rs @@ -73,6 +73,11 @@ fn check_workspace( let mut stale: HashMap<&str, Vec>> = HashMap::new(); for loaded_hub in &loaded { + // OKF reserved files (index.md/log.md) hold no claims — never governed, and a missing + // frontmatter fence must not fail the gate closed the way a malformed *concept* does. 
+ if loaded_hub.kind != surf_core::DocKind::Concept { + continue; + } let rel = loaded_hub.rel.as_str(); let hub = match &loaded_hub.hub { Ok(hub) => hub, @@ -119,6 +124,9 @@ fn propagate_refs( ) -> Vec { let mut out = Vec::new(); for loaded_hub in loaded { + if loaded_hub.kind != surf_core::DocKind::Concept { + continue; + } let Ok(hub) = &loaded_hub.hub else { continue }; for raw in &hub.frontmatter.refs { let Ok(parsed) = parse_ref(raw) else { continue }; @@ -151,6 +159,7 @@ fn propagate_refs( claim: String::new(), at: raw.clone(), kind: DivergenceKind::ReferencedStale, + id: None, old_hash: None, new_hash: None, old_code: None, @@ -170,6 +179,7 @@ fn malformed_hub_divergence(hub: &str, err: &HubError) -> Divergence { claim: String::new(), at: String::new(), kind: DivergenceKind::Unresolvable, + id: None, old_hash: None, new_hash: None, old_code: None, @@ -276,6 +286,7 @@ fn check_claim(ws: &Workspace, hub: &str, claim: &surf_core::Claim, base: &str) claim: prose.clone(), at: at_display.clone(), kind, + id: claim.id.clone(), old_hash, new_hash, old_code, @@ -627,6 +638,61 @@ mod tests { ); } + #[test] + fn okf_reserved_index_file_does_not_block() { + // An OKF `index.md` (directory listing, no frontmatter fence) is swept up by the + // `hubs/*.md` glob but must never fail the gate closed the way a malformed concept does. + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + write(root, "surf.toml", ""); + write( + root, + "hubs/index.md", + "# Sales\n\n- [orders](./orders.md)\n", + ); + assert!(check_workspace(&ws_at(root.to_path_buf()), None, &[]) + .unwrap() + .0 + .is_empty()); + } + + #[test] + fn anchor_less_okf_concept_passes() { + // A concept carrying only OKF fields (no code anchors) is valid and ungoverned — it + // contributes zero divergences. 
+ let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + write(root, "surf.toml", ""); + write( + root, + "hubs/orders.md", + "---\ntype: BigQuery Table\ndescription: one row per order\ntags: [sales]\n---\n# Orders\n", + ); + assert!(check_workspace(&ws_at(root.to_path_buf()), None, &[]) + .unwrap() + .0 + .is_empty()); + } + + #[test] + fn unknown_frontmatter_key_no_longer_blocks_check() { + // Relaxing deny_unknown_fields for OKF interop means a typo'd key (`anchros:`) is now kept + // in `extra` instead of hard-blocking the gate — the lost signal is recovered by `surf lint`. + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + write(root, "surf.toml", ""); + write( + root, + "hubs/a.md", + "---\nsummary: x\nanchros:\n - claim: c\n at: src/m.rs > add\n---\n", + ); + // Parses (typo'd `anchros` is preserved, real `anchors` is empty) → no divergence. + assert!(check_workspace(&ws_at(root.to_path_buf()), None, &[]) + .unwrap() + .0 + .is_empty()); + } + #[test] fn malformed_hub_blocks_check() { // A frontmatter typo must fail the gate, not pass silently (#35). diff --git a/surf-cli/src/git.rs b/surf-cli/src/git.rs index 9ec73cf..3294734 100644 --- a/surf-cli/src/git.rs +++ b/surf-cli/src/git.rs @@ -130,6 +130,21 @@ pub fn show(root: &Path, base: &str, rel_file: &str) -> Option { .then(|| String::from_utf8_lossy(&output.stdout).into_owned()) } +/// The current commit's full SHA (`git rev-parse HEAD`), for recording verify-event provenance. +/// `None` outside a repo or before the first commit. +pub fn head_sha(root: &Path) -> Option { + let output = Command::new("git") + .current_dir(root) + .args(["rev-parse", "HEAD"]) + .output() + .ok()?; + output + .status + .success() + .then(|| String::from_utf8_lossy(&output.stdout).trim().to_string()) + .filter(|s| !s.is_empty()) +} + /// The path `old_path` was renamed to, per git's rename detection between HEAD and the working /// tree. 
`None` if git can't answer or no rename pairs with `old_path`. Best-effort: a pure /// `mv` without a content match may show as delete+add and not be detected. diff --git a/surf-cli/src/lint.rs b/surf-cli/src/lint.rs index 5571f1b..4c4b4b5 100644 --- a/surf-cli/src/lint.rs +++ b/surf-cli/src/lint.rs @@ -109,12 +109,18 @@ fn lint_workspace(ws: &Workspace) -> Result> { // hubs by rel. A malformed hub is absent from the index (it gets its own block below), so a // ref into it reads as "does not resolve to a hub" — which it effectively doesn't. let loaded = ws.iter_hubs()?; + // Reserved OKF files (index.md/log.md) are not concepts: they never carry claims and a `ref` + // can't target them, so keep them out of the concept index and skip them below. let hub_index: HashMap<&str, &surf_core::Hub> = loaded .iter() + .filter(|l| l.kind == surf_core::DocKind::Concept) .filter_map(|l| l.hub.as_ref().ok().map(|h| (l.rel.as_str(), h))) .collect(); for loaded_hub in &loaded { + if loaded_hub.kind != surf_core::DocKind::Concept { + continue; + } let rel = loaded_hub.rel.as_str(); let hub = match &loaded_hub.hub { Ok(hub) => hub, @@ -165,6 +171,8 @@ fn lint_workspace(ws: &Workspace) -> Result> { lint_refs(rel, hub, &hub_index, &mut findings); lint_claim_log(rel, hub, &mut findings); lint_thin_prose(rel, hub, &mut findings); + lint_okf_frontmatter(rel, hub, &mut findings); + lint_okf_links(ws, rel, hub, &mut findings); if hub.frontmatter.anchors.len() > MAX_ANCHORS_PER_HUB { findings.push(Finding { @@ -355,6 +363,42 @@ fn lint_refs( } } +/// OKF cross-links are plain markdown links between concepts. OKF **tolerates** broken links (they +/// may be not-yet-written knowledge), so this only ever *warns* — it never blocks. Checks local +/// `.md` links (bundle-relative `/x.md` resolved from the workspace root, or relative `./x.md` from +/// the hub's directory), skipping URLs, bare `#anchors`, and non-markdown assets. 
Best-effort: for a +/// bundle mounted in a subdirectory, an absolute `/x.md` may resolve against the wrong root, so a +/// spurious warning is possible — never a block. +fn lint_okf_links(ws: &Workspace, rel: &str, hub: &surf_core::Hub, findings: &mut Vec) { + for raw in link_targets(&hub.body) { + let path = raw.split('#').next().unwrap_or(raw).trim(); + if path.is_empty() + || path.contains("://") + || path.starts_with("mailto:") + || path.starts_with("//") + || !path.ends_with(".md") + { + continue; + } + let target = match path.strip_prefix('/') { + Some(abs) => abs.to_string(), + None => crate::workspace::resolve_ref_path(rel, path), + }; + if target.is_empty() || ws.root.join(&target).is_file() { + continue; + } + findings.push(Finding { + severity: Severity::Warn, + hub: rel.to_string(), + claim: String::new(), + at: raw.to_string(), + message: format!( + "OKF cross-link `{raw}` points at `{target}`, which doesn't exist — fine if it's not-yet-written, else check for a typo (advisory; OKF tolerates broken links)" + ), + }); + } +} + /// Markdown link targets (`](target)`) in a fragment of text. fn link_targets(text: &str) -> impl Iterator { text.split("](") @@ -556,6 +600,89 @@ fn lint_thin_prose(rel: &str, hub: &surf_core::Hub, findings: &mut Vec) } } +/// OKF/round-trip advisories on a concept's frontmatter. Relaxing `deny_unknown_fields` (so OKF's +/// "consumers MUST preserve unknown keys" rule holds) means a typo'd top-level key no longer +/// hard-blocks — recover that signal as a warning. Also nudge an anchored hub with no +/// human-readable headline: a hub is an onboarding doc, so a reader needs something to orient on. 
+fn lint_okf_frontmatter(rel: &str, hub: &surf_core::Hub, findings: &mut Vec) { + const KNOWN_KEYS: [&str; 8] = [ + "anchors", + "refs", + "covers", + "summary", + "title", + "tags", + "timestamp", + "type", + ]; + for (k, _) in &hub.frontmatter.extra { + let Some(key) = k.as_str() else { continue }; + if let Some(hit) = KNOWN_KEYS + .iter() + .find(|known| within_edit_distance_1(key, known)) + { + findings.push(Finding { + severity: Severity::Warn, + hub: rel.to_string(), + claim: String::new(), + at: String::new(), + message: format!( + "unknown frontmatter key `{key}` — did you mean `{hit}`? (unknown keys are preserved for OKF interop, so a typo no longer hard-blocks the gate)" + ), + }); + } + } + + if !hub.frontmatter.anchors.is_empty() + && hub.frontmatter.summary.is_none() + && hub.frontmatter.title.is_none() + && !hub.frontmatter.extra.contains_key("description") + { + findings.push(Finding { + severity: Severity::Warn, + hub: rel.to_string(), + claim: String::new(), + at: String::new(), + message: + "anchored hub has no headline (`summary`, `title`, or `description`) — a hub is an onboarding doc; give readers something to orient on" + .to_string(), + }); + } +} + +/// True when `a` and `b` are one edit apart — a single insert, delete, substitution, or adjacent +/// transposition. Transposition matters: `anchros` → `anchors` is the classic fat-fingered key and +/// is only distance 1 once swaps count. Identical strings return `false` (nothing to warn about). +fn within_edit_distance_1(a: &str, b: &str) -> bool { + a != b && osa_distance(a, b) == 1 +} + +/// Optimal string alignment distance (Levenshtein plus adjacent transposition). Frontmatter keys +/// are short, so the full DP is negligible. 
+fn osa_distance(a: &str, b: &str) -> usize { + let a: Vec<char> = a.chars().collect(); + let b: Vec<char> = b.chars().collect(); + let (n, m) = (a.len(), b.len()); + let mut d = vec![vec![0usize; m + 1]; n + 1]; + for (i, row) in d.iter_mut().enumerate() { + row[0] = i; + } + d[0] = (0..=m).collect(); + for i in 1..=n { + for j in 1..=m { + let cost = usize::from(a[i - 1] != b[j - 1]); + let mut best = (d[i - 1][j] + 1) + .min(d[i][j - 1] + 1) + .min(d[i - 1][j - 1] + cost); + if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1] { + best = best.min(d[i - 2][j - 2] + 1); + } + d[i][j] = best; + } + } + d[n][m] +} + /// Words of readable body prose, excluding fenced code blocks (``` … ```), which carry no /// onboarding prose and would otherwise inflate the count. fn prose_words(body: &str) -> usize { @@ -1076,6 +1203,115 @@ mod tests { assert_eq!(block.severity, Severity::Block); } + #[test] + fn typo_frontmatter_key_warns_not_blocks() { + // `anchros` is one edit from `anchors` → a warning that recovers the fail-closed signal + // relaxing deny_unknown_fields gave up. Never a block (OKF preserves unknown keys). + let (_t, ws) = ws_with(&[( + "hubs/a.md", + "---\nsummary: x\nanchros:\n - claim: c\n at: src/m.rs > add\n---\n", + )]); + let f = lint_workspace(&ws).unwrap(); + let warn = f + .iter() + .find(|x| x.message.contains("did you mean `anchors`")) + .expect("expected a typo warning"); + assert_eq!(warn.severity, Severity::Warn); + assert!(!f.iter().any(|x| x.severity == Severity::Block)); + } + + #[test] + fn unrelated_okf_key_is_not_flagged_as_typo() { + // A legitimate OKF/doc-system key (well clear of any known key) must not warn.
+ let (_t, ws) = ws_with(&[ + ("src/m.rs", "pub fn a() {}\n"), + ( + "hubs/a.md", + "---\ntype: Runbook\ndescription: how to deploy\nauthor: rachel\nanchors:\n - claim: a\n at: src/m.rs > a\n---\n", + ), + ]); + let f = lint_workspace(&ws).unwrap(); + assert!( + !f.iter().any(|x| x.message.contains("did you mean")), + "no typo warning expected: {f:?}" + ); + } + + #[test] + fn anchored_hub_without_headline_warns() { + // An anchored hub with no summary/title/description reads as a claim dump, not an + // onboarding doc. + let (_t, ws) = ws_with(&[ + ("src/m.rs", "pub fn a() {}\n"), + ( + "hubs/a.md", + "---\ntype: concept\nanchors:\n - claim: a does a\n at: src/m.rs > a\n---\n", + ), + ]); + let f = lint_workspace(&ws).unwrap(); + let warn = f + .iter() + .find(|x| x.message.contains("no headline")) + .expect("expected a headline warning"); + assert_eq!(warn.severity, Severity::Warn); + } + + #[test] + fn reserved_index_file_is_not_linted() { + // A plain OKF index.md (no frontmatter) must not produce a block from lint. 
+ let (_t, ws) = ws_with(&[("hubs/index.md", "# Sales\n\n- [orders](./orders.md)\n")]); + let f = lint_workspace(&ws).unwrap(); + assert!(f.is_empty(), "reserved file should not be linted: {f:?}"); + } + + #[test] + fn edit_distance_1_matches_only_close_keys() { + assert!(within_edit_distance_1("anchros", "anchors")); // adjacent transposition + assert!(within_edit_distance_1("anchor", "anchors")); // one deletion + assert!(within_edit_distance_1("tag", "tags")); // one insertion + assert!(within_edit_distance_1("titel", "title")); // el↔le transposition + assert!(!within_edit_distance_1("resource", "anchors")); // unrelated + assert!(!within_edit_distance_1("anchors", "anchors")); // identical → no warning + assert!(!within_edit_distance_1("tg", "tags")); // two edits away + } + + #[test] + fn okf_dangling_cross_link_warns_never_blocks() { + // A body link to a non-existent concept warns (advisory) but never blocks — OKF tolerates + // broken links. + let (_t, ws) = ws_with(&[( + "hubs/orders.md", + "---\ntype: BigQuery Table\ndescription: orders\n---\n# Orders\n\nJoined with [customers](./customers.md).\n", + )]); + let f = lint_workspace(&ws).unwrap(); + let warn = f + .iter() + .find(|x| x.message.contains("OKF cross-link")) + .expect("expected a dangling-link warning"); + assert_eq!(warn.severity, Severity::Warn); + assert!(!f.iter().any(|x| x.severity == Severity::Block)); + } + + #[test] + fn okf_resolvable_cross_link_is_silent() { + // The link target exists → no warning. URLs and anchors are ignored too. 
+ let (_t, ws) = ws_with(&[ + ( + "hubs/orders.md", + "---\ntype: BigQuery Table\ndescription: orders\n---\n# Orders\n\nSee [customers](./customers.md), the [docs](https://x.io/a.md), and [top](#orders).\n", + ), + ( + "hubs/customers.md", + "---\ntype: BigQuery Table\ndescription: customers\n---\n# Customers\n", + ), + ]); + let f = lint_workspace(&ws).unwrap(); + assert!( + !f.iter().any(|x| x.message.contains("OKF cross-link")), + "no dangling-link warning expected: {f:?}" + ); + } + fn agents_findings(ws: &Workspace) -> Vec { lint_workspace(ws) .unwrap() diff --git a/surf-cli/src/new.rs b/surf-cli/src/new.rs index 1dbfb12..b773b0e 100644 --- a/surf-cli/src/new.rs +++ b/surf-cli/src/new.rs @@ -37,6 +37,8 @@ fn template(name: &str) -> String { // Explicit `\n` (not `\`-continuation, which would strip the example's indentation). let mut s = String::new(); s.push_str("---\n"); + // `type` is OKF's one required field, so a scaffolded hub is a conformant OKF concept on disk. + s.push_str("type: concept\n"); s.push_str("summary: TODO one-line summary of this domain.\n"); s.push_str("anchors: []\n"); s.push_str("refs: []\n"); diff --git a/surf-cli/src/verify.rs b/surf-cli/src/verify.rs index cf22dc0..b14a0ff 100644 --- a/surf-cli/src/verify.rs +++ b/surf-cli/src/verify.rs @@ -12,7 +12,8 @@ use serde::Serialize; use std::process::ExitCode; use surf_core::{ combine_site_hashes, find_renamed, format_stamp, hash_anchor_raw, hash_anchor_with, - parse_anchor, parse_hub, parse_stamp, set_anchor_at, set_anchor_hash, HashOpts, Recipe, + parse_anchor, parse_hub, parse_stamp, set_anchor_at, set_anchor_field, set_anchor_hash, Claim, + HashOpts, Recipe, }; enum Plan { @@ -88,6 +89,8 @@ fn print_human(report: &VerifyReport) { fn verify_all(ws: &Workspace, target: Option<&str>, follow: bool) -> Result { let mut report = VerifyReport::default(); let mut matched_any = false; + // Monotonic per-run counter so two claims stamped in the same instant get distinct ids. 
+ let mut id_seed = 0u64; for hub_path in ws.hub_paths()? { let rel = hub_path @@ -113,7 +116,9 @@ fn verify_all(ws: &Workspace, target: Option<&str>, follow: bool) -> Result match set_anchor_hash(&text, idx, &new_hash) { + Plan::Hash(new_hash) => match set_anchor_hash(&text, idx, &new_hash) + .and_then(|t| stamp_provenance(&t, idx, claim, ws, &mut id_seed)) + { Some(updated) => { text = updated; report.stamped += 1; @@ -129,6 +134,7 @@ fn verify_all(ws: &Workspace, target: Option<&str>, follow: bool) -> Result { match set_anchor_at(&text, idx, &new_at) .and_then(|t| set_anchor_hash(&t, idx, &new_hash)) + .and_then(|t| stamp_provenance(&t, idx, claim, ws, &mut id_seed)) { Some(updated) => { text = updated; @@ -297,6 +303,69 @@ fn site_hash(ws: &Workspace, site: &str, opts: HashOpts) -> std::result::Result< hash_anchor_raw(&source, lang, &anchor, opts, Recipe::CURRENT).map_err(|e| e.to_string()) } +/// Record the freshness provenance OKF omits — *only* when the hash actually changed (this runs on +/// `Plan::Hash`/`Plan::Follow`, never on `Plan::Unchanged`), so a no-op re-verify stays +/// byte-identical. Assigns a stable `id` the first time a claim is stamped and never regenerates it, +/// so the identity survives later prose/anchor edits. `verified_commit` is best-effort (omitted when +/// git can't answer); the *who* is deliberately not recorded — git blame on the hub has it, and +/// keeping author emails out of tracked files matters for public repos. +fn stamp_provenance( + text: &str, + idx: usize, + claim: &Claim, + ws: &Workspace, + id_seed: &mut u64, +) -> Option { + let mut out = text.to_string(); + if claim.id.is_none() { + out = set_anchor_field(&out, idx, "id", &new_claim_id(id_seed))?; + } + out = set_anchor_field(&out, idx, "verified_at", &iso8601_utc_now())?; + if let Some(sha) = git::head_sha(&ws.root) { + out = set_anchor_field(&out, idx, "verified_commit", &sha)?; + } + Some(out) +} + +/// A stable, opaque, content-independent claim id. 
Written once and never regenerated, so a claim's +/// identity is decoupled from its prose and anchor — the substrate for claim timelines. +fn new_claim_id(seed: &mut u64) -> String { + *seed = seed.wrapping_add(1); + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_nanos()) + .unwrap_or(0); + format!("c_{nanos:x}{:04x}", *seed) +} + +/// Current UTC time as an ISO-8601 second-precision string (`2026-07-01T14:30:00Z`), matching OKF's +/// `timestamp` format. Hand-rolled to avoid a date dependency; the tool is Unix-only. +fn iso8601_utc_now() -> String { + let secs = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + let (days, rem) = ((secs / 86_400) as i64, secs % 86_400); + let (h, m, s) = (rem / 3600, (rem % 3600) / 60, rem % 60); + let (y, mo, d) = civil_from_days(days); + format!("{y:04}-{mo:02}-{d:02}T{h:02}:{m:02}:{s:02}Z") +} + +/// Howard Hinnant's days-from-civil inverse: days since 1970-01-01 → (year, month, day), proleptic +/// Gregorian. +fn civil_from_days(z: i64) -> (i64, u32, u32) { + let z = z + 719_468; + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = (z - era * 146_097) as u64; + let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = (doy - (153 * mp + 2) / 5 + 1) as u32; + let m = if mp < 10 { mp + 3 } else { mp - 9 } as u32; + (if m <= 2 { y + 1 } else { y }, m, d) +} + #[cfg(test)] mod tests { use super::*; @@ -354,6 +423,99 @@ mod tests { assert_eq!(fs::read_to_string(root.join("hubs/a.md")).unwrap(), after); } + #[test] + fn verify_stamps_id_and_provenance() { + // The freshness OKF omits: a first verify records a stable id plus when/which-commit. The + // *who* is deliberately not stored (git blame has it) — no author email in tracked files. 
+ let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + write( + root, + "src/m.rs", + "pub fn add(a: i64, b: i64) -> i64 { a + b }\n", + ); + write(root, "surf.toml", ""); + write( + root, + "hubs/a.md", + "---\nsummary: s\nanchors:\n - claim: add sums\n at: src/m.rs > add\n---\n", + ); + let git = |args: &[&str]| { + assert!( + std::process::Command::new("git") + .current_dir(root) + .args(args) + .status() + .unwrap() + .success(), + "git {args:?} failed" + ); + }; + git(&["init", "-q"]); + git(&["config", "user.email", "dev@nansen.ai"]); + git(&["config", "user.name", "Dev"]); + git(&["add", "."]); + git(&["commit", "-q", "-m", "v1"]); + + let ws = Workspace::discover(root).unwrap(); + let report = verify_all(&ws, None, false).unwrap(); + assert_eq!(report.stamped, 1); + + let hub = parse_hub(&fs::read_to_string(root.join("hubs/a.md")).unwrap()).unwrap(); + let c = &hub.frontmatter.anchors[0]; + assert!(c.hash.is_some()); + assert!(c.id.as_deref().unwrap().starts_with("c_"), "id: {:?}", c.id); + let at = c.verified_at.as_deref().unwrap(); + assert!(at.ends_with('Z') && at.contains('T'), "verified_at: {at}"); + assert!(c.verified_commit.is_some()); + } + + #[test] + fn claim_id_is_stable_across_reverify() { + // The id is written once and never regenerated: a later re-stamp (code changed) keeps it, + // so claim timelines stitch across the change. 
+ let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + write(root, "surf.toml", ""); + write( + root, + "src/m.rs", + "pub fn add(a: i64, b: i64) -> i64 { a + b }\n", + ); + write( + root, + "hubs/a.md", + "---\nsummary: s\nanchors:\n - claim: add sums\n at: src/m.rs > add\n---\n", + ); + let ws = Workspace::discover(root).unwrap(); + + verify_all(&ws, None, false).unwrap(); + let id1 = parse_hub(&fs::read_to_string(root.join("hubs/a.md")).unwrap()) + .unwrap() + .frontmatter + .anchors[0] + .id + .clone() + .unwrap(); + + // Change the anchored logic so the next verify re-stamps (Plan::Hash). + write( + root, + "src/m.rs", + "pub fn add(a: i64, b: i64) -> i64 { a - b }\n", + ); + let report = verify_all(&ws, None, false).unwrap(); + assert_eq!(report.stamped, 1, "changed code re-stamps"); + let id2 = parse_hub(&fs::read_to_string(root.join("hubs/a.md")).unwrap()) + .unwrap() + .frontmatter + .anchors[0] + .id + .clone() + .unwrap(); + assert_eq!(id1, id2, "id must survive a re-stamp"); + } + #[test] fn verify_upgrades_v1_stamp_to_v2() { // A repo stamped under the old recipe: one `surf verify` re-stamps the still-matching v1 diff --git a/surf-cli/src/workspace.rs b/surf-cli/src/workspace.rs index 0c1b5bc..f8cb304 100644 --- a/surf-cli/src/workspace.rs +++ b/surf-cli/src/workspace.rs @@ -5,7 +5,7 @@ use anyhow::{Context, Result}; use std::path::{Component, Path, PathBuf}; use surf_core::config::{parse_config, Config, CONFIG_FILE}; -use surf_core::{parse_anchor, parse_hub, Anchor, Hub, HubError, Lang}; +use surf_core::{doc_kind, parse_anchor, parse_hub, Anchor, DocKind, Hub, HubError, Lang}; pub struct Workspace { pub root: PathBuf, @@ -14,9 +14,12 @@ pub struct Workspace { /// One hub file located, read, and parsed. `hub` carries the parse result per-hub so each /// command must consciously decide what to do with a malformed hub (block, skip, warn) -/// rather than re-implementing — and diverging on — that choice. 
+/// rather than re-implementing — and diverging on — that choice. `kind` marks OKF reserved +/// files (`index.md`/`log.md`), which hold no claims and must never block the gate when they +/// lack frontmatter. pub struct LoadedHub { pub rel: String, + pub kind: DocKind, pub hub: Result, } @@ -42,20 +45,35 @@ impl Workspace { pub fn hub_paths(&self) -> Result> { let mut out = Vec::new(); + // Flat hub globs, verbatim. for pattern in &self.config.hubs { - let joined = self.root.join(pattern); + self.glob_into(pattern, &mut out)?; + } + // OKF bundle roots: each is a directory tree, so match every `.md` beneath it. Reserved + // files (index.md/log.md) are swept up here and classified/skipped downstream. + for root in &self.config.bundles { + let joined = PathBuf::from(root).join("**/*.md"); let pattern = joined .to_str() - .with_context(|| format!("hub glob is not valid UTF-8: {}", joined.display()))?; - for entry in glob::glob(pattern).context("invalid hub glob pattern")? { - out.push(entry?); - } + .with_context(|| format!("bundle root is not valid UTF-8: {root}"))?; + self.glob_into(pattern, &mut out)?; } out.sort(); out.dedup(); Ok(out) } + fn glob_into(&self, pattern: &str, out: &mut Vec) -> Result<()> { + let joined = self.root.join(pattern); + let pattern = joined + .to_str() + .with_context(|| format!("hub glob is not valid UTF-8: {}", joined.display()))?; + for entry in glob::glob(pattern).context("invalid hub glob pattern")? { + out.push(entry?); + } + Ok(()) + } + /// Read and parse every hub. I/O failure hard-errors the run (an unreadable hub is /// exceptional); a *parse* failure is carried per-hub in `LoadedHub::hub` so each /// caller handles it explicitly. 
@@ -70,7 +88,8 @@ impl Workspace { let content = std::fs::read_to_string(&path) .with_context(|| format!("reading {}", path.display()))?; let hub = parse_hub(&content); - out.push(LoadedHub { rel, hub }); + let kind = doc_kind(&rel); + out.push(LoadedHub { rel, kind, hub }); } Ok(out) } @@ -177,4 +196,35 @@ mod tests { .collect(); assert_eq!(names, vec!["auth.md", "billing.md"]); } + + #[test] + fn bundle_root_discovers_nested_concepts_and_reserved_files() { + // An OKF bundle root is a directory tree: every `.md` beneath it is discovered (concepts and + // reserved index.md/log.md alike), classified by `LoadedHub::kind`. + let tmp = tempfile::tempdir().unwrap(); + let root = tmp.path(); + fs::write(root.join(CONFIG_FILE), "hubs = []\nbundles = [\"sales\"]\n").unwrap(); + fs::create_dir_all(root.join("sales/tables")).unwrap(); + fs::write(root.join("sales/index.md"), "# Sales\n").unwrap(); + fs::write( + root.join("sales/tables/orders.md"), + "---\ntype: BigQuery Table\ndescription: orders\n---\n# Orders\n", + ) + .unwrap(); + fs::write(root.join("sales/tables/log.md"), "# Log\n").unwrap(); + + let ws = Workspace::discover(root).unwrap(); + let loaded = ws.iter_hubs().unwrap(); + let index = loaded + .iter() + .find(|l| l.rel.ends_with("index.md")) + .expect("index.md discovered"); + assert_eq!(index.kind, DocKind::Index); + let orders = loaded + .iter() + .find(|l| l.rel.ends_with("orders.md")) + .expect("concept discovered"); + assert_eq!(orders.kind, DocKind::Concept); + assert!(loaded.iter().any(|l| l.kind == DocKind::Log)); + } } diff --git a/surf-core/src/config.rs b/surf-core/src/config.rs index 27d19a0..a464532 100644 --- a/surf-core/src/config.rs +++ b/surf-core/src/config.rs @@ -9,9 +9,15 @@ pub const CONFIG_FILE: &str = "surf.toml"; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct Config { - /// Globs (relative to the config's directory) that enumerate hub documents. 
+ /// Globs (relative to the config's directory) that enumerate hub documents. Best for the flat + /// `hubs/*.md` layout. #[serde(default = "default_hubs")] pub hubs: Vec<String>, + /// OKF bundle roots (relative to the config's directory): each is a directory *tree* of concept + /// files, expanded as `<root>/**/*.md` with OKF reserved files (`index.md`/`log.md`) handled. + /// Empty by default; add roots to govern an OKF bundle in place of, or alongside, `hubs`. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub bundles: Vec<String>, } fn default_hubs() -> Vec<String> { @@ -22,6 +28,7 @@ impl Default for Config { fn default() -> Self { Config { hubs: default_hubs(), + bundles: Vec::new(), } } } @@ -70,4 +77,14 @@ mod tests { let err = parse_config("nonsense = true").unwrap_err(); assert!(matches!(err, ConfigError::Toml(_))); } + + #[test] + fn bundles_default_empty_and_parse() { + assert!(parse_config("").unwrap().bundles.is_empty()); + let cfg = parse_config("bundles = [\"sales\", \"docs/okf\"]").unwrap(); + assert_eq!( + cfg.bundles, + vec!["sales".to_string(), "docs/okf".to_string()] + ); + } } diff --git a/surf-core/src/hub.rs b/surf-core/src/hub.rs index 956e42b..992b5c8 100644 --- a/surf-core/src/hub.rs +++ b/surf-core/src/hub.rs @@ -4,37 +4,82 @@ use serde::{Deserialize, Serialize}; -#[derive(Debug, Clone, PartialEq, Eq)] +// `Frontmatter`/`Hub` can't derive `Eq`: `extra` holds `serde_yaml::Value`s (floats aren't `Eq`). +#[derive(Debug, Clone, PartialEq)] pub struct Hub { pub frontmatter: Frontmatter, pub body: String, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] +/// A hub's frontmatter is a **superset of an OKF concept**: the OKF-recognized fields (`type`, +/// `title`, `description`, `tags`, `timestamp`, …) sit alongside Surface's own governance fields +/// (`anchors`, `refs`, `covers`).
Per the OKF contract, unknown keys are **preserved, not +/// rejected** — `extra` captures every field Surface doesn't name (OKF's `description`/`resource`, +/// a doc system's `author`/`created`/`pinned`, future OKF keys) so they round-trip verbatim. +/// `deny_unknown_fields` is therefore off here; a typo'd key is surfaced by a `surf lint` warning +/// (see `lint`) instead of a hard parse error. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Frontmatter { - pub summary: String, + /// OKF's one required field. Defaulted to `concept` so pre-OKF hubs (which never wrote it) still + /// parse; the default is not serialized, so an untouched hub stays byte-identical. + #[serde( + rename = "type", + default = "default_type", + skip_serializing_if = "is_default_type" + )] + pub concept_type: String, + /// OKF display name; consumers may fall back to the filename when absent. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub title: Option, + /// Surface's onboarding one-liner. Optional now (an OKF concept may carry only `description`). + /// Kept distinct from OKF `description` (which lives in `extra`) so existing hubs are untouched. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub summary: Option, + /// OKF cross-cutting tags. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub tags: Vec, + /// OKF last-*modified* timestamp (ISO 8601). Distinct from per-claim `verified_at` (last + /// *attested*): Surface reads this but does not manage it. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub timestamp: Option, #[serde(default)] pub anchors: Vec, - /// Hub composition. Forward-declared per §9.3 — parsed but inert in the MVP. - #[serde(default)] + /// Hub composition (§9.3, #4): typed staleness edges, distinct from OKF's untyped body links. 
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] pub refs: Vec, - /// Advisory coverage scope: repo-root-relative globs (same dialect as `config.hubs`) - /// naming the files this hub claims a relationship with. Forward-declared per §9.1 — - /// parsed, stored, and lint-validated, but **the verdict never reads it** (§5/§8). The - /// louder coverage pass that consumes these globs is deferred to #5. + /// Advisory coverage scope: repo-root-relative globs (same dialect as `config.hubs`). Parsed, + /// stored, and lint-validated, but **the verdict never reads it** (§5/§8, #5). #[serde(default, skip_serializing_if = "Vec::is_empty")] pub covers: Vec, + /// Every frontmatter key Surface does not name, preserved verbatim — the OKF "consumers MUST + /// preserve unknown keys" rule made structural. + #[serde(flatten, default)] + pub extra: serde_yaml::Mapping, } +// `Claim` stays strict: the `anchors:` items are Surface's own structured data (OKF knows nothing +// of them), so `deny_unknown_fields` keeps catching per-anchor typos (`hahs:`) as a hard error. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct Claim { + /// Stable identity, independent of prose/anchor text — the substrate of claim timelines and + /// attestation history. Written once by `surf verify` when absent, then **never regenerated**: + /// a prose or anchor edit keeps the same `id`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub id: Option, pub claim: String, pub at: At, /// The stored AST-canonical hash. `None` until `surf verify` first stamps it. #[serde(skip_serializing_if = "Option::is_none")] pub hash: Option, + /// The freshness OKF omits: when this claim was last *attested* against its code (ISO 8601) and + /// at which commit. Written by `surf verify` **only when the hash actually changes**, so a no-op + /// re-verify stays byte-identical. 
The *who* is intentionally not stored (git blame on the hub + /// records it) — keeping author emails out of tracked files. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub verified_at: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub verified_commit: Option, /// Opt-in: exclude string-literal *content* from this claim's hash, so a copy edit inside /// the anchored span doesn't re-open the gate (§6.1). The stored hash is computed in this /// mode, so it must travel with the claim. Defaults to `false`. @@ -42,6 +87,33 @@ pub struct Claim { pub ignore_literals: bool, } +fn default_type() -> String { + "concept".to_string() +} + +fn is_default_type(t: &str) -> bool { + t == "concept" +} + +/// OKF reserves two filenames that are structure, not concepts: `index.md` (a directory listing for +/// progressive disclosure) and `log.md` (a change history). They hold no claims, so Surface never +/// governs them and never blocks the gate when they lack frontmatter. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DocKind { + Concept, + Index, + Log, +} + +/// Classify a workspace-relative path by its basename (OKF reserved filenames). +pub fn doc_kind(rel: &str) -> DocKind { + match rel.rsplit(['/', '\\']).next().unwrap_or(rel) { + "index.md" => DocKind::Index, + "log.md" => DocKind::Log, + _ => DocKind::Concept, + } +} + /// One anchor (`at:`) is either a single span or a list; the claim is stale if *any* /// listed span changes (§6.3). #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -121,14 +193,26 @@ pub fn parse_hub(content: &str) -> Result { // scalars). These operate on the full hub text, locate the Nth `anchors:` item, and touch // exactly one line. `anchor_index` matches the parse order of `Frontmatter::anchors`. -/// Set (or insert) the `hash:` of the anchor at `anchor_index`. Returns the new file text, -/// or `None` if the frontmatter structure or index can't be located. 
-pub fn set_anchor_hash(file_text: &str, anchor_index: usize, new_hash: &str) -> Option { +/// Set (or insert) an arbitrary `key: value` line within the anchor item at `anchor_index`, +/// touching exactly that one line so a human review sees a minimal diff. Returns the new file +/// text, or `None` if the frontmatter structure or index can't be located. `set_anchor_hash` and +/// `surf verify`'s provenance stamping (`id`, `verified_*`) are thin wrappers over this. +pub fn set_anchor_field( + file_text: &str, + anchor_index: usize, + key: &str, + value: &str, +) -> Option { edit_anchor(file_text, anchor_index, |lines, item| { - set_key(lines, item, "hash", new_hash) + set_key(lines, item, key, value) }) } +/// Set (or insert) the `hash:` of the anchor at `anchor_index`. +pub fn set_anchor_hash(file_text: &str, anchor_index: usize, new_hash: &str) -> Option { + set_anchor_field(file_text, anchor_index, "hash", new_hash) +} + /// Rewrite a scalar `at:` of the anchor at `anchor_index` (used by `--follow`). Returns /// `None` if the structure can't be located or the `at:` is a list (not auto-followable). pub fn set_anchor_at(file_text: &str, anchor_index: usize, new_at: &str) -> Option { @@ -247,7 +331,10 @@ mod tests { #[test] fn parses_scalar_and_list_at() { let hub = parse_hub(VALID).unwrap(); - assert_eq!(hub.frontmatter.summary, "how auth refresh works"); + assert_eq!( + hub.frontmatter.summary.as_deref(), + Some("how auth refresh works") + ); assert_eq!(hub.frontmatter.anchors.len(), 2); let first = &hub.frontmatter.anchors[0]; @@ -285,14 +372,67 @@ mod tests { } #[test] - fn missing_required_field_is_yaml_error() { - let err = parse_hub("---\nanchors: []\n---\nbody\n").unwrap_err(); + fn okf_concept_without_summary_parses_as_pass_through() { + // OKF requires only `type` (defaulted). A concept with just OKF fields and no `anchors` + // parses fine — it is valid and simply ungoverned (nothing to hash). 
+ let hub = + parse_hub("---\ntype: BigQuery Table\ndescription: one row per order\n---\nbody\n") + .unwrap(); + assert_eq!(hub.frontmatter.concept_type, "BigQuery Table"); + assert!(hub.frontmatter.summary.is_none()); + assert!(hub.frontmatter.anchors.is_empty()); + // `description` is an OKF field Surface doesn't name — captured in `extra`, not dropped. + assert!(hub.frontmatter.extra.contains_key("description")); + } + + #[test] + fn default_type_is_concept_and_not_serialized() { + // A hub that omits `type` parses with the `concept` default, and serializing it back does + // not introduce a `type:` key — existing hubs stay byte-unaffected. + let hub = parse_hub("---\nsummary: x\n---\nbody\n").unwrap(); + assert_eq!(hub.frontmatter.concept_type, "concept"); + let yaml = serde_yaml::to_string(&hub.frontmatter).unwrap(); + assert!( + !yaml.contains("type"), + "default type should not serialize: {yaml}" + ); + } + + #[test] + fn unknown_frontmatter_keys_are_preserved_on_round_trip() { + // OKF: consumers MUST preserve unknown keys. A doc authored in a doc system (Nansidian's + // author/created/pinned) round-trips with zero key loss. + let src = "---\ntype: Runbook\ntitle: Deploy\nauthor: rachel\ncreated: 2026-06-01\npinned: true\n---\nbody\n"; + let hub = parse_hub(src).unwrap(); + assert_eq!(hub.frontmatter.title.as_deref(), Some("Deploy")); + for k in ["author", "created", "pinned"] { + assert!(hub.frontmatter.extra.contains_key(k), "lost `{k}`"); + } + let yaml = serde_yaml::to_string(&hub.frontmatter).unwrap(); + let reparsed: Frontmatter = serde_yaml::from_str(&yaml).unwrap(); + assert_eq!(hub.frontmatter, reparsed); + } + + #[test] + fn unknown_claim_field_is_still_rejected() { + // Anchor items are Surface's own data, not OKF's — a per-anchor typo still fails closed. 
+ let err = parse_hub( + "---\nsummary: s\nanchors:\n - claim: c\n at: a.rs > foo\n hahs: x\n---\n", + ) + .unwrap_err(); assert!( matches!(err, HubError::Yaml(_)), "expected Yaml error, got {err:?}" ); } + #[test] + fn doc_kind_classifies_reserved_filenames() { + assert_eq!(doc_kind("sales/index.md"), DocKind::Index); + assert_eq!(doc_kind("log.md"), DocKind::Log); + assert_eq!(doc_kind("tables/orders.md"), DocKind::Concept); + } + #[test] fn covers_field_is_accepted() { // `covers` is forward-declared per §9.1: parsed and stored, but inert in the verdict @@ -363,6 +503,21 @@ mod tests { assert_eq!(set_anchor_hash(HUB, 0, "oldhash").unwrap(), HUB); } + #[test] + fn set_anchor_field_inserts_provenance_and_reparses() { + // The generalized editor stamps `id`/`verified_*` into an anchor item and they parse back. + let out = set_anchor_field(HUB, 0, "id", "c_01hxyz").unwrap(); + let out = set_anchor_field(&out, 0, "verified_at", "2026-07-01T00:00:00Z").unwrap(); + let hub = parse_hub(&out).unwrap(); + assert_eq!(hub.frontmatter.anchors[0].id.as_deref(), Some("c_01hxyz")); + assert_eq!( + hub.frontmatter.anchors[0].verified_at.as_deref(), + Some("2026-07-01T00:00:00Z") + ); + // The other anchor is untouched. 
+ assert_eq!(hub.frontmatter.anchors[1].id, None); + } + #[test] fn versioned_stamp_round_trips_as_a_string() { // A `2:`-prefixed v2 stamp is a plain YAML scalar (colon not followed by a space), so it diff --git a/surf-core/src/lib.rs b/surf-core/src/lib.rs index 291a93a..fe430d0 100644 --- a/surf-core/src/lib.rs +++ b/surf-core/src/lib.rs @@ -16,7 +16,10 @@ pub use hash::{ combine_site_hashes, diff_magnitude, format_stamp, hash_anchor, hash_anchor_raw, hash_anchor_with, parse_stamp, HashOpts, Magnitude, Recipe, }; -pub use hub::{parse_hub, set_anchor_at, set_anchor_hash, At, Claim, Frontmatter, Hub, HubError}; +pub use hub::{ + doc_kind, parse_hub, set_anchor_at, set_anchor_field, set_anchor_hash, At, Claim, DocKind, + Frontmatter, Hub, HubError, +}; pub use lang::Lang; pub use refs::{parse_ref, Ref, RefParseError}; pub use rename::find_renamed; diff --git a/surf-core/src/report.rs b/surf-core/src/report.rs index ae10e95..d5d045a 100644 --- a/surf-core/src/report.rs +++ b/surf-core/src/report.rs @@ -57,6 +57,10 @@ pub struct Divergence { pub claim: String, pub at: String, pub kind: DivergenceKind, + /// Stable claim id, present once the claim has been stamped by `surf verify` — the join key for + /// claim timelines and attestation history. Additive under the v1 contract; never gates. + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, #[serde(skip_serializing_if = "Option::is_none")] pub old_hash: Option, #[serde(skip_serializing_if = "Option::is_none")]