diff --git a/data-fabrication-anomaly-assistant/README.md b/data-fabrication-anomaly-assistant/README.md new file mode 100644 index 00000000..4f147908 --- /dev/null +++ b/data-fabrication-anomaly-assistant/README.md @@ -0,0 +1,23 @@ +# Data Fabrication Anomaly Assistant + +This is a focused AI-Powered Research Assistant Suite slice for issue #16. It reviews synthetic manuscript data packets before AI peer-review output is trusted and emits release, review, or hold decisions for data-forensics red flags. + +The assistant checks: + +- repeated measurement rows that need raw-data verification +- invalid collection timestamps that cannot be audited +- terminal-digit preference that suggests rounding or synthetic entry +- unusually smooth measurement series +- perfect group separation that needs preregistered exclusion and raw-row provenance + +All fixtures are synthetic. The module does not call external AI services, publisher APIs, private datasets, credential stores, payment systems, or live manuscript systems. + +## Run + +```sh +npm run check +npm test +npm run demo +``` + +The demo writes reviewer artifacts under `reports/`. 
diff --git a/data-fabrication-anomaly-assistant/demo.js b/data-fabrication-anomaly-assistant/demo.js new file mode 100644 index 00000000..bbbf04a9 --- /dev/null +++ b/data-fabrication-anomaly-assistant/demo.js @@ -0,0 +1,31 @@ +import fs from "node:fs"; +import path from "node:path"; +import { + evaluateFabricationAnomalyPacket, + renderSummarySvg, + summarizeReview, +} from "./src/assistant.js"; +import { cleanPacket, riskyPacket } from "./src/samplePackets.js"; + +const reportsDir = path.join(process.cwd(), "reports"); +fs.mkdirSync(reportsDir, { recursive: true }); + +const clean = evaluateFabricationAnomalyPacket(cleanPacket); +const risky = evaluateFabricationAnomalyPacket(riskyPacket); + +fs.writeFileSync( + path.join(reportsDir, "clean-review.json"), + `${JSON.stringify(clean, null, 2)}\n`, +); +fs.writeFileSync( + path.join(reportsDir, "risky-review.json"), + `${JSON.stringify(risky, null, 2)}\n`, +); +fs.writeFileSync(path.join(reportsDir, "risky-review.md"), summarizeReview(risky)); +fs.writeFileSync(path.join(reportsDir, "summary.svg"), renderSummarySvg(risky)); + +console.log("Wrote data fabrication anomaly assistant reports:"); +console.log("- reports/clean-review.json"); +console.log("- reports/risky-review.json"); +console.log("- reports/risky-review.md"); +console.log("- reports/summary.svg"); diff --git a/data-fabrication-anomaly-assistant/make-demo-video.js b/data-fabrication-anomaly-assistant/make-demo-video.js new file mode 100644 index 00000000..0d1752be --- /dev/null +++ b/data-fabrication-anomaly-assistant/make-demo-video.js @@ -0,0 +1,50 @@ +import { spawnSync } from "node:child_process"; +import path from "node:path"; +import { riskyPacket } from "./src/samplePackets.js"; +import { evaluateFabricationAnomalyPacket } from "./src/assistant.js"; + +const reportsDir = path.join(process.cwd(), "reports"); +const demoMp4 = path.join(reportsDir, "demo.mp4"); +const resultPacket = evaluateFabricationAnomalyPacket(riskyPacket); + +function 
escapeDrawtext(text) { + return text.replaceAll("\\", "\\\\").replaceAll(":", "\\:").replaceAll("'", "\\'"); +} + +const font = "C\\:/Windows/Fonts/arial.ttf"; +const lines = [ + "Data Fabrication Anomaly Assistant", + `Decision ${resultPacket.decision} | Findings ${resultPacket.findingCount}`, + "Flags duplicate rows, invalid timestamps, digit preference, smooth series, and perfect separation.", + `Audit digest ${resultPacket.auditDigest}`, +]; +const drawText = lines + .map( + (line, index) => + `drawtext=fontfile='${font}':text='${escapeDrawtext(line)}':x=48:y=${64 + index * 72}:fontsize=${index === 0 ? 34 : 24}:fontcolor=${index === 1 ? "0xffdddd" : "white"}`, + ) + .join(","); + +const result = spawnSync( + "ffmpeg", + [ + "-y", + "-f", + "lavfi", + "-i", + "color=c=0x111827:s=960x540:r=12", + "-t", + "4", + "-vf", + `${drawText},format=yuv420p`, + "-an", + demoMp4, + ], + { encoding: "utf8" }, +); + +if (result.status !== 0) { + throw new Error(result.stderr || "ffmpeg failed to render demo.mp4"); +} + +console.log(`Wrote ${path.relative(process.cwd(), demoMp4)}`); diff --git a/data-fabrication-anomaly-assistant/package.json b/data-fabrication-anomaly-assistant/package.json new file mode 100644 index 00000000..6f386f49 --- /dev/null +++ b/data-fabrication-anomaly-assistant/package.json @@ -0,0 +1,13 @@ +{ + "name": "data-fabrication-anomaly-assistant", + "version": "1.0.0", + "private": true, + "description": "Synthetic data-forensics review assistant for SCIBASE AI research review packets.", + "type": "module", + "scripts": { + "check": "node --check src/assistant.js && node --check src/samplePackets.js && node --check test.js && node --check demo.js && node --check make-demo-video.js", + "test": "node test.js", + "demo": "node demo.js", + "demo:video": "node make-demo-video.js" + } +} diff --git a/data-fabrication-anomaly-assistant/reports/clean-review.json b/data-fabrication-anomaly-assistant/reports/clean-review.json new file mode 100644 index 
00000000..a73db814 --- /dev/null +++ b/data-fabrication-anomaly-assistant/reports/clean-review.json @@ -0,0 +1,9 @@ +{ + "packetId": "clean-growth-study", + "manuscriptTitle": "Enzyme response in replicated plant growth chambers", + "decision": "RELEASE", + "findingCount": 0, + "findings": [], + "auditDigest": "5db12124295e2085", + "assistantScope": "Synthetic data-fabrication anomaly red flags before AI peer-review output is trusted." +} diff --git a/data-fabrication-anomaly-assistant/reports/demo.mp4 b/data-fabrication-anomaly-assistant/reports/demo.mp4 new file mode 100644 index 00000000..afae3d2d Binary files /dev/null and b/data-fabrication-anomaly-assistant/reports/demo.mp4 differ diff --git a/data-fabrication-anomaly-assistant/reports/risky-review.json b/data-fabrication-anomaly-assistant/reports/risky-review.json new file mode 100644 index 00000000..c6167afe --- /dev/null +++ b/data-fabrication-anomaly-assistant/reports/risky-review.json @@ -0,0 +1,45 @@ +{ + "packetId": "risky-cytokine-study", + "manuscriptTitle": "Cytokine response claims from a disputed assay batch", + "decision": "HOLD", + "findingCount": 5, + "findings": [ + { + "id": "duplicate-measurement-rows", + "severity": "high", + "title": "Repeated measurement rows need raw-data verification", + "evidence": "1 duplicate row pair(s): #12/#13", + "action": "Hold AI review output until source instruments, audit trail, and import logs explain the repeated rows." + }, + { + "id": "invalid-collection-timestamps", + "severity": "high", + "title": "Collection timestamps are not machine-auditable", + "evidence": "Invalid timestamp rows: #14", + "action": "Require normalized ISO-8601 collection timestamps before the assistant cites this dataset." 
+ }, + { + "id": "terminal-digit-preference", + "severity": "medium", + "title": "Terminal digit distribution is unusually concentrated", + "evidence": "50% of numeric measurements end in 0", + "action": "Ask for raw instrument exports or an explanation of rounding rules before trusting fine-grained effects." + }, + { + "id": "over-smooth-measurement-series", + "severity": "medium", + "title": "Measurement series is smoother than expected for independent observations", + "evidence": "control delta sd/value sd=0", + "action": "Route to statistical reviewer for instrument drift, interpolation, or synthetic-row checks." + }, + { + "id": "perfect-between-group-separation", + "severity": "medium", + "title": "Groups separate perfectly without overlap", + "evidence": "control: n=6, range=10-12.5; treated: n=8, range=20-23", + "action": "Require preregistered exclusion rules and raw row provenance before strong causal or classification claims are released." + } + ], + "auditDigest": "6466f6cc655b23a7", + "assistantScope": "Synthetic data-fabrication anomaly red flags before AI peer-review output is trusted." +} diff --git a/data-fabrication-anomaly-assistant/reports/risky-review.md b/data-fabrication-anomaly-assistant/reports/risky-review.md new file mode 100644 index 00000000..cf5d36fd --- /dev/null +++ b/data-fabrication-anomaly-assistant/reports/risky-review.md @@ -0,0 +1,22 @@ +# Data fabrication anomaly review: Cytokine response claims from a disputed assay batch + +Decision: **HOLD** +Audit digest: `6466f6cc655b23a7` + +## Findings + +- **HIGH** Repeated measurement rows need raw-data verification (duplicate-measurement-rows) + Evidence: 1 duplicate row pair(s): #12/#13 + Action: Hold AI review output until source instruments, audit trail, and import logs explain the repeated rows. 
+- **HIGH** Collection timestamps are not machine-auditable (invalid-collection-timestamps) + Evidence: Invalid timestamp rows: #14 + Action: Require normalized ISO-8601 collection timestamps before the assistant cites this dataset. +- **MEDIUM** Terminal digit distribution is unusually concentrated (terminal-digit-preference) + Evidence: 50% of numeric measurements end in 0 + Action: Ask for raw instrument exports or an explanation of rounding rules before trusting fine-grained effects. +- **MEDIUM** Measurement series is smoother than expected for independent observations (over-smooth-measurement-series) + Evidence: control delta sd/value sd=0 + Action: Route to statistical reviewer for instrument drift, interpolation, or synthetic-row checks. +- **MEDIUM** Groups separate perfectly without overlap (perfect-between-group-separation) + Evidence: control: n=6, range=10-12.5; treated: n=8, range=20-23 + Action: Require preregistered exclusion rules and raw row provenance before strong causal or classification claims are released. diff --git a/data-fabrication-anomaly-assistant/reports/summary.svg b/data-fabrication-anomaly-assistant/reports/summary.svg new file mode 100644 index 00000000..fd82864f --- /dev/null +++ b/data-fabrication-anomaly-assistant/reports/summary.svg @@ -0,0 +1,11 @@ + + + + Data Fabrication Anomaly Assistant + Cytokine response claims from a disputed assay batch + + HOLD + Findings: 5 | Digest: 6466f6cc655b23a7 + HIGH - duplicate-measurement-rowsHIGH - invalid-collection-timestampsMEDIUM - terminal-digit-preferenceMEDIUM - over-smooth-measurement-series + Synthetic data only. No live manuscripts, private datasets, credentials, or external APIs. 
/**
 * Derive a short, deterministic audit digest for a review packet.
 *
 * The packet is serialized to canonical JSON (object keys sorted at every
 * nesting level) and hashed with SHA-256; the first 16 hex characters are
 * returned.
 *
 * @param {object} packet - JSON-serializable value to fingerprint.
 * @returns {string} 16 lowercase hexadecimal characters.
 */
function digestPacket(packet) {
  // A replacer *function* is required here. Passing an array of the top-level
  // key names (the previous approach) makes JSON.stringify treat it as a
  // property allowlist at every depth, so nested record fields were dropped
  // and the digest did not change when the measurement data changed.
  const canonical = JSON.stringify(packet, (_key, value) => {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      return value;
    }
    // Rebuild plain objects with sorted keys so that property insertion order
    // never influences the digest.
    return Object.fromEntries(
      Object.keys(value)
        .sort()
        .map((key) => [key, value[key]]),
    );
  });

  return crypto
    .createHash("sha256")
    .update(canonical)
    .digest("hex")
    .slice(0, 16);
}
"unknown"), + })); +} + +function findDuplicateRows(records) { + const seen = new Map(); + const duplicates = []; + for (const record of records) { + const key = [ + record.group, + record.value, + record.collectedAt, + record.operatorId, + ].join("|"); + if (seen.has(key)) { + duplicates.push([seen.get(key), record.index]); + } else { + seen.set(key, record.index); + } + } + + if (duplicates.length === 0) return null; + return finding( + "duplicate-measurement-rows", + "high", + "Repeated measurement rows need raw-data verification", + `${duplicates.length} duplicate row pair(s): ${duplicates + .map(([a, b]) => `#${a + 1}/#${b + 1}`) + .join(", ")}`, + "Hold AI review output until source instruments, audit trail, and import logs explain the repeated rows.", + ); +} + +function findImpossibleTimestamps(records) { + const invalidRows = records.filter((record) => { + const parsed = new Date(record.collectedAt); + return Number.isNaN(parsed.getTime()); + }); + + if (invalidRows.length === 0) return null; + return finding( + "invalid-collection-timestamps", + "high", + "Collection timestamps are not machine-auditable", + `Invalid timestamp rows: ${invalidRows.map((row) => `#${row.index + 1}`).join(", ")}`, + "Require normalized ISO-8601 collection timestamps before the assistant cites this dataset.", + ); +} + +function findDigitPreference(records) { + const numericRows = records.filter((record) => Number.isFinite(record.value)); + if (numericRows.length < 10) return null; + + const terminalCounts = new Map(); + for (const record of numericRows) { + const terminal = Math.abs(Math.round(record.value * 10)) % 10; + terminalCounts.set(terminal, (terminalCounts.get(terminal) ?? 
/**
 * Flag groups whose time-ordered measurements change too evenly.
 *
 * For every group with at least six usable rows, the rows are ordered by
 * collection time and the standard deviation of consecutive differences is
 * compared with the standard deviation of the values themselves. A ratio
 * below 0.045 is reported. Groups whose values have zero spread are skipped.
 *
 * Rows are usable when `value` is finite and `collectedAt` parses to a date.
 * Rows with an unparseable timestamp have no defined position in the series
 * and are left out; previously they made the sort comparator return NaN,
 * which leaves the resulting order implementation-defined.
 *
 * @param {Array<{group: string, value: number, collectedAt: unknown}>} records
 *   Normalized records.
 * @returns {object|null} A medium-severity finding naming each suspicious
 *   group and its ratio, or `null` when no group qualifies.
 */
function findTooSmoothSequence(records) {
  // Plain Map grouping instead of Map.groupBy so the check also runs on
  // runtimes that predate that ES2024 addition.
  const byGroup = new Map();
  for (const record of records) {
    if (!Number.isFinite(record.value)) continue;
    const collectedMs = new Date(record.collectedAt).getTime();
    if (Number.isNaN(collectedMs)) continue;

    const entry = { collectedMs, value: record.value };
    const bucket = byGroup.get(record.group);
    if (bucket) {
      bucket.push(entry);
    } else {
      byGroup.set(record.group, [entry]);
    }
  }

  const suspicious = [];
  for (const [group, groupRecords] of byGroup) {
    if (groupRecords.length < 6) continue;

    // Numeric comparator on pre-parsed epoch milliseconds; the sort is stable,
    // so rows with equal timestamps keep their input order.
    const sorted = [...groupRecords].sort(
      (a, b) => a.collectedMs - b.collectedMs,
    );
    const deltas = sorted
      .slice(1)
      .map((record, index) => round(record.value - sorted[index].value, 4));
    const deltaSpread = stddev(deltas);
    const valueSpread = stddev(sorted.map((record) => record.value));
    if (valueSpread > 0 && deltaSpread / valueSpread < 0.045) {
      suspicious.push(
        `${group} delta sd/value sd=${round(deltaSpread / valueSpread, 4)}`,
      );
    }
  }

  if (suspicious.length === 0) return null;
  return finding(
    "over-smooth-measurement-series",
    "medium",
    "Measurement series is smoother than expected for independent observations",
    suspicious.join("; "),
    "Route to statistical reviewer for instrument drift, interpolation, or synthetic-row checks.",
  );
}
Math.min(...values), + max: Math.max(...values), + median: median(values), + }; + }); + + const ordered = summaries.sort((a, b) => a.median - b.median); + const gaps = ordered + .slice(1) + .map((summary, index) => summary.min - ordered[index].max); + const separated = gaps.some((gap) => gap > 0); + const allGroupsLargeEnough = summaries.every((summary) => summary.n >= 6); + + if (!separated || !allGroupsLargeEnough) return null; + return finding( + "perfect-between-group-separation", + "medium", + "Groups separate perfectly without overlap", + summaries + .map( + (summary) => + `${summary.group}: n=${summary.n}, range=${round(summary.min)}-${round(summary.max)}`, + ) + .join("; "), + "Require preregistered exclusion rules and raw row provenance before strong causal or classification claims are released.", + ); +} + +export function evaluateFabricationAnomalyPacket(packet) { + const records = normalizeRecords(packet.records); + const findings = [ + findDuplicateRows(records), + findImpossibleTimestamps(records), + findDigitPreference(records), + findTooSmoothSequence(records), + findPerfectSeparation(records), + ].filter(Boolean); + + let decision = "RELEASE"; + for (const item of findings) { + decision = strongerDecision( + decision, + item.severity === "high" ? 
"HOLD" : "REVIEW", + ); + } + + return { + packetId: packet.id, + manuscriptTitle: packet.manuscriptTitle, + decision, + findingCount: findings.length, + findings, + auditDigest: digestPacket({ + id: packet.id, + records, + findings: findings.map((item) => item.id), + }), + assistantScope: + "Synthetic data-fabrication anomaly red flags before AI peer-review output is trusted.", + }; +} + +export function summarizeReview(result) { + const lines = [ + `# Data fabrication anomaly review: ${result.manuscriptTitle}`, + "", + `Decision: **${result.decision}**`, + `Audit digest: \`${result.auditDigest}\``, + "", + ]; + + if (result.findings.length === 0) { + lines.push("No fabrication-anomaly red flags were detected in the synthetic packet."); + } else { + lines.push("## Findings", ""); + for (const item of result.findings) { + lines.push( + `- **${item.severity.toUpperCase()}** ${item.title} (${item.id})`, + ` Evidence: ${item.evidence}`, + ` Action: ${item.action}`, + ); + } + } + + return `${lines.join("\n")}\n`; +} + +export function renderSummarySvg(result) { + const hold = result.decision === "HOLD"; + const review = result.decision === "REVIEW"; + const accent = hold ? "#b91c1c" : review ? "#b45309" : "#047857"; + const rows = result.findings.slice(0, 4); + const findingRows = rows + .map( + (item, index) => + `${item.severity.toUpperCase()} - ${item.id}`, + ) + .join(""); + + return ` + + + Data Fabrication Anomaly Assistant + ${result.manuscriptTitle} + + ${result.decision} + Findings: ${result.findingCount} | Digest: ${result.auditDigest} + ${findingRows || 'No red flags detected.'} + Synthetic data only. No live manuscripts, private datasets, credentials, or external APIs. 
+ +`; +} diff --git a/data-fabrication-anomaly-assistant/src/samplePackets.js b/data-fabrication-anomaly-assistant/src/samplePackets.js new file mode 100644 index 00000000..8af499a1 --- /dev/null +++ b/data-fabrication-anomaly-assistant/src/samplePackets.js @@ -0,0 +1,39 @@ +export const cleanPacket = { + id: "clean-growth-study", + manuscriptTitle: "Enzyme response in replicated plant growth chambers", + records: [ + { participantId: "P001", group: "control", value: 9.7, collectedAt: "2026-02-01T09:00:00Z", operatorId: "op-a" }, + { participantId: "P002", group: "control", value: 10.4, collectedAt: "2026-02-01T09:06:00Z", operatorId: "op-b" }, + { participantId: "P003", group: "control", value: 11.1, collectedAt: "2026-02-01T09:12:00Z", operatorId: "op-a" }, + { participantId: "P004", group: "control", value: 10.8, collectedAt: "2026-02-01T09:18:00Z", operatorId: "op-b" }, + { participantId: "P005", group: "control", value: 9.9, collectedAt: "2026-02-01T09:24:00Z", operatorId: "op-a" }, + { participantId: "P006", group: "control", value: 10.6, collectedAt: "2026-02-01T09:30:00Z", operatorId: "op-b" }, + { participantId: "P007", group: "treated", value: 11.3, collectedAt: "2026-02-01T10:00:00Z", operatorId: "op-a" }, + { participantId: "P008", group: "treated", value: 12.1, collectedAt: "2026-02-01T10:06:00Z", operatorId: "op-b" }, + { participantId: "P009", group: "treated", value: 10.9, collectedAt: "2026-02-01T10:12:00Z", operatorId: "op-a" }, + { participantId: "P010", group: "treated", value: 12.7, collectedAt: "2026-02-01T10:18:00Z", operatorId: "op-b" }, + { participantId: "P011", group: "treated", value: 11.8, collectedAt: "2026-02-01T10:24:00Z", operatorId: "op-a" }, + { participantId: "P012", group: "treated", value: 12.4, collectedAt: "2026-02-01T10:30:00Z", operatorId: "op-b" } + ] +}; + +export const riskyPacket = { + id: "risky-cytokine-study", + manuscriptTitle: "Cytokine response claims from a disputed assay batch", + records: [ + { participantId: 
"C001", group: "control", value: 10.0, collectedAt: "2026-03-01T09:00:00Z", operatorId: "op-a" }, + { participantId: "C002", group: "control", value: 10.5, collectedAt: "2026-03-01T09:05:00Z", operatorId: "op-a" }, + { participantId: "C003", group: "control", value: 11.0, collectedAt: "2026-03-01T09:10:00Z", operatorId: "op-a" }, + { participantId: "C004", group: "control", value: 11.5, collectedAt: "2026-03-01T09:15:00Z", operatorId: "op-a" }, + { participantId: "C005", group: "control", value: 12.0, collectedAt: "2026-03-01T09:20:00Z", operatorId: "op-a" }, + { participantId: "C006", group: "control", value: 12.5, collectedAt: "2026-03-01T09:25:00Z", operatorId: "op-a" }, + { participantId: "T001", group: "treated", value: 20.0, collectedAt: "2026-03-01T10:00:00Z", operatorId: "op-b" }, + { participantId: "T002", group: "treated", value: 20.5, collectedAt: "2026-03-01T10:05:00Z", operatorId: "op-b" }, + { participantId: "T003", group: "treated", value: 21.0, collectedAt: "2026-03-01T10:10:00Z", operatorId: "op-b" }, + { participantId: "T004", group: "treated", value: 21.5, collectedAt: "2026-03-01T10:15:00Z", operatorId: "op-b" }, + { participantId: "T005", group: "treated", value: 22.0, collectedAt: "2026-03-01T10:20:00Z", operatorId: "op-b" }, + { participantId: "T006", group: "treated", value: 22.5, collectedAt: "2026-03-01T10:25:00Z", operatorId: "op-b" }, + { participantId: "T007", group: "treated", value: 22.5, collectedAt: "2026-03-01T10:25:00Z", operatorId: "op-b" }, + { participantId: "T008", group: "treated", value: 23.0, collectedAt: "not-a-date", operatorId: "op-b" } + ] +}; diff --git a/data-fabrication-anomaly-assistant/test.js b/data-fabrication-anomaly-assistant/test.js new file mode 100644 index 00000000..42ba1b5e --- /dev/null +++ b/data-fabrication-anomaly-assistant/test.js @@ -0,0 +1,31 @@ +import assert from "node:assert/strict"; +import { + evaluateFabricationAnomalyPacket, + renderSummarySvg, + summarizeReview, +} from "./src/assistant.js"; 
+import { cleanPacket, riskyPacket } from "./src/samplePackets.js"; + +const clean = evaluateFabricationAnomalyPacket(cleanPacket); +assert.equal(clean.decision, "RELEASE"); +assert.equal(clean.findingCount, 0); +assert.match(clean.auditDigest, /^[0-9a-f]{16}$/); + +const risky = evaluateFabricationAnomalyPacket(riskyPacket); +assert.equal(risky.decision, "HOLD"); +assert.ok(risky.findings.some((item) => item.id === "duplicate-measurement-rows")); +assert.ok(risky.findings.some((item) => item.id === "invalid-collection-timestamps")); +assert.ok(risky.findings.some((item) => item.id === "terminal-digit-preference")); +assert.ok(risky.findings.some((item) => item.id === "over-smooth-measurement-series")); +assert.ok(risky.findings.some((item) => item.id === "perfect-between-group-separation")); + +const markdown = summarizeReview(risky); +assert.match(markdown, /Data fabrication anomaly review/); +assert.match(markdown, /HOLD/); +assert.match(markdown, /raw-data verification/); + +const svg = renderSummarySvg(risky); +assert.match(svg, /