makeaivisible · alp-topcu · Jun 30, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -13,7 +13,10 @@ jobs:
       - uses: actions/setup-python@v5
         with:
           python-version: "3.12"
-      - name: Placeholder
-        run: |
-          python --version
-          echo "Add linting and tests with the first implementation PR."
+          cache: pip
+      - name: Install
+        run: python -m pip install --upgrade pip && pip install -e '.[dev]'
+      - name: Lint
+        run: ruff check .
+      - name: Test
+        run: pytest
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.venv*/
+__pycache__/
+.pytest_cache/
+.ruff_cache/
+*.egg-info/
+dist/
+build/
diff --git a/README.md b/README.md
@@ -27,3 +27,43 @@ Raw conversations are considered highly sensitive. The MVP must treat raw text a
 ## First Milestone
 
 Expose a local `/anonymize` endpoint with deterministic redaction tests and a documented anonymized output schema.
+
+## Current Baseline
+
+The repository now includes a runnable deterministic baseline. It detects emails,
+separator-delimited North American phone numbers, URLs, `@`-prefixed usernames, and
+explicitly labeled account IDs. It processes requests in memory and does not include a
+database or file persistence layer.
+
+This baseline is not sufficient for production or real contributor data. Names, schools,
+free-form addresses, indirect identifiers, and context-dependent identifiers require a
+layered detector and expert privacy review.
+
+### Run locally
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install -e '.[dev]'
+uvicorn makeaivisible_anonymizer.main:app --reload
+```
+
+Open `http://127.0.0.1:8000/docs` for the generated API documentation, or submit the
+synthetic example directly:
+
+```bash
+curl -s http://127.0.0.1:8000/anonymize \
+  -H 'content-type: application/json' \
+  --data @examples/request.json
+```
+
+### Contract
+
+`POST /anonymize` accepts a conversation identifier and a non-empty list of messages.
+The response contains:
+
+- `schema_version`: version of the anonymized record contract.
+- `conversation_id`: caller-provided identifier; callers must not use a personal identifier.
+- `messages`: roles and redacted message content.
+- `redaction_report`: counts and source spans without the original sensitive values.
+
+The generated OpenAPI schema is the canonical machine-readable contract for this MVP.
diff --git a/examples/request.json b/examples/request.json
@@ -0,0 +1,13 @@
+{
+  "conversation_id": "synthetic-demo-1",
+  "messages": [
+    {
+      "role": "user",
+      "content": "My email is maya@example.org and my username is @maya_student."
+    },
+    {
+      "role": "assistant",
+      "content": "I will not contact you outside this conversation."
+    }
+  ]
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "makeaivisible-anonymizer"
+version = "0.1.0"
+description = "Privacy-preserving conversation anonymization service"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "fastapi>=0.115,<1",
+  "pydantic>=2.10,<3",
+  "uvicorn[standard]>=0.34,<1",
+]
+
+[project.optional-dependencies]
+dev = [
+  "httpx>=0.28,<1",
+  "pytest>=8.3,<9",
+  "ruff>=0.11,<1",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/makeaivisible_anonymizer"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py311"
diff --git a/src/makeaivisible_anonymizer/__init__.py b/src/makeaivisible_anonymizer/__init__.py
@@ -0,0 +1,3 @@
+"""Make AI Visible anonymization service."""
+
+__version__ = "0.1.0"
diff --git a/src/makeaivisible_anonymizer/main.py b/src/makeaivisible_anonymizer/main.py
@@ -0,0 +1,46 @@
+from collections import defaultdict
+
+from fastapi import FastAPI
+
+from . import __version__
+from .models import (
+    AnonymizeRequest,
+    AnonymizeResponse,
+    ConversationMessage,
+    RedactionSummary,
+)
+from .redaction import count_by_type, redact_text
+
+# ASGI application served by uvicorn; title, description and version feed the generated
+# API documentation.
+app = FastAPI(
+    title="Make AI Visible Anonymization Service",
+    description="Deterministic baseline for removing direct identifiers from conversations.",
+    version=__version__,
+)
+
+
+@app.get("/health")
+def health() -> dict[str, str]:
+    # Liveness probe: a fixed status flag plus the running package version.
+    # (Plain comments rather than a docstring, so the generated API description is unchanged.)
+    payload = {"status": "ok", "version": __version__}
+    return payload
+
+
+@app.post("/anonymize", response_model=AnonymizeResponse)
+def anonymize(request: AnonymizeRequest) -> AnonymizeResponse:
+    # Redact every message in order and return the redacted conversation with a report.
+    # One counter map is threaded through all messages so placeholder numbering
+    # (e.g. [EMAIL_1], [EMAIL_2]) continues across the whole request.
+    placeholder_counts: defaultdict[str, int] = defaultdict(int)
+    redacted_messages: list[ConversationMessage] = []
+    all_redactions = []
+
+    for position, original in enumerate(request.messages):
+        redacted_text, found = redact_text(original.content, position, placeholder_counts)
+        redacted_messages.append(ConversationMessage(role=original.role, content=redacted_text))
+        all_redactions += found
+
+    report = RedactionSummary(
+        total=len(all_redactions),
+        by_type=count_by_type(all_redactions),
+        redactions=all_redactions,
+    )
+    return AnonymizeResponse(
+        schema_version="0.1.0",
+        conversation_id=request.conversation_id,
+        messages=redacted_messages,
+        redaction_report=report,
+    )
diff --git a/src/makeaivisible_anonymizer/models.py b/src/makeaivisible_anonymizer/models.py
@@ -0,0 +1,51 @@
+from enum import StrEnum
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class Role(StrEnum):
+    # Closed set of speaker roles accepted on a message; any other value fails validation.
+    USER = "user"
+    ASSISTANT = "assistant"
+    SYSTEM = "system"
+
+
+class ConversationMessage(BaseModel):
+    # One conversation turn. This model is also used for redacted output, so it carries no
+    # upper length bound: a placeholder such as "[USERNAME_12]" can be longer than the text
+    # it replaces, and a near-limit input may legitimately grow past the request limit.
+    # Capping output here turned such requests into a validation failure (HTTP 500).
+    model_config = ConfigDict(extra="forbid")
+
+    role: Role
+    content: str = Field(min_length=1)
+
+    @field_validator("content")
+    @classmethod
+    def content_must_not_be_blank(cls, value: str) -> str:
+        # min_length alone would still accept whitespace-only content.
+        if not value.strip():
+            raise ValueError("content must not be blank")
+        return value
+
+
+class InboundMessage(ConversationMessage):
+    # Request-side message: same fields, config and validators as ConversationMessage,
+    # plus the 50,000 character cap that bounds the work done per untrusted message.
+    content: str = Field(min_length=1, max_length=50_000)
+
+
+class AnonymizeRequest(BaseModel):
+    # Body of POST /anonymize. Unknown fields are rejected rather than silently ignored.
+    model_config = ConfigDict(extra="forbid")
+
+    conversation_id: str = Field(min_length=1, max_length=128)
+    messages: list[InboundMessage] = Field(min_length=1, max_length=1_000)
+
+
+class Redaction(BaseModel):
+    # One replaced span. It records where the value was and what replaced it, never the
+    # original value itself.
+    message_index: int  # position of the message within the request's message list
+    entity_type: str  # detector label, e.g. "EMAIL"
+    start: int  # offset of the span start in the original (unredacted) content
+    end: int  # offset just past the span end in the original content
+    replacement: str  # placeholder written into the redacted content, e.g. "[EMAIL_1]"
+
+
+class RedactionSummary(BaseModel):
+    # Aggregate report for one anonymize request.
+    total: int  # number of redactions across all messages
+    by_type: dict[str, int]  # redaction count per entity type
+    redactions: list[Redaction]  # individual redactions, in message order
+
+
+class AnonymizeResponse(BaseModel):
+    # Response body of POST /anonymize.
+    schema_version: str  # version of the anonymized record contract
+    conversation_id: str  # echoed back unchanged from the request
+    messages: list[ConversationMessage]  # original roles with redacted content
+    redaction_report: RedactionSummary
diff --git a/src/makeaivisible_anonymizer/redaction.py b/src/makeaivisible_anonymizer/redaction.py
@@ -0,0 +1,82 @@
+import re
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+
+from .models import Redaction
+
+
+@dataclass(frozen=True)
+class Detector:
+    """Pair an entity label with the compiled pattern that finds it."""
+
+    entity_type: str  # label used in placeholders and in the redaction report
+    pattern: re.Pattern[str]  # every match of this pattern is a redaction candidate
+
+
+# Detectors are all run over the same text; when candidates tie on start and length, the
+# one listed earlier here wins.
+DETECTORS = (
+    # local@domain.tld: the lookbehind stops a match from starting in the middle of a longer
+    # token, and the domain must contain at least one dot.
+    Detector("EMAIL", re.compile(r"(?<![\w.+-])[\w.+-]+@[\w-]+(?:\.[\w-]+)+", re.I)),
+    # Starts at http://, https:// or www. and runs to the next whitespace or angle bracket,
+    # so trailing punctuation directly after a URL is redacted with it.
+    Detector("URL", re.compile(r"\b(?:https?://|www\.)[^\s<>]+", re.I)),
+    # 3-3-4 digit groups separated by space, dot or hyphen, with an optional 1 / +1 prefix
+    # and optional parentheses around the first group. Unseparated digit runs do not match.
+    Detector(
+        "PHONE",
+        re.compile(r"(?<!\w)(?:\+?1[ .-]?)?(?:\(\d{3}\)|\d{3})[ .-]\d{3}[ .-]\d{4}(?!\w)"),
+    ),
+    # "@" followed by 2-32 letters, digits or underscores; the lookbehind rejects an "@"
+    # that follows a word character (as in an email address) or another "@".
+    Detector("USERNAME", re.compile(r"(?<![\w@])@[A-Za-z0-9_]{2,32}\b")),
+    # Labeled identifiers only: account/user/member + "id", then ":", "=" or "#", then a
+    # 4-64 character value. The label is part of the match and is redacted with the value.
+    Detector(
+        "ACCOUNT_ID",
+        re.compile(r"\b(?:account|user|member)[ _-]?id\s*[:=#]\s*[A-Za-z0-9_-]{4,64}\b", re.I),
+    ),
+)
+
+
+@dataclass(frozen=True)
+class Match:
+    """A detected span: entity label plus offsets into the scanned text."""
+
+    entity_type: str
+    start: int  # offset of the first matched character
+    end: int  # offset just past the last matched character
+
+
+def _matches(text: str) -> list[Match]:
+    """Return the non-overlapping detector matches in *text*, ordered by position.
+
+    Every detector is run over the full text. Where candidates overlap, the one that
+    starts first wins; among candidates starting at the same offset the longest wins,
+    and remaining ties go to the detector listed first in ``DETECTORS``.
+    """
+    candidates = sorted(
+        (
+            Match(detector.entity_type, match.start(), match.end())
+            for detector in DETECTORS
+            for match in detector.pattern.finditer(text)
+        ),
+        key=lambda item: (item.start, -(item.end - item.start)),
+    )
+
+    accepted: list[Match] = []
+    # Candidates arrive sorted by start and accepted spans never overlap, so a candidate
+    # collides with an accepted span exactly when it starts before the most recently
+    # accepted span ends. Tracking that one offset keeps selection linear instead of
+    # rescanning every accepted span per candidate. (Relies on matches being non-empty,
+    # which holds for every pattern in DETECTORS.)
+    covered_until = 0
+    for candidate in candidates:
+        if candidate.start < covered_until:
+            continue
+        accepted.append(candidate)
+        covered_until = candidate.end
+    return accepted
+
+
+def redact_text(
+    text: str,
+    message_index: int,
+    counters: defaultdict[str, int],
+) -> tuple[str, list[Redaction]]:
+    """Replace each detected span in *text* with a numbered placeholder.
+
+    ``counters`` is mutated: the count for an entity type is incremented for every match
+    and the new value becomes the placeholder's number, so passing the same mapping for
+    successive messages continues the numbering. Returns the redacted text together with
+    one ``Redaction`` per replaced span; offsets refer to the original *text*.
+    """
+    pieces: list[str] = []
+    report: list[Redaction] = []
+    resume_at = 0
+
+    for hit in _matches(text):
+        kind = hit.entity_type
+        counters[kind] += 1
+        placeholder = f"[{kind}_{counters[kind]}]"
+
+        # Keep the untouched text before the span, then substitute the placeholder.
+        pieces.append(text[resume_at : hit.start])
+        pieces.append(placeholder)
+        report.append(
+            Redaction(
+                message_index=message_index,
+                entity_type=kind,
+                start=hit.start,
+                end=hit.end,
+                replacement=placeholder,
+            )
+        )
+        resume_at = hit.end
+
+    # Tail after the last span (the whole text when nothing matched).
+    pieces.append(text[resume_at:])
+    return "".join(pieces), report
+
+
+def count_by_type(redactions: list[Redaction]) -> dict[str, int]:
+    """Tally redactions per entity type, with keys in sorted order."""
+    tally = Counter(item.entity_type for item in redactions)
+    return {entity_type: tally[entity_type] for entity_type in sorted(tally)}
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -0,0 +1,86 @@
+from fastapi.testclient import TestClient
+
+from makeaivisible_anonymizer.main import app
+
+
+# Shared in-process client; requests go straight to the ASGI app without a running server.
+client = TestClient(app)
+
+
+def test_health() -> None:
+    """The health endpoint answers 200 with a status flag and the package version."""
+    reply = client.get("/health")
+    expected = {"status": "ok", "version": "0.1.0"}
+
+    assert reply.status_code == 200
+    assert reply.json() == expected
+
+
+def test_anonymizes_synthetic_conversation_without_echoing_values() -> None:
+    """Each detector fires once and no raw value appears anywhere in the response body."""
+    sensitive_values = (
+        "maya@example.org",
+        "+1 604-555-0199",
+        "https://example.org/profile/maya",
+        "@maya_student",
+        "user_id: abc_12345",
+    )
+    email, *other_values = sensitive_values
+    request_body = {
+        "conversation_id": "synthetic-demo-1",
+        "messages": [
+            {"role": "user", "content": f"Email me at {email}."},
+            {"role": "assistant", "content": "Contact " + ", ".join(other_values)},
+        ],
+    }
+
+    response = client.post("/anonymize", json=request_body)
+
+    assert response.status_code == 200
+    payload = response.json()
+    report = payload["redaction_report"]
+    assert payload["schema_version"] == "0.1.0"
+    assert report["total"] == 5
+    assert report["by_type"] == {
+        "ACCOUNT_ID": 1,
+        "EMAIL": 1,
+        "PHONE": 1,
+        "URL": 1,
+        "USERNAME": 1,
+    }
+    # Scan the raw body rather than selected fields so a leak in any field is caught.
+    raw_body = response.text
+    for value in sensitive_values:
+        assert value not in raw_body
+
+
+def test_replacements_are_stable_within_a_request() -> None:
+    """Placeholder numbering continues across messages rather than restarting per message."""
+    request_body = {
+        "conversation_id": "synthetic-demo-2",
+        "messages": [
+            {"role": "user", "content": "First: a@example.org"},
+            {"role": "user", "content": "Second: b@example.org"},
+        ],
+    }
+
+    response = client.post("/anonymize", json=request_body)
+
+    assert response.status_code == 200
+    redacted = [message["content"] for message in response.json()["messages"]]
+    assert redacted[0] == "First: [EMAIL_1]"
+    assert redacted[1] == "Second: [EMAIL_2]"
+
+
+def test_rejects_blank_messages_and_unknown_fields() -> None:
+    """Whitespace-only content and undeclared message fields are both rejected with 422."""
+    blank_message = {"role": "user", "content": "   "}
+    message_with_extra_field = {"role": "user", "content": "hello", "raw_name": "not allowed"}
+
+    blank = client.post(
+        "/anonymize",
+        json={"conversation_id": "bad", "messages": [blank_message]},
+    )
+    unknown = client.post(
+        "/anonymize",
+        json={"conversation_id": "bad", "messages": [message_with_extra_field]},
+    )
+
+    assert blank.status_code == 422
+    assert unknown.status_code == 422
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		"""Make AI Visible anonymization service."""

		__version__ = "0.1.0"