Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ jobs:
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- name: Placeholder
run: |
python --version
echo "Add linting and tests with the first implementation PR."
cache: pip
- name: Install
run: python -m pip install --upgrade pip && pip install -e '.[dev]'
- name: Lint
run: ruff check .
- name: Test
run: pytest
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.venv*/
__pycache__/
.pytest_cache/
.ruff_cache/
*.egg-info/
dist/
build/
40 changes: 40 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,43 @@ Raw conversations are considered highly sensitive. The MVP must treat raw text a
## First Milestone

Expose a local `/anonymize` endpoint with deterministic redaction tests and a documented anonymized output schema.

## Current Baseline

The repository now includes a runnable deterministic baseline. It detects email addresses,
North American phone numbers written with separators, URLs, `@`-prefixed usernames, and
explicitly labeled account IDs. It processes requests in memory and does not include a
database or file persistence layer.

This baseline is not sufficient for production or real contributor data. Names, schools,
free-form addresses, indirect identifiers, and context-dependent identifiers require a
layered detector and expert privacy review.

### Run locally

```bash
python -m venv .venv
source .venv/bin/activate
pip install -e '.[dev]'
uvicorn makeaivisible_anonymizer.main:app --reload
```

Open `http://127.0.0.1:8000/docs` for the generated API documentation, or submit the
synthetic example directly:

```bash
curl -s http://127.0.0.1:8000/anonymize \
-H 'content-type: application/json' \
--data @examples/request.json
```

### Contract

`POST /anonymize` accepts a conversation identifier and a non-empty list of messages.
The response contains:

- `schema_version`: version of the anonymized record contract.
- `conversation_id`: caller-provided identifier; callers must not use a personal identifier.
- `messages`: roles and redacted message content.
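- `redaction_report`: total and per-type counts plus, for each redaction, the message index, the character span in the original message, and the placeholder used; the original sensitive values are never included.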

The generated OpenAPI schema is the canonical machine-readable contract for this MVP.
13 changes: 13 additions & 0 deletions examples/request.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"conversation_id": "synthetic-demo-1",
"messages": [
{
"role": "user",
"content": "My email is maya@example.org and my username is @maya_student."
},
{
"role": "assistant",
"content": "I will not contact you outside this conversation."
}
]
}
32 changes: 32 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "makeaivisible-anonymizer"
version = "0.1.0"
description = "Privacy-preserving conversation anonymization service"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.115,<1",
"pydantic>=2.10,<3",
"uvicorn[standard]>=0.34,<1",
]

[project.optional-dependencies]
dev = [
"httpx>=0.28,<1",
"pytest>=8.3,<9",
"ruff>=0.11,<1",
]

[tool.hatch.build.targets.wheel]
packages = ["src/makeaivisible_anonymizer"]

[tool.pytest.ini_options]
testpaths = ["tests"]

[tool.ruff]
line-length = 100
target-version = "py311"
3 changes: 3 additions & 0 deletions src/makeaivisible_anonymizer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Make AI Visible anonymization service."""

# Package version: passed to the FastAPI app metadata and returned by GET /health.
# Keep in sync with `version` in pyproject.toml.
__version__ = "0.1.0"
46 changes: 46 additions & 0 deletions src/makeaivisible_anonymizer/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from collections import defaultdict

from fastapi import FastAPI

from . import __version__
from .models import (
AnonymizeRequest,
AnonymizeResponse,
ConversationMessage,
RedactionSummary,
)
from .redaction import count_by_type, redact_text

# ASGI application object; the README starts it with
# `uvicorn makeaivisible_anonymizer.main:app`.
app = FastAPI(
    title="Make AI Visible Anonymization Service",
    description="Deterministic baseline for removing direct identifiers from conversations.",
    version=__version__,
)


@app.get("/health")
def health() -> dict[str, str]:
    """Report a fixed "ok" status together with the package version."""
    payload: dict[str, str] = {"status": "ok"}
    payload["version"] = __version__
    return payload


@app.post("/anonymize", response_model=AnonymizeResponse)
def anonymize(request: AnonymizeRequest) -> AnonymizeResponse:
    """Redact direct identifiers from every message of one conversation.

    Messages are processed in request order and share one set of per-entity-type
    counters, so placeholder numbering (``[EMAIL_1]``, ``[EMAIL_2]``, ...) continues
    across messages instead of restarting for each message.

    Args:
        request: Validated conversation identifier plus a non-empty list of messages.

    Returns:
        The redacted messages, the caller-provided conversation identifier echoed
        unchanged, and a report with the total, the per-type counts, and one entry
        per redaction.
    """
    counters: defaultdict[str, int] = defaultdict(int)
    messages: list[ConversationMessage] = []
    redactions = []

    for index, message in enumerate(request.messages):
        content, message_redactions = redact_text(message.content, index, counters)
        # A placeholder such as "[USERNAME_12]" can be longer than the text it replaces
        # (e.g. "@ab"), so redacted content may exceed the 50,000-character limit that
        # ConversationMessage enforces on *input*. Re-validating the output against that
        # limit would turn a valid request into an unhandled ValidationError (HTTP 500).
        # The role was validated on the way in and the content comes from redact_text,
        # so build the output message without re-running the input validators.
        messages.append(ConversationMessage.model_construct(role=message.role, content=content))
        redactions.extend(message_redactions)

    return AnonymizeResponse(
        schema_version="0.1.0",
        conversation_id=request.conversation_id,
        messages=messages,
        redaction_report=RedactionSummary(
            total=len(redactions),
            by_type=count_by_type(redactions),
            redactions=redactions,
        ),
    )
51 changes: 51 additions & 0 deletions src/makeaivisible_anonymizer/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from enum import StrEnum

from pydantic import BaseModel, ConfigDict, Field, field_validator


# Speaker roles accepted for a conversation message; the values are the strings
# clients send in the "role" field.
class Role(StrEnum):
    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"


# One conversation turn: who spoke and what was said.
class ConversationMessage(BaseModel):
    # Unknown fields are rejected instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    role: Role
    # Between 1 and 50,000 characters; whitespace-only text is refused by the validator below.
    content: str = Field(min_length=1, max_length=50_000)

    @field_validator("content")
    @classmethod
    def content_must_not_be_blank(cls, value: str) -> str:
        """Return ``value`` unchanged unless it consists only of whitespace."""
        if value.strip():
            return value
        raise ValueError("content must not be blank")


# Request body of POST /anonymize.
class AnonymizeRequest(BaseModel):
    # Unknown fields are rejected instead of being silently dropped.
    model_config = ConfigDict(extra="forbid")

    # Caller-chosen identifier, 1 to 128 characters.
    conversation_id: str = Field(min_length=1, max_length=128)
    # Between 1 and 1,000 messages per request.
    messages: list[ConversationMessage] = Field(min_length=1, max_length=1_000)


# One replaced span. The original text of the span is deliberately not stored.
class Redaction(BaseModel):
    # Position of the message within the request's `messages` list.
    message_index: int
    # Detector label such as "EMAIL" or "URL".
    entity_type: str
    # Character offsets into the original (pre-redaction) message content;
    # `end` is exclusive, as returned by the regex match.
    start: int
    end: int
    # Placeholder written in place of the span, e.g. "[EMAIL_1]".
    replacement: str


# Aggregate view of every redaction made while handling one request.
class RedactionSummary(BaseModel):
    # Number of entries in `redactions`.
    total: int
    # Redaction count per entity type.
    by_type: dict[str, int]
    # One entry per replaced span.
    redactions: list[Redaction]


# Response body of POST /anonymize.
class AnonymizeResponse(BaseModel):
    # Version of the anonymized record contract.
    schema_version: str
    # Identifier supplied by the caller in the request.
    conversation_id: str
    # Messages with detected identifiers replaced by placeholders.
    messages: list[ConversationMessage]
    redaction_report: RedactionSummary
82 changes: 82 additions & 0 deletions src/makeaivisible_anonymizer/redaction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import re
from collections import Counter, defaultdict
from dataclasses import dataclass

from .models import Redaction


@dataclass(frozen=True)
class Detector:
    """Pair an entity-type label with the compiled pattern that finds it."""

    # Label used in placeholders and in the redaction report, e.g. "EMAIL".
    entity_type: str
    # Compiled regular expression; each match becomes one redaction candidate.
    pattern: re.Pattern[str]


# Every detector is run over the full message text; overlapping hits are resolved later.
DETECTORS = (
    # local-part "@" domain, where the domain has at least one dot. The lookbehind stops a
    # match from starting in the middle of a longer run of local-part characters.
    Detector("EMAIL", re.compile(r"(?<![\w.+-])[\w.+-]+@[\w-]+(?:\.[\w-]+)+", re.I)),
    # Starts at "http://", "https://" or "www." and runs to the next whitespace, "<" or ">",
    # so punctuation attached to the end of a URL is part of the match.
    Detector("URL", re.compile(r"\b(?:https?://|www\.)[^\s<>]+", re.I)),
    # Optional "+1"/"1" prefix, then 3-3-4 digit groups (first group optionally in
    # parentheses). A space, "." or "-" is required between the groups, so an unbroken
    # run of ten digits does not match.
    Detector(
        "PHONE",
        re.compile(r"(?<!\w)(?:\+?1[ .-]?)?(?:\(\d{3}\)|\d{3})[ .-]\d{3}[ .-]\d{4}(?!\w)"),
    ),
    # "@" followed by 2 to 32 ASCII letters, digits or underscores, not preceded by a word
    # character or another "@".
    Detector("USERNAME", re.compile(r"(?<![\w@])@[A-Za-z0-9_]{2,32}\b")),
    # "account", "user" or "member", an optional space/underscore/hyphen, "id", one of
    # ":", "=" or "#", then a 4 to 64 character value. The match covers label and value.
    Detector(
        "ACCOUNT_ID",
        re.compile(r"\b(?:account|user|member)[ _-]?id\s*[:=#]\s*[A-Za-z0-9_-]{4,64}\b", re.I),
    ),
)


@dataclass(frozen=True)
class Match:
    """A detector hit: the entity type and the span it covers in the scanned text."""

    entity_type: str
    # Offsets taken from the regex match; `end` is exclusive.
    start: int
    end: int


def _matches(text: str) -> list[Match]:
    """Return the non-overlapping detector hits in ``text``, ordered by start offset.

    Every detector is run over the whole text. When hits overlap, the one that starts
    first is kept; among hits that start at the same offset, the longest is kept.

    Args:
        text: Message content to scan.

    Returns:
        The accepted hits in ascending start order, with no two spans overlapping.
    """
    candidates = [
        Match(detector.entity_type, match.start(), match.end())
        for detector in DETECTORS
        for match in detector.pattern.finditer(text)
    ]
    candidates.sort(key=lambda item: (item.start, -(item.end - item.start)))

    accepted: list[Match] = []
    for candidate in candidates:
        # Candidates arrive in start order and accepted spans never overlap, so the most
        # recently accepted span has the greatest end offset. A non-empty candidate
        # overlaps some accepted span exactly when it starts before that end, so checking
        # only the last span gives the same result as rescanning all of them while
        # keeping this pass linear in the number of candidates.
        if accepted and candidate.start < accepted[-1].end:
            continue
        accepted.append(candidate)
    return accepted


def redact_text(
    text: str,
    message_index: int,
    counters: defaultdict[str, int],
) -> tuple[str, list[Redaction]]:
    """Swap every detected identifier in ``text`` for a numbered placeholder.

    Args:
        text: Original message content.
        message_index: Index recorded on each resulting ``Redaction``.
        counters: Running count per entity type. It is updated in place, so passing the
            same mapping for several messages keeps the numbering going across them.

    Returns:
        The redacted text and one ``Redaction`` per replaced span. Span offsets refer
        to ``text`` as passed in, not to the redacted result.
    """
    pieces: list[str] = []
    report: list[Redaction] = []
    position = 0

    for hit in _matches(text):
        label = hit.entity_type
        counters[label] += 1
        placeholder = f"[{label}_{counters[label]}]"

        # Copy the untouched text before the hit, then the placeholder in its place.
        pieces.append(text[position : hit.start])
        pieces.append(placeholder)
        report.append(
            Redaction(
                message_index=message_index,
                entity_type=label,
                start=hit.start,
                end=hit.end,
                replacement=placeholder,
            )
        )
        position = hit.end

    # Whatever follows the final hit (or the whole text when nothing matched).
    pieces.append(text[position:])
    return "".join(pieces), report


def count_by_type(redactions: list[Redaction]) -> dict[str, int]:
    """Tally redactions per entity type, with keys in sorted order."""
    tally = Counter(item.entity_type for item in redactions)
    return {entity_type: tally[entity_type] for entity_type in sorted(tally)}
86 changes: 86 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from fastapi.testclient import TestClient

from makeaivisible_anonymizer.main import app


# Module-level test client shared by every test in this file.
client = TestClient(app)


def test_health() -> None:
    """GET /health answers 200 with the status flag and the package version."""
    reply = client.get("/health")
    expected_body = {"status": "ok", "version": "0.1.0"}

    assert reply.status_code == 200
    assert reply.json() == expected_body


def test_anonymizes_synthetic_conversation_without_echoing_values() -> None:
    """One value per supported entity type is redacted and never echoed in the response."""
    sensitive_values = (
        "maya@example.org",
        "+1 604-555-0199",
        "https://example.org/profile/maya",
        "@maya_student",
        "user_id: abc_12345",
    )
    email, *others = sensitive_values
    request_body = {
        "conversation_id": "synthetic-demo-1",
        "messages": [
            {"role": "user", "content": f"Email me at {email}."},
            {"role": "assistant", "content": "Contact " + ", ".join(others)},
        ],
    }

    response = client.post("/anonymize", json=request_body)

    assert response.status_code == 200
    payload = response.json()
    report = payload["redaction_report"]
    assert payload["schema_version"] == "0.1.0"
    assert report["total"] == 5
    assert report["by_type"] == {
        "ACCOUNT_ID": 1,
        "EMAIL": 1,
        "PHONE": 1,
        "URL": 1,
        "USERNAME": 1,
    }
    # Check the raw body so a leak anywhere in the response (messages or report) is caught.
    raw_body = response.text
    leaked = [value for value in sensitive_values if value in raw_body]
    assert not leaked


def test_replacements_are_stable_within_a_request() -> None:
    """Placeholder numbering carries on across the messages of a single request."""
    conversation = [
        {"role": "user", "content": "First: a@example.org"},
        {"role": "user", "content": "Second: b@example.org"},
    ]

    response = client.post(
        "/anonymize",
        json={"conversation_id": "synthetic-demo-2", "messages": conversation},
    )

    assert response.status_code == 200
    redacted = response.json()["messages"]
    first, second = redacted[0], redacted[1]
    assert first["content"] == "First: [EMAIL_1]"
    assert second["content"] == "Second: [EMAIL_2]"


def test_rejects_blank_messages_and_unknown_fields() -> None:
    """Whitespace-only content and undeclared message fields are both answered with 422."""
    invalid_messages = {
        "blank": {"role": "user", "content": " "},
        "unknown": {"role": "user", "content": "hello", "raw_name": "not allowed"},
    }

    statuses = {}
    for label, message in invalid_messages.items():
        reply = client.post(
            "/anonymize",
            json={"conversation_id": "bad", "messages": [message]},
        )
        statuses[label] = reply.status_code

    assert statuses["blank"] == 422
    assert statuses["unknown"] == 422
Loading
Loading