Skip to content
158 changes: 156 additions & 2 deletions hospexplorer/ask/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@
from django.shortcuts import render
from django.urls import path, reverse

from ask.models import Conversation, TermsAcceptance, QARecord, SimWorkflow, WebsiteResource, PDFResource
from ask.models import (
Conversation,
TermsAcceptance,
QARecord,
SimWorkflow,
WebsiteResource,
PDFResource,
DocumentType,
DocumentAuthorInstitution,
InstitutionType,
)
from ask.admin_csv import import_names_csv, validate_partial_date
from ask.kb_connector import delete_kb_document
from ask.tasks import run_kb_resource_upload

Expand Down Expand Up @@ -205,16 +216,98 @@ def delete_queryset(self, request, queryset):
return


class LookupCSVImportMixin:
    """Add an "Import CSV" upload view to a lookup ``ModelAdmin``.

    The mixin swaps in a custom change-list template and registers an
    ``import-csv/`` admin URL named ``<app_label>_<model_name>_import_csv``.

    The uploaded CSV is a single column of names. Parsing, the optional
    header row and duplicate skipping are all handled by
    ``import_names_csv``.
    """

    change_list_template = "admin/ask/lookup_change_list.html"

    def _lookup_url_name(self, suffix):
        """Return this model's admin URL name for ``suffix`` (no namespace)."""
        opts = self.model._meta
        return f"{opts.app_label}_{opts.model_name}_{suffix}"

    def get_urls(self):
        """Return the admin URL patterns with the CSV import route first."""
        import_route = path(
            "import-csv/",
            # admin_view applies the admin site's own access check only; the
            # model-level permission is enforced inside import_csv_view.
            self.admin_site.admin_view(self.import_csv_view),
            name=self._lookup_url_name("import_csv"),
        )
        # The custom route is listed ahead of the stock admin patterns so it
        # is matched first.
        return [import_route] + super().get_urls()

    def import_csv_view(self, request):
        """Render the upload form (GET) or import the uploaded CSV (POST).

        A successful import adds a summary message and redirects to the
        change list. A missing file, a filename without a ``.csv`` extension
        or a failed import adds an error message and re-renders the form.

        Raises:
            PermissionDenied: if the user lacks add permission on the model.
        """
        # Local import so this view does not rely on the module import block.
        from django.core.exceptions import PermissionDenied

        # Importing creates rows, so it needs the same permission as "Add".
        if not self.has_add_permission(request):
            raise PermissionDenied

        changelist_url = reverse(
            f"admin:{self._lookup_url_name('changelist')}",
            current_app=self.admin_site.name,
        )
        plural_name = self.model._meta.verbose_name_plural

        if request.method == "POST":
            upload = request.FILES.get("csv_file")
            if upload is None:
                self.message_user(request, "No file provided.", level="error")
            elif not upload.name.lower().endswith(".csv"):
                self.message_user(
                    request, "File must have a .csv extension.", level="error"
                )
            else:
                try:
                    created, skipped = import_names_csv(self.model, upload)
                except Exception as exc:
                    # Boundary handler: log the traceback and show the admin
                    # user a message instead of a 500 page.
                    logger.exception(
                        "CSV import failed for %s", self.model.__name__
                    )
                    self.message_user(
                        request, f"Import failed: {exc}", level="error"
                    )
                else:
                    self.message_user(
                        request,
                        f"Imported {created} new {plural_name} "
                        f"(skipped {skipped} duplicate or empty rows).",
                    )
                    return HttpResponseRedirect(changelist_url)

        # GET, or a POST that reported an error: show the upload form.
        context = {
            **self.admin_site.each_context(request),
            "title": f"Import {plural_name} from CSV",
            "opts": self.model._meta,
            "changelist_url": changelist_url,
        }
        return render(request, "admin/ask/lookup_csv_import.html", context)


@admin.register(DocumentType)
class DocumentTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for ``DocumentType`` rows, listed and searchable by name.

    The CSV import view is inherited from ``LookupCSVImportMixin``.
    """

    list_display = ("name",)
    search_fields = ("name",)


@admin.register(DocumentAuthorInstitution)
class DocumentAuthorInstitutionAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for ``DocumentAuthorInstitution`` rows, listed and searchable by name.

    The CSV import view is inherited from ``LookupCSVImportMixin``.
    """

    list_display = ("name",)
    search_fields = ("name",)


@admin.register(InstitutionType)
class InstitutionTypeAdmin(LookupCSVImportMixin, admin.ModelAdmin):
    """Admin for ``InstitutionType`` rows, listed and searchable by name.

    The CSV import view is inherited from ``LookupCSVImportMixin``.
    """

    list_display = ("name",)
    search_fields = ("name",)


@admin.register(WebsiteResource)
class WebsiteResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
list_display = ("title", "url", "creator", "status", "modified_at")
list_filter = ("status",)
search_fields = ("title", "url")
readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
fieldsets = (
(None, {"fields": ("title", "description", "url")}),
("Metadata", {"fields": (
"date_published",
"document_type", "document_author_institution", "institution_type",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
"created_at", "modified_at", "creator", "modifier",
)}),
)
help_texts = {
"title": "A short name to identify this website resource.",
"description": "Optional details about what this website covers.",
"url": "The URL the LLM will use as context when answering questions.",
"date_published": "Partial ISO date: YYYY, YYYY-MM, or YYYY-MM-DD. Leave blank if unknown.",
}

def get_form(self, request, obj=None, **kwargs):
Expand Down Expand Up @@ -248,16 +341,70 @@ def save_model(self, request, obj, form, change):
)


# Optional lookup-backed metadata columns of the zip-CSV import, mapped to the
# lookup model that stores their values. Each key is used both as the CSV
# column name and as the attribute set on the imported resource.
# A value creates its lookup row the first time it appears, so the available
# options grow from what the imports actually use.
ZIP_CSV_LOOKUP_COLUMNS = {
    "document_type": DocumentType,
    "document_author_institution": DocumentAuthorInstitution,
    "institution_type": InstitutionType,
}


def _apply_zip_csv_metadata(obj, row):
    """Copy the optional metadata columns of one zip-CSV row onto ``obj``.

    ``date_published`` is validated as a partial ISO date. Each column in
    ``ZIP_CSV_LOOKUP_COLUMNS`` is resolved to a lookup row, created on first
    use, and assigned to the attribute of the same name.

    A value that cannot be applied never aborts the row: the attribute is
    simply not assigned and a human-readable warning is collected instead.

    Returns:
        The list of warning strings (empty when everything applied cleanly).
    """

    def _cell(column):
        # Missing columns and None cells both read as an empty string.
        return (row.get(column) or "").strip()

    problems = []

    raw_date = _cell("date_published")
    if raw_date:
        try:
            obj.date_published = validate_partial_date(raw_date)
        except ValueError:
            problems.append(
                f"invalid date_published '{raw_date}' "
                "(use YYYY, YYYY-MM or YYYY-MM-DD); left blank"
            )

    for column, lookup_model in ZIP_CSV_LOOKUP_COLUMNS.items():
        text = _cell(column)
        if not text:
            continue
        if len(text) <= 255:
            instance, _created = lookup_model.objects.get_or_create(name=text)
            setattr(obj, column, instance)
        else:
            # Over-long values are reported rather than sent to the database.
            problems.append(f"{column} value exceeds 255 characters; left blank")

    return problems


@admin.register(PDFResource)
class PDFResourceAdmin(KBDeleteAdminMixin, admin.ModelAdmin):
list_display = ("title", "file", "creator", "status", "modified_at")
list_filter = ("status",)
search_fields = ("title",)
readonly_fields = ("created_at", "modified_at", "creator", "modifier", "mcp_kb_document_id", "status", "status_message")
fieldsets = (
(None, {"fields": ("title", "description", "file")}),
("Metadata", {"fields": (
"date_published",
"document_type", "document_author_institution", "institution_type",
)}),
("Status", {"fields": (
"status", "status_message", "mcp_kb_document_id",
"created_at", "modified_at", "creator", "modifier",
)}),
)
help_texts = {
"title": "A short name to identify this PDF resource.",
"description": "Optional details about what this PDF covers.",
"file": "The PDF file the LLM will use as context when answering questions.",
"date_published": "Partial ISO date: YYYY, YYYY-MM, or YYYY-MM-DD. Leave blank if unknown.",
}

# Column names the bulk-import CSV must define (first = zip member, second = resource title)
Expand Down Expand Up @@ -356,7 +503,12 @@ def _is_real(name):

csv_text = archive.read(csv_names[0]).decode("utf-8-sig")
reader = csv.DictReader(io.StringIO(csv_text))
csv_columns = {(name or "").strip() for name in (reader.fieldnames or [])}
# strip header names so the column check and per-row lookups use
# the same keys; otherwise a header like "filename, title" leaves
# stray spaces and every row reads as missing its required fields
if reader.fieldnames:
reader.fieldnames = [(name or "").strip() for name in reader.fieldnames]
csv_columns = set(reader.fieldnames or [])
if not required_columns.issubset(csv_columns):
missing = ", ".join(sorted(required_columns - csv_columns))
messages.error(request, f"CSV is missing required columns: {missing}.")
Expand Down Expand Up @@ -411,6 +563,8 @@ def _is_real(name):
status=PDFResource.Status.PROCESSING,
status_message="Queued for Knowledge Base upload.",
)
for warning in _apply_zip_csv_metadata(obj, row):
messages.warning(request, f"Row {total}: {warning}")
obj.file.save(basename, ContentFile(pdf_bytes), save=True)
saved += 1
existing_pdfs.add((basename, title))
Expand Down
49 changes: 49 additions & 0 deletions hospexplorer/ask/admin_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import csv
import datetime
import io


def validate_partial_date(value):
    """Check that ``value`` is a partial ISO 8601 date and return it stripped.

    Accepts exactly ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD`` (ASCII digits,
    zero-padded). The shape is checked explicitly; calendar correctness is
    then delegated to ``datetime.date.fromisoformat`` by padding the missing
    components with ``01``.

    Args:
        value: Raw text to validate; ``None`` is treated as empty.

    Returns:
        The stripped string, or ``""`` for empty / whitespace-only input.

    Raises:
        ValueError: if the value does not have one of the accepted shapes or
            does not name a real calendar date.
    """
    text = (value or "").strip()
    if not text:
        return ""

    parts = text.split("-")
    widths = [len(part) for part in parts]
    # Length alone is not enough: newer Python versions let fromisoformat
    # parse other ISO 8601 spellings too (e.g. the week date "2024-W01-1"),
    # so the component widths and digits are verified here first.
    well_formed = widths == [4, 2, 2][: len(parts)] and all(
        part.isascii() and part.isdigit() for part in parts
    )
    if not well_formed:
        raise ValueError(f"not a partial ISO date: {value!r}")

    # Pad to a full date so month/day ranges are checked by the stdlib.
    padded = "-".join(parts + ["01"] * (3 - len(parts)))
    datetime.date.fromisoformat(padded)
    return text


def import_names_csv(model, file_obj):
    """Import a one-column CSV into a model with a ``name`` field.

    Only the first column of each row is read; surrounding whitespace is
    stripped. A header is recognised only as the first non-blank row, when it
    reads ``name`` (case-insensitive), so a later data value that happens to
    be "Name" is still imported.

    Args:
        model: Class whose ``objects.get_or_create(name=...)`` stores a row.
        file_obj: Binary file-like object; ``read()`` must return bytes.

    Returns:
        ``(created, skipped)``. Blank rows, the header row, and rows whose
        name already exists in the table are all counted as skipped.
    """
    # utf-8-sig drops a leading BOM; undecodable bytes are replaced rather
    # than aborting the whole import.
    text = file_obj.read().decode("utf-8-sig", errors="replace")

    created = 0
    skipped = 0
    header_possible = True
    for row in csv.reader(io.StringIO(text)):
        name = row[0].strip() if row else ""
        if not name:
            skipped += 1
            continue
        if header_possible:
            # Only the first non-blank row may be the header.
            header_possible = False
            if name.lower() == "name":
                skipped += 1
                continue
        _, was_created = model.objects.get_or_create(name=name)
        if was_created:
            created += 1
        else:
            skipped += 1
    return created, skipped
17 changes: 11 additions & 6 deletions hospexplorer/ask/kb_connector.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import logging
import time

Expand Down Expand Up @@ -31,22 +32,24 @@ def list_kb_documents(page=1, page_size=10):
return response.json()


def add_website_to_kb(url):
def add_website_to_kb(url, metadata=None):
"""Send a website URL to the MCP KB server for ingestion.

Calls POST /docs/website/add?url={url} on the MCP KB server.
The KB server fetches the page, chunks it, generates embeddings,
and stores it for semantic search.
``metadata`` (if provided) is sent as a JSON body ``{"metadata": ...}`` so
the KB server can store it on the Document row.
"""
headers = {
"Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
"Content-Type": "application/json",
}
endpoint = f"{settings.KB_MCP_HOST}/docs/website/add"

with httpx.Client() as client:
response = client.post(
endpoint,
params={"url": url},
json={"metadata": metadata} if metadata is not None else {},
headers=headers,
timeout=settings.KB_MCP_TIMEOUT,
)
Expand All @@ -55,12 +58,12 @@ def add_website_to_kb(url):
return response.json()


def add_pdf_to_kb(file_bytes, filename, title, url=None):
def add_pdf_to_kb(file_bytes, filename, title, url=None, metadata=None):
"""Upload a PDF to the MCP KB server for ingestion.

Calls POST /docs/pdf/add on the MCP KB server with multipart form data.
The KB server extracts text, chunks it, generates embeddings,
and stores it for semantic search.
metadata (if provided) is JSON-encoded into a metadata form field so
it can travel alongside the file.
"""
headers = {
"Authorization": f"Bearer {settings.KB_MCP_JWT_TOKEN}",
Expand All @@ -70,6 +73,8 @@ def add_pdf_to_kb(file_bytes, filename, title, url=None):
data = {"title": title}
if url:
data["url"] = url
if metadata is not None:
data["metadata"] = json.dumps(metadata)

# Only retry on transport errors (the request never completed) — a timeout
# likely means the KB received the file and is still processing it, so
Expand Down
Loading