From e3db1fc377370c14edebf2f9b30659381809a911 Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:35:25 -0400
Subject: [PATCH 1/8] build(pre-commit): add max-file-size ratchet hook (700
 LOC cap)

New scripts/codegen/check_file_size.py enforces a per-file line ceiling
over src/ and scripts/ so module splits don't silently regress into
monoliths. Existing oversize modules are grandfathered in the hook's
exclude list until they are split.
---
 .pre-commit-config.yaml            | 17 +++++++++
 scripts/codegen/check_file_size.py | 55 ++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100755 scripts/codegen/check_file_size.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e47d6e0..a753356 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -113,6 +113,23 @@ repos:
         language: system
         files: ^benchmarks/.*\.go$
         types: [go]
+      - id: max-file-size
+        name: Enforce 700 LOC max per source file (ratchet)
+        entry: python3 scripts/codegen/check_file_size.py --max 700
+        language: system
+        files: ^(src|scripts)/.*\.py$
+        # Ratchet: modules already over the cap are grandfathered until
+        # they get split. Do NOT add new entries here — split the module.
+        exclude: |
+          (?x)^(
+              src/ultralytics_patches/|
+              src/clients/opensearch\.py|
+              src/services/visual_search\.py|
+              src/services/face_identity\.py|
+              src/services/duplicate_detection\.py|
+              src/routers/ingest\.py|
+              src/routers/search\.py
+          )
 
   # Commit message linting (conventional commits)
   # Note: Disabled for now - requires separate hook installation with:
diff --git a/scripts/codegen/check_file_size.py b/scripts/codegen/check_file_size.py
new file mode 100755
index 0000000..fd16354
--- /dev/null
+++ b/scripts/codegen/check_file_size.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""Pre-commit regression guard: cap per-file line count.
+
+Prevents source files from regrowing past a threshold so focused-module
+splits don't silently revert. Run by the ``max-file-size`` pre-commit
+hook over ``src/`` and ``scripts/``.
+
+The threshold is set by ``--max`` (default 700). Files listed in the
+hook's ``exclude`` regex are not passed in at all; this script's only
+job is the line-count check.
+
+Exit code 1 if any input file exceeds the cap; the violating files +
+sizes are written to stderr.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def line_count(path: Path) -> int:
+    try:
+        return sum(1 for _ in path.open('rb'))
+    except OSError:
+        return 0
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--max', type=int, default=700, help='Max LOC per file.')
+    parser.add_argument('files', nargs='*', type=Path)
+    args = parser.parse_args()
+
+    violations: list[tuple[Path, int]] = []
+    for p in args.files:
+        n = line_count(p)
+        if n > args.max:
+            violations.append((p, n))
+
+    if violations:
+        sys.stderr.write(
+            f'ERROR: {len(violations)} file(s) exceed the {args.max} LOC ceiling.\n'
+            'Split into focused sub-modules instead of growing monoliths.\n'
+            'Violations:\n'
+        )
+        for p, n in violations:
+            sys.stderr.write(f'  {p.as_posix()}: {n} lines (max {args.max})\n')
+        return 1
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())

From 1117ab839989a1e1adbe1c5071a7aed2fe8f4f0a Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:37:35 -0400
Subject: [PATCH 2/8] refactor(logging): move request-id contextvar to
 core.logging; fix foreign_pre_chain formatter bug

- request_id_ctx / get_request_id now live in src.core.logging (with new
  bind_request_id / clear_request_id helpers) so service modules and
  out-of-process workers can import them without pulling in the FastAPI
  app; src.main re-exports for backward compatibility.
- merge_contextvars added first in the processor chain so request_id
  auto-attaches to every structlog event on the task.
- ProcessorFormatter.wrap_for_formatter removed from foreign_pre_chain:
  it wraps the event dict in a tuple and is only valid as the LAST
  structlog-native step; in the pre-chain it crashed stdlib log records
  (uvicorn, opensearch-py) with 'tuple' object does not support item
  deletion.
---
 src/core/logging.py | 66 ++++++++++++++++++++++++++++++++++++---------
 src/main.py         | 30 ++++++++++-----------
 2 files changed, 68 insertions(+), 28 deletions(-)

diff --git a/src/core/logging.py b/src/core/logging.py
index 82074a6..d28de05 100644
--- a/src/core/logging.py
+++ b/src/core/logging.py
@@ -10,11 +10,46 @@
 
 import logging
 import sys
+from contextvars import ContextVar
 from typing import Any
 
 import structlog
 
 
+# Request-scoped correlation ID. Set by the HTTP middleware in
+# ``src.main`` (FastAPI) and by worker processes per task. Lives here
+# (not in src.main) so service-layer modules can import it without
+# pulling the FastAPI app at import time — background workers run
+# outside the FastAPI process and would otherwise be unable to bind
+# the ID.
+request_id_ctx: ContextVar[str] = ContextVar('request_id', default='-')
+
+
+def get_request_id() -> str:
+    """Return the current request ID, or ``'-'`` when unbound."""
+    return request_id_ctx.get()
+
+
+def bind_request_id(request_id: str) -> None:
+    """Bind ``request_id`` to both the ContextVar and structlog contextvars.
+
+    Both bindings are needed: the ContextVar drives :func:`get_request_id`
+    callers, and the structlog contextvars binding makes every subsequent
+    ``logger.info(...)`` call on this asyncio task auto-attach
+    ``request_id=<id>`` to its event dict (via the
+    :func:`structlog.contextvars.merge_contextvars` processor wired into
+    the chain below).
+    """
+    request_id_ctx.set(request_id)
+    structlog.contextvars.bind_contextvars(request_id=request_id)
+
+
+def clear_request_id() -> None:
+    """Reset both ContextVar + structlog contextvars to defaults."""
+    request_id_ctx.set('-')
+    structlog.contextvars.unbind_contextvars('request_id')
+
+
 def configure_logging(json_logs: bool = True, log_level: str = 'INFO') -> None:
     """
     Configure structured logging for the application.
@@ -30,25 +65,30 @@ def configure_logging(json_logs: bool = True, log_level: str = 'INFO') -> None:
         level=getattr(logging, log_level.upper()),
     )
 
-    # Processors that modify log entries
-    processors = [
-        # Add log level
+    # Shared pre-chain processors. These run for BOTH structlog calls and
+    # foreign (stdlib logging) records before the final formatter renders.
+    # IMPORTANT: do NOT include ProcessorFormatter.wrap_for_formatter here --
+    # that processor wraps event_dict in a (args, kwargs, event_dict) tuple
+    # and is only valid as the LAST step of a structlog-native chain. Putting
+    # it in foreign_pre_chain triggers `'tuple' object does not support item
+    # deletion` in ProcessorFormatter.format() when stdlib logs (uvicorn,
+    # opensearch-py, etc.) flow through.
+    shared_processors: list[Any] = [
+        # ``merge_contextvars`` MUST be first so request_id (and any
+        # other future contextvar) lands in the event dict before any
+        # other processor inspects it.
+        structlog.contextvars.merge_contextvars,
         structlog.stdlib.add_log_level,
-        # Add logger name
         structlog.stdlib.add_logger_name,
-        # Add timestamp
         structlog.processors.TimeStamper(fmt='iso'),
-        # Add stack info for exceptions
         structlog.processors.StackInfoRenderer(),
-        # Format exceptions
         structlog.processors.format_exc_info,
-        # Unwrap EventDict for stdlib logging
-        structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
     ]
 
-    # Configure structlog
+    # Configure structlog: shared processors, then the wrap_for_formatter
+    # bridge that hands control back to stdlib logging's formatter.
     structlog.configure(
-        processors=processors,
+        processors=[*shared_processors, structlog.stdlib.ProcessorFormatter.wrap_for_formatter],
         wrapper_class=structlog.stdlib.BoundLogger,
         context_class=dict,
         logger_factory=structlog.stdlib.LoggerFactory(),
@@ -59,12 +99,12 @@ def configure_logging(json_logs: bool = True, log_level: str = 'INFO') -> None:
     if json_logs:
         formatter = structlog.stdlib.ProcessorFormatter(
             processor=structlog.processors.JSONRenderer(),
-            foreign_pre_chain=processors,
+            foreign_pre_chain=shared_processors,
         )
     else:
         formatter = structlog.stdlib.ProcessorFormatter(
             processor=structlog.dev.ConsoleRenderer(colors=True),
-            foreign_pre_chain=processors,
+            foreign_pre_chain=shared_processors,
         )
 
     # Apply formatter to root logger
diff --git a/src/main.py b/src/main.py
index 2bcc851..85e1e2d 100644
--- a/src/main.py
+++ b/src/main.py
@@ -17,7 +17,6 @@
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
-from contextvars import ContextVar
 from pathlib import Path
 
 import orjson
@@ -27,7 +26,13 @@
 from src.clients.triton_pool import AsyncTritonPool
 from src.config import get_settings
 from src.core.dependencies import OpenSearchClientFactory, TritonClientFactory
-from src.core.logging import configure_logging, get_logger
+from src.core.logging import (
+    bind_request_id,
+    configure_logging,
+    get_logger,
+    get_request_id,
+    request_id_ctx,
+)
 from src.routers import (
     analyze_router,
     clusters_router,
@@ -45,17 +50,11 @@
 )
 
 
-# =============================================================================
-# Request Context (for correlation IDs)
-# =============================================================================
-
-# Context variable to store current request ID (thread-safe)
-request_id_ctx: ContextVar[str] = ContextVar('request_id', default='-')
-
-
-def get_request_id() -> str:
-    """Get the current request ID from context."""
-    return request_id_ctx.get()
+# Request correlation IDs (request_id_ctx / get_request_id) live in
+# src.core.logging so service-layer modules and out-of-process workers can
+# import them without pulling in the FastAPI app. Re-exported here for
+# backward compatibility.
+__all__ = ['get_request_id', 'request_id_ctx']
 
 
 # =============================================================================
@@ -361,11 +360,12 @@ async def request_id_middleware(request: Request, call_next):
         Add correlation ID (X-Request-ID) to all requests.
 
         If client provides X-Request-ID header, use it. Otherwise generate a new UUID.
-        The request ID is available via get_request_id() in any code path.
+        The request ID is available via get_request_id() in any code path, and
+        bind_request_id() also attaches it to every structlog event on this task.
         """
         # Get or generate request ID
         req_id = request.headers.get('X-Request-ID') or str(uuid.uuid4())[:8]
-        request_id_ctx.set(req_id)
+        bind_request_id(req_id)
 
         # Process request
         response = await call_next(request)

From 8e397988ffa49f34de56678453c0e8771f0035b4 Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:39:33 -0400
Subject: [PATCH 3/8] feat(health): split /live and /ready with real
 per-dependency probes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- /live: pure process liveness — never gated on dependencies. The
  Dockerfile HEALTHCHECK now targets it so a degraded downstream dep
  cannot mark the container unhealthy and cascade through
  depends_on: service_healthy.
- /ready: probes Triton via a real is_server_live() gRPC round-trip
  (pool active_connections is 0 until the first infer and deadlocks
  dependent containers at startup) and OpenSearch via HTTP, each
  bounded to 2s; 503 with per-service detail when any dep is down.
- /health: kept as a backward-compat alias for /ready.
---
 Dockerfile            |   6 +-
 src/routers/health.py | 178 +++++++++++++++++++++++++++++++++---------
 2 files changed, 147 insertions(+), 37 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 84d5b24..7cf66e6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -87,8 +87,12 @@ USER appuser
 
 EXPOSE 8000
 
+# /live = process liveness only. /health now probes dependencies (Triton,
+# OpenSearch) and returns 503 while any is down — gating the container's
+# health on it would mark yolo-api unhealthy (and cascade via
+# depends_on: service_healthy) whenever a *downstream* dep degrades.
 HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-    CMD curl -f http://localhost:8000/health || exit 1
+    CMD curl -f http://localhost:8000/live || exit 1
 
 # Production defaults (docker-compose.yml overrides workers, backlog, etc.)
 CMD ["uvicorn", "src.main:app", \
diff --git a/src/routers/health.py b/src/routers/health.py
index 428f2d2..7da5c24 100644
--- a/src/routers/health.py
+++ b/src/routers/health.py
@@ -1,18 +1,26 @@
 """
 Health and Monitoring Router
 
-Provides health checks, service info, and connection pool statistics.
+Provides liveness/readiness probes, service info, and connection pool
+statistics.
+
+- /live   — process responsive, no dependency probing (livenessProbe)
+- /ready  — Triton + OpenSearch actively probed (readinessProbe / LB drain)
+- /health — backward-compat alias for /ready
 """
 
+import asyncio
 import logging
 import os
 from typing import Any
 
+import httpx
 import psutil
 import torch
 from fastapi import APIRouter
+from fastapi.responses import JSONResponse
 
-from src.clients.triton_pool import get_client_pool_stats
+from src.clients.triton_pool import get_async_triton_client, get_client_pool_stats
 from src.config import get_settings
 
 
@@ -23,6 +31,98 @@
 )
 
 
+# Readiness-probe timeout. Kept short so a degraded dep does not stall
+# the probe past a typical k8s readiness-check window.
+_READY_PROBE_TIMEOUT_S = 2.0
+
+
+async def _probe_http(url: str) -> tuple[bool, str]:
+    """Issue a single GET against ``url`` and report (ok, detail).
+
+    Treated as ready on any 2xx/3xx response. Network / timeout / 5xx all
+    flip the service to not-ready; the detail string carries the reason
+    so /ready consumers can see *which* dep is down without grepping logs.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=_READY_PROBE_TIMEOUT_S) as client:
+            response = await client.get(url)
+    except httpx.TimeoutException:
+        return False, f'timeout after {_READY_PROBE_TIMEOUT_S}s'
+    except httpx.HTTPError as exc:
+        return False, f'{type(exc).__name__}: {exc}'
+    if response.status_code >= 500:
+        return False, f'HTTP {response.status_code}'
+    return True, f'HTTP {response.status_code}'
+
+
+async def _probe_triton() -> tuple[bool, str]:
+    """Probe Triton via a real ``is_server_live()`` gRPC round-trip.
+
+    Inspecting ``get_client_pool_stats().active_connections`` is not a
+    valid readiness signal: it stays 0 until the first real infer call
+    lands on the pool. Combined with ``depends_on: service_healthy`` on
+    dependent containers, that produces a startup deadlock — dependents
+    wait for yolo-api to report healthy, but nothing is calling Triton
+    through yolo-api to wake the pool, so the count never moves.
+
+    ``is_server_live()`` actively probes Triton via the async client
+    (created lazily on first call). The side effect is exactly what we
+    want for a readiness check: it both tests reachability AND warms the
+    connection so subsequent infer requests skip the
+    connection-establishment cost. Bounded by ``_READY_PROBE_TIMEOUT_S``
+    so a degraded Triton can't stall the probe past the k8s readiness
+    window.
+    """
+    settings = get_settings()
+    try:
+        client = await asyncio.wait_for(
+            get_async_triton_client(settings.triton_url),
+            timeout=_READY_PROBE_TIMEOUT_S,
+        )
+        live = await asyncio.wait_for(
+            client.is_server_live(),
+            timeout=_READY_PROBE_TIMEOUT_S,
+        )
+    except TimeoutError:
+        return False, f'is_server_live timeout after {_READY_PROBE_TIMEOUT_S}s'
+    except Exception as exc:
+        return False, f'{type(exc).__name__}: {exc}'
+    if not live:
+        return False, 'is_server_live returned False'
+    return True, 'is_server_live OK'
+
+
+async def _readiness_payload() -> tuple[int, dict[str, Any]]:
+    """Probe Triton and OpenSearch concurrently; build the payload.
+
+    Returns the HTTP status code (200 if every dep is ready, 503 otherwise)
+    and a JSON-serializable body listing per-service status + detail.
+    """
+    settings = get_settings()
+    triton_task = asyncio.create_task(_probe_triton())
+    opensearch_task = asyncio.create_task(_probe_http(settings.opensearch_url))
+
+    triton_ok, triton_detail = await triton_task
+    opensearch_ok, opensearch_detail = await opensearch_task
+
+    services = {
+        'triton': {'ok': triton_ok, 'detail': triton_detail, 'url': settings.triton_url},
+        'opensearch': {
+            'ok': opensearch_ok,
+            'detail': opensearch_detail,
+            'url': settings.opensearch_url,
+        },
+    }
+    all_ok = triton_ok and opensearch_ok
+    payload: dict[str, Any] = {
+        'status': 'ready' if all_ok else 'not_ready',
+        'version': settings.api_version,
+        'api_version': 'v1',
+        'services': services,
+    }
+    return (200 if all_ok else 503), payload
+
+
 @router.get('/')
 def root():
     """
@@ -69,53 +169,59 @@ def root():
     }
 
 
-@router.get('/health')
-def health() -> dict[str, Any]:
+@router.get('/live')
+def live() -> dict[str, str]:
+    """Liveness probe — process is responsive, no dependency probing.
+
+    Designed for k8s ``livenessProbe`` and the container HEALTHCHECK: a
+    true response here means the event loop is healthy and the process
+    should NOT be restarted. A degraded dependency (Triton / OpenSearch)
+    must NOT flap liveness; use /ready for that.
     """
-    Health check with service status and performance metrics.
+    return {'status': 'alive'}
+
+
+@router.get('/ready')
+async def ready() -> JSONResponse:
+    """Readiness probe — Triton + OpenSearch both reachable.
 
-    Returns:
-    - Service status
-    - Triton connection status
-    - OpenSearch connection status
-    - GPU memory usage
+    Returns 200 with ``status='ready'`` when every dep responds; 503
+    with per-service detail when at least one is unreachable. Designed
+    for k8s ``readinessProbe`` and for an upstream LB to drain traffic
+    from this instance during dependency degradation without killing
+    the process.
     """
-    settings = get_settings()
+    status_code, payload = await _readiness_payload()
 
-    # Process metrics
+    # Best-effort process / GPU resource snapshot — informational only;
+    # never gates readiness. Kept off /live to keep liveness probes
+    # branch-free.
+    settings = get_settings()
     process = psutil.Process(os.getpid())
     memory_info = process.memory_info()
-
-    health_data: dict[str, Any] = {
-        'status': 'healthy',
-        'version': settings.api_version,
-        'api_version': 'v1',
-        'services': {
-            'triton': {
-                'url': settings.triton_url,
-                'protocol': 'gRPC',
-                'status': 'connected',
-            },
-            'opensearch': {
-                'url': settings.opensearch_url,
-                'status': 'connected',
-            },
-        },
-        'resources': {
-            'memory_mb': round(memory_info.rss / 1024 / 1024, 2),
-            'cpu_percent': process.cpu_percent(),
-        },
+    payload['resources'] = {
+        'memory_mb': round(memory_info.rss / 1024 / 1024, 2),
+        'cpu_percent': process.cpu_percent(),
     }
-
-    # Add GPU metrics if available
     if torch.cuda.is_available():
-        health_data['resources']['gpu'] = {
+        payload['resources']['gpu'] = {
             'name': torch.cuda.get_device_name(0),
             'memory_allocated_mb': round(torch.cuda.memory_allocated() / 1024 / 1024, 2),
             'memory_reserved_mb': round(torch.cuda.memory_reserved() / 1024 / 1024, 2),
         }
+    payload.setdefault('version', settings.api_version)
+    return JSONResponse(status_code=status_code, content=payload)
 
-    return health_data
+
+@router.get('/health')
+async def health() -> JSONResponse:
+    """Backward-compat alias for :func:`ready`.
+
+    Existing dashboards, scripts, and operator runbooks call /health and
+    expect the dep-probing behavior. New callers should target /live
+    (liveness) or /ready (readiness) explicitly.
+    """
+    return await ready()
 
 
 @router.get('/connection_pool_info')

From f23cbf331a1e95813a8eb9ca119f7037cfd15223 Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:42:07 -0400
Subject: [PATCH 4/8] feat(metrics): http_request_duration histogram and
 /metrics endpoint

- src/core/metrics.py: prometheus_client Histogram labeled by method,
  route template, and status; render_metrics() supports the optional
  PROMETHEUS_MULTIPROC_DIR aggregation mode for multi-worker uvicorn.
- http_duration_middleware records every request using the matched
  route template, with a low-cardinality path fallback for 404s.
- GET /metrics exposition endpoint + prometheus scrape job for
  yolo-api.
---
 monitoring/prometheus.yml |  7 +++++++
 pyproject.toml            |  1 +
 requirements.txt          |  3 +++
 src/core/metrics.py       | 44 +++++++++++++++++++++++++++++++++++++++
 src/main.py               | 23 ++++++++++++++++++++
 src/routers/health.py     | 10 ++++++++-
 6 files changed, 87 insertions(+), 1 deletion(-)
 create mode 100644 src/core/metrics.py

diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
index 0ae2f01..f9c5ea6 100644
--- a/monitoring/prometheus.yml
+++ b/monitoring/prometheus.yml
@@ -13,6 +13,13 @@ scrape_configs:
         labels:
           group: 'triton_inference_server'
 
+  - job_name: 'yolo-api'
+    metrics_path: /metrics
+    static_configs:
+      - targets: ['yolo-api:8000']
+        labels:
+          group: 'api'
+
   - job_name: 'node'
     static_configs:
       - targets: ['node-exporter:9100']
diff --git a/pyproject.toml b/pyproject.toml
index 32e90f0..bc22bf1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
     "transformers>=4.30.0",
     "timm",
     "huggingface_hub",
+    "prometheus-client>=0.20",
 ]
 
 [project.optional-dependencies]
diff --git a/requirements.txt b/requirements.txt
index a4d4ff3..0df7831 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -49,6 +49,9 @@ pyyaml
 # Structured Logging
 structlog>=24.1.0  # Structured logging with context and JSON output
 
+# Metrics
+prometheus-client>=0.20  # /metrics exposition + HTTP latency histogram
+
 # OpenSearch for Visual Search
 opensearch-py>=2.3.0  # Async OpenSearch client with k-NN support
 transformers>=4.30.0  # CLIP tokenizer for text-to-image search
diff --git a/src/core/metrics.py b/src/core/metrics.py
new file mode 100644
index 0000000..f160afc
--- /dev/null
+++ b/src/core/metrics.py
@@ -0,0 +1,44 @@
+"""
+Prometheus metrics for the Visual AI API.
+
+Exposes HTTP-level instrumentation via the ``/metrics`` endpoint. Metric
+objects live here (not in routers) so middleware and any future worker
+code can import them without circular imports.
+
+Multi-worker note: with several uvicorn workers each process keeps its
+own registry, so a scrape only sees one worker unless
+``PROMETHEUS_MULTIPROC_DIR`` is set — in that mode prometheus_client
+aggregates across processes via files in that directory (it must exist
+and be writable before workers start).
+"""
+
+import os
+
+from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Histogram, generate_latest
+
+
+# Request latency by route template. Buckets skew low because most
+# endpoints answer in tens of milliseconds; the tail buckets catch
+# batch/ingest calls.
+HTTP_REQUEST_DURATION_SECONDS = Histogram(
+    'http_request_duration_seconds',
+    'HTTP request latency by method, route template, and status code.',
+    labelnames=('method', 'route', 'status'),
+    buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0),
+)
+
+
+def render_metrics() -> tuple[bytes, str]:
+    """Render the metrics exposition payload.
+
+    Returns ``(body, content_type)``. Uses the multiprocess collector
+    when ``PROMETHEUS_MULTIPROC_DIR`` is configured, otherwise the
+    default per-process registry.
+    """
+    if os.environ.get('PROMETHEUS_MULTIPROC_DIR'):
+        from prometheus_client import multiprocess
+
+        registry = CollectorRegistry()
+        multiprocess.MultiProcessCollector(registry)
+        return generate_latest(registry), CONTENT_TYPE_LATEST
+    return generate_latest(), CONTENT_TYPE_LATEST
diff --git a/src/main.py b/src/main.py
index 85e1e2d..b588cef 100644
--- a/src/main.py
+++ b/src/main.py
@@ -353,6 +353,29 @@ async def global_exception_handler(request: Request, exc: Exception):
             headers={'X-Request-ID': req_id},
         )
 
+    @application.middleware('http')
+    async def http_duration_middleware(request: Request, call_next):
+        """Record per-request latency into the Prometheus histogram."""
+        from src.core.metrics import HTTP_REQUEST_DURATION_SECONDS
+
+        started = time.monotonic()
+        response = await call_next(request)
+        # FastAPI populates scope['route'] once a route has matched. For
+        # 404s (no match) fall back to a low-cardinality truncation of
+        # the raw path (first 2 segments) so the label space isn't
+        # exploded by `/v1/whatever/<random-id>` misses.
+        route_obj = request.scope.get('route')
+        route_template = getattr(route_obj, 'path', None)
+        if not route_template:
+            segments = request.url.path.strip('/').split('/')
+            route_template = '/' + '/'.join(segments[:2]) if segments and segments[0] else '/'
+        HTTP_REQUEST_DURATION_SECONDS.labels(
+            method=request.method,
+            route=route_template,
+            status=str(response.status_code),
+        ).observe(time.monotonic() - started)
+        return response
+
     # Request ID Middleware (defined last, runs first in LIFO order)
     @application.middleware('http')
     async def request_id_middleware(request: Request, call_next):
diff --git a/src/routers/health.py b/src/routers/health.py
index 7da5c24..a5d2dcc 100644
--- a/src/routers/health.py
+++ b/src/routers/health.py
@@ -18,10 +18,11 @@
 import psutil
 import torch
 from fastapi import APIRouter
-from fastapi.responses import JSONResponse
+from fastapi.responses import JSONResponse, Response
 
 from src.clients.triton_pool import get_async_triton_client, get_client_pool_stats
 from src.config import get_settings
+from src.core.metrics import render_metrics
 
 
 logger = logging.getLogger(__name__)
@@ -224,6 +225,13 @@ async def health() -> JSONResponse:
     return await ready()
 
 
+@router.get('/metrics')
+def metrics() -> Response:
+    """Prometheus exposition endpoint (scraped by the monitoring stack)."""
+    body, content_type = render_metrics()
+    return Response(content=body, media_type=content_type)
+
+
 @router.get('/connection_pool_info')
 def connection_pool_info():
     """

From a5045be3d048df0c29c5cc7a9f90bf55623ba558 Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:43:21 -0400
Subject: [PATCH 5/8] perf(clients): raise gRPC message caps to 512MB; harden
 cluster-distance sort

- Triton gRPC send/receive caps 100MB -> 512MB: raw detector heads can
  emit hundreds of MB per max-batch response; 100MB rejected legitimate
  full-batch replies.
- cluster_distance sort gains missing:_last + unmapped_type:double so
  paging a cluster tolerates docs without the field and freshly created
  indices without the mapping (previously a 400 shard failure).
---
 src/clients/opensearch.py  | 10 +++++++++-
 src/clients/triton_pool.py | 12 ++++++++----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/clients/opensearch.py b/src/clients/opensearch.py
index 98c9e3b..97ce9a6 100644
--- a/src/clients/opensearch.py
+++ b/src/clients/opensearch.py
@@ -1790,7 +1790,15 @@ async def get_cluster_members(
             }
 
             if sort_by_distance:
-                query['sort'] = [{'cluster_distance': 'asc'}]
+                query['sort'] = [
+                    {
+                        'cluster_distance': {
+                            'order': 'asc',
+                            'missing': '_last',
+                            'unmapped_type': 'double',
+                        }
+                    }
+                ]
 
             response = await self.client.search(index=index_name.value, body=query)
 
diff --git a/src/clients/triton_pool.py b/src/clients/triton_pool.py
index 28c6419..e164f36 100644
--- a/src/clients/triton_pool.py
+++ b/src/clients/triton_pool.py
@@ -54,10 +54,14 @@
     ('grpc.keepalive_time_ms', 30000),  # Send keepalive ping every 30s
     ('grpc.keepalive_timeout_ms', 10000),  # Wait 10s for keepalive response
     ('grpc.keepalive_permit_without_calls', 1),  # Allow keepalive when idle
-    # Handle large tensors (embeddings, image batches)
-    # 100MB limit handles batch=128 x 512-dim embeddings + overhead
-    ('grpc.max_send_message_length', 100 * 1024 * 1024),  # 100MB
-    ('grpc.max_receive_message_length', 100 * 1024 * 1024),  # 100MB
+    # Handle large tensors (embeddings, image batches, raw detector heads).
+    # The ceiling is dominated by *output* size, not input: a raw
+    # (non-end2end) detector head can emit hundreds of MB for a full
+    # max-batch chunk (e.g. (N, ~100k anchors, 85) FP32). 512 MB fits a
+    # full max_batch response with comfortable headroom; client-side
+    # chunking already keeps inputs well under this.
+    ('grpc.max_send_message_length', 512 * 1024 * 1024),  # 512MB
+    ('grpc.max_receive_message_length', 512 * 1024 * 1024),  # 512MB
     # Connection pooling - allow many concurrent streams per connection
     ('grpc.max_concurrent_streams', 1000),
     # HTTP/2 flow control optimization - relaxed for high-concurrency batch processing

From ea2715e215b87fd348e777b5b93cdfbc6e50b7f2 Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:44:19 -0400
Subject: [PATCH 6/8] perf(triton): raise batch queue delay to 25ms; 3 YOLO
 instances

Under sustained ingest the per-image arrival rate is too slow to fill
preferred-size batches within 5ms, so Triton fired near batch=1. 25ms
reaches batch 8-16 within latency budget; a third YOLO instance
(~1.5 GB) pipelines concurrent ingest batches through more GPU streams.
---
 models/scrfd_10g_bnkps/config.pbtxt           |  4 +++-
 models/yolov11_small_trt_end2end/config.pbtxt | 10 ++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/models/scrfd_10g_bnkps/config.pbtxt b/models/scrfd_10g_bnkps/config.pbtxt
index f056bf6..ee2b34c 100644
--- a/models/scrfd_10g_bnkps/config.pbtxt
+++ b/models/scrfd_10g_bnkps/config.pbtxt
@@ -69,7 +69,9 @@ output [
 
 dynamic_batching {
   preferred_batch_size: [ 8, 16, 32 ]
-  max_queue_delay_microseconds: 5000
+  # 25ms (was 5ms) — same rationale as yolov11: per-image arrival rate
+  # under ingest load is too slow to fill preferred batches in 5ms.
+  max_queue_delay_microseconds: 25000
 }
 
 instance_group [
diff --git a/models/yolov11_small_trt_end2end/config.pbtxt b/models/yolov11_small_trt_end2end/config.pbtxt
index 38a5886..2b0648e 100644
--- a/models/yolov11_small_trt_end2end/config.pbtxt
+++ b/models/yolov11_small_trt_end2end/config.pbtxt
@@ -35,12 +35,18 @@ output [
 
 dynamic_batching {
   preferred_batch_size: [ 8, 16, 32, 64 ]
-  max_queue_delay_microseconds: 5000
+  # 25ms (was 5ms): under sustained ingest the per-image arrival rate is
+  # too slow to assemble preferred-size batches in 5ms, so Triton fired
+  # near batch=1. 25ms reaches batch 8-16 while staying well inside the
+  # request latency budget.
+  max_queue_delay_microseconds: 25000
 }
 
 instance_group [
   {
-    count: 2
+    # 3 instances (~1.5 GB each) let concurrent ingest batches pipeline
+    # through more GPU streams. Drop to 2 on cards under ~8 GB.
+    count: 3
     kind: KIND_GPU
     gpus: [ 0 ]
   }

From d9377087e69617b8be44e58f8c6809535c61f7ee Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:45:43 -0400
Subject: [PATCH 7/8] feat(monitoring): dcgm-exporter all-GPU metrics + GPU
 dashboard

dcgm-exporter publishes per-card utilization, VRAM, power, and
temperature for every host GPU (read-only, no compute reservation);
prometheus scrapes it and the new 'GPU Metrics' Grafana dashboard
renders GPU + host panels.
---
 docker-compose.yml                            |  22 ++++
 .../dashboards/gpu-metrics-dashboard.json     | 111 ++++++++++++++++++
 monitoring/prometheus.yml                     |   6 +
 3 files changed, 139 insertions(+)
 create mode 100644 monitoring/dashboards/gpu-metrics-dashboard.json

diff --git a/docker-compose.yml b/docker-compose.yml
index 2f70041..06dd323 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -147,6 +147,28 @@ services:
     networks:
       - triton_net
 
+  # Per-GPU utilization / VRAM / power / temperature for Grafana across ALL
+  # host GPUs (not just Triton's). Read-only; does not reserve compute.
+  dcgm-exporter:
+    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.0-ubuntu22.04
+    container_name: triton-dcgm-exporter
+    restart: always
+    cap_add:
+      - SYS_ADMIN
+    environment:
+      - DCGM_EXPORTER_LISTEN=:9400
+    ports:
+      - 4610:9400  # DCGM GPU metrics
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu, utility]
+    networks:
+      - triton_net
+
   prometheus:
     image: prom/prometheus:latest
     container_name: triton-prometheus
diff --git a/monitoring/dashboards/gpu-metrics-dashboard.json b/monitoring/dashboards/gpu-metrics-dashboard.json
new file mode 100644
index 0000000..bd23274
--- /dev/null
+++ b/monitoring/dashboards/gpu-metrics-dashboard.json
@@ -0,0 +1,111 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "refresh": "10s",
+  "schemaVersion": 39,
+  "tags": ["gpu", "system"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "timepicker": {},
+  "timezone": "",
+  "title": "GPU Metrics",
+  "uid": "gpu-metrics",
+  "version": 1,
+  "panels": [
+    {
+      "type": "timeseries",
+      "title": "GPU Utilization (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_GPU_UTIL",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "GPU Memory Used (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "decmbytes", "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_FB_USED",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "GPU Power Usage (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "watt", "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_POWER_USAGE",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "GPU Temperature (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "celsius", "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_GPU_TEMP",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Host CPU Utilization",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100)",
+          "legendFormat": "CPU busy %",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Host Memory Used %",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
+          "legendFormat": "memory used %",
+          "refId": "A"
+        }
+      ]
+    }
+  ]
+}
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
index f9c5ea6..cc5fb54 100644
--- a/monitoring/prometheus.yml
+++ b/monitoring/prometheus.yml
@@ -26,6 +26,12 @@ scrape_configs:
         labels:
           group: 'system'
 
+  - job_name: 'dcgm'
+    static_configs:
+      - targets: ['dcgm-exporter:9400']
+        labels:
+          group: 'gpu'
+
   - job_name: 'loki'
     static_configs:
       - targets: ['loki:3100']

From 219f31b8cbedae41876480ca29fac0480db4b46f Mon Sep 17 00:00:00 2001
From: davidamacey <davidamacey@gmail.com>
Date: Fri, 3 Jul 2026 23:49:07 -0400
Subject: [PATCH 8/8] test: pytest scaffolding + integration tests for health,
 metrics, request-id, scrape config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- conftest.py excludes standalone live-deployment scripts from pytest
  collection (they define test_*(name,...) helpers pytest miscollects).
- Health probe tests run against the router with probes monkeypatched —
  no live Triton/OpenSearch needed.
- Metrics + request-id tests exercise the real app middleware wiring.
- Scrape-config test pins the prometheus job topology and validates
  every static target against docker-compose services.
- Synthetic fixture image (generated shapes, 640x480).
---
 pyproject.toml                                |   3 +
 tests/conftest.py                             |  19 +++
 tests/fixtures/sample_image.jpg               | Bin 0 -> 6037 bytes
 tests/integration/__init__.py                 |   0
 tests/integration/test_health_endpoints.py    | 134 ++++++++++++++++++
 tests/integration/test_metrics_exposure.py    |  42 ++++++
 .../test_prometheus_scrape_config.py          |  55 +++++++
 .../test_request_id_propagation.py            |  51 +++++++
 8 files changed, 304 insertions(+)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/fixtures/sample_image.jpg
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/integration/test_health_endpoints.py
 create mode 100644 tests/integration/test_metrics_exposure.py
 create mode 100644 tests/integration/test_prometheus_scrape_config.py
 create mode 100644 tests/integration/test_request_id_propagation.py

diff --git a/pyproject.toml b/pyproject.toml
index bc22bf1..3514fcf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -283,6 +283,9 @@ addopts = [
     "--cov-report=term-missing",
     "--cov-report=html",
 ]
+markers = [
+    "integration: tests that exercise several components together (no live stack required)",
+]
 
 # =============================================================================
 # Coverage Configuration
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..ef8b8dd
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,19 @@
+"""Pytest configuration for the test suite.
+
+Some files under ``tests/`` are standalone smoke-test *scripts* meant to be
+run against a live deployment (``python tests/test_full_system.py``), not
+pytest suites. They define helper functions named ``test_*(name, ...)`` that
+pytest would otherwise miscollect as test cases — failing with
+``fixture 'name' not found``. Exclude them from collection here; run them
+directly per the project README / CLAUDE.md.
+"""
+
+from __future__ import annotations
+
+
+collect_ignore = [
+    'test_full_system.py',
+    'test_scrfd_pipeline.py',
+    'test_validate_models.py',
+    'validate_visual_results.py',
+]
diff --git a/tests/fixtures/sample_image.jpg b/tests/fixtures/sample_image.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cb02faafab3a030240e0130d75eb1f34cf5eb601
GIT binary patch
literal 6037
zcmeH~T}&KR6vxl(%+A8Xvb!ux7YJE_Qm5sCl(tfNU<3r(8pSPD)CV_ht=4KpD%#dq
z2B_M)(U7)Ov^9Y)D&WfoEv1@<p%%sBgCeU?Fkzu!0n?OSKKE;mcd7B~L8&n{P3BI{
zO(rw<&iSAF&;0Ji0d#?GOTp#>peO)}T!2H62MkS`BGfYsqh!@At5mX)Vc{xuRHQa4
zDpI4->SGOBol&pR7~&1a6{fhjIBj%7VuC3#))Z$7O+t~Otdfmj*$9(Pqch!f!%onL
zQ7%O#O(lauPtke`yTDBHR8mU~er=S3CP}Ko!qpMvg*F{1sHN<cN`@hCA0*F#(JP~q
z*5<GV%XU?ASxiRN(bHimxvdwBTYINhuG_J<I$Rw~RK%yQx|3VIX8ne{GB@6xm3MD`
zLE)y&TZ*?m^l-@|k3P2ZiCs@VwfpHkFO<Ld(!Tu%UOx2dYp)+Z@`ml$@!At_*S+)Z
zNqgh_AAH!<-15=KpPXqs+kWoT&pyA{^~IOnUtQ|?uFv`X5B&p|2S-M)jE(<vbz*YL
z?U|XKn_pP;`a-%Wpl`?`{S9G%(WNK46bwT%s*o;9v5y3;XOu~6+2|aLYI~U>Iio5p
zCim#+)(hb&>$XlCckJy|$F5vIlIjjgTT-?>Vb%Xi*>A#r*EI-{G({GV)&mdjc+TE2
z;MLh}9LOQpK=y#_f&a||9Ra(jLpbs^!h=?CF~XT_gkS+e6M{D#;n`PI`3JrEW_O%0
z6@joun6Hu=g+7Ep9m3gc+D;~@S$tERQzD$;{r4bP#7P9{K7=7N)oZP=6b1^czG{S?
z`e1|5JUoklkcx1vrgFj|a*a|-dN4;AFOt6Houxhj;TXbvcV~IYaIFPFUnez)9+Jy$
z^HgS#N4O;Tce=9DuJ>?%N?JRYAk`y`iNV(CMB~DEjL@|Up*cO!f-tD^y@gPO;IT>r
zi%~Ye$HtFlAUIb@+ejYt&8W4d&Cj?LHm>1ThgRh5HYtg(P;>Ume+sfJwlx3#07J&W
ztqxLN&M|6Oa4o~rv@46Vi|sX?xpGPWW~t<o{^u^1ODdQ2w)mG4U<LA|JNiSE8t0`S
z{RnecrT2uf;b6<cxN$m>pe~-EZa6?FIds@JgYW?FDV3bW;bbg7su7%}5&`?Qi@^f~
z>t=5z!XAQj0(P^T6{pDUYJ%Jm={!PTP2dzlTXw+SK@+SWvd$$S)FSwwM|fPA?3V5)
zIIXX+SgGE^K#n8C)gB>uPH^zER}fZ_j}(cVqjIA04+-=A^}aHM<HCH?aBWFvIib4l
z8M24)5MDoL{`s5WMnZMbUwpkMjnIHleM4vu;U&Co*Dg;fAzbkMqI{N;&y%tTWDm$5
PkUb!K;P&<ajhBA`X90me

literal 0
HcmV?d00001

diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/integration/test_health_endpoints.py b/tests/integration/test_health_endpoints.py
new file mode 100644
index 0000000..cb16dee
--- /dev/null
+++ b/tests/integration/test_health_endpoints.py
@@ -0,0 +1,134 @@
+"""/live, /ready, /health endpoint behavior pins.
+
+* ``/live`` must return 200 with ``status='alive'`` regardless of
+  dependency state (kept dependency-free so liveness never flaps on
+  Triton / OpenSearch degradation).
+* ``/ready`` must return 200 only when *every* dep probe succeeds, and
+  503 with per-service detail when at least one fails.
+* ``/health`` is preserved as a backward-compat alias of ``/ready``.
+
+The dep probes are patched out so the tests run without a live
+Triton/OpenSearch stack.
+"""
+
+from __future__ import annotations
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+
+
+pytestmark = pytest.mark.integration
+
+
+def _build_app() -> FastAPI:
+    """Build a minimal FastAPI app that mounts ONLY the health router.
+
+    Avoids pulling in the full src.main lifespan (Triton client pool,
+    OpenSearch warm-up) which a unit-level probe test must not require.
+    """
+    from src.routers.health import router as health_router
+
+    app = FastAPI()
+    app.include_router(health_router)
+    return app
+
+
+@pytest.fixture
+def app() -> FastAPI:
+    return _build_app()
+
+
+@pytest.fixture
+def client(app: FastAPI) -> TestClient:
+    return TestClient(app)
+
+
+def _patch_probes(
+    monkeypatch: pytest.MonkeyPatch,
+    *,
+    triton: tuple[bool, str],
+    opensearch: tuple[bool, str],
+) -> None:
+    """Replace each per-service probe with a deterministic stub."""
+    from src.routers import health as health_module
+
+    async def _stub_triton() -> tuple[bool, str]:
+        return triton
+
+    async def _stub_http(url: str) -> tuple[bool, str]:
+        return opensearch
+
+    monkeypatch.setattr(health_module, '_probe_triton', _stub_triton)
+    monkeypatch.setattr(health_module, '_probe_http', _stub_http)
+
+
+def test_live_returns_200_even_when_deps_down(
+    client: TestClient, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Liveness must NOT be coupled to dependency state."""
+    _patch_probes(
+        monkeypatch,
+        triton=(False, 'is_server_live timeout after 2.0s'),
+        opensearch=(False, 'timeout after 2.0s'),
+    )
+    response = client.get('/live')
+    assert response.status_code == 200
+    assert response.json() == {'status': 'alive'}
+
+
+def test_ready_returns_200_when_all_deps_up(
+    client: TestClient, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    _patch_probes(
+        monkeypatch,
+        triton=(True, 'is_server_live OK'),
+        opensearch=(True, 'HTTP 200'),
+    )
+    response = client.get('/ready')
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload['status'] == 'ready'
+    assert payload['services']['triton']['ok'] is True
+    assert payload['services']['opensearch']['ok'] is True
+
+
+@pytest.mark.parametrize(
+    ('triton_ok', 'opensearch_ok', 'down_service'),
+    [
+        (False, True, 'triton'),
+        (True, False, 'opensearch'),
+        (False, False, 'triton'),
+    ],
+)
+def test_ready_returns_503_with_detail_when_a_dep_is_down(
+    client: TestClient,
+    monkeypatch: pytest.MonkeyPatch,
+    triton_ok: bool,
+    opensearch_ok: bool,
+    down_service: str,
+) -> None:
+    _patch_probes(
+        monkeypatch,
+        triton=(triton_ok, 'is_server_live OK' if triton_ok else 'ConnectError: refused'),
+        opensearch=(opensearch_ok, 'HTTP 200' if opensearch_ok else 'timeout after 2.0s'),
+    )
+    response = client.get('/ready')
+    assert response.status_code == 503
+    payload = response.json()
+    assert payload['status'] == 'not_ready'
+    assert payload['services'][down_service]['ok'] is False
+    # Per-service detail string must carry the failure reason.
+    assert payload['services'][down_service]['detail']
+
+
+def test_health_is_alias_of_ready(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    _patch_probes(
+        monkeypatch,
+        triton=(True, 'is_server_live OK'),
+        opensearch=(False, 'timeout after 2.0s'),
+    )
+    ready = client.get('/ready')
+    health = client.get('/health')
+    assert health.status_code == ready.status_code == 503
+    assert health.json()['services'].keys() == ready.json()['services'].keys()
diff --git a/tests/integration/test_metrics_exposure.py b/tests/integration/test_metrics_exposure.py
new file mode 100644
index 0000000..af428a9
--- /dev/null
+++ b/tests/integration/test_metrics_exposure.py
@@ -0,0 +1,42 @@
+"""/metrics exposition + HTTP duration histogram behavior pins.
+
+Verifies that a request through the real app records into
+``http_request_duration_seconds`` with the matched route template, and
+that ``/metrics`` renders a Prometheus exposition payload containing it.
+"""
+
+from __future__ import annotations
+
+import pytest
+from fastapi.testclient import TestClient
+
+
+pytestmark = pytest.mark.integration
+
+
+@pytest.fixture(scope='module')
+def client() -> TestClient:
+    from src.main import app
+
+    return TestClient(app)
+
+
+def test_request_is_observed_with_route_template(client: TestClient) -> None:
+    assert client.get('/live').status_code == 200
+    body = client.get('/metrics').text
+    assert 'http_request_duration_seconds' in body
+    assert 'route="/live"' in body
+
+
+def test_metrics_content_type_is_prometheus_exposition(client: TestClient) -> None:
+    response = client.get('/metrics')
+    assert response.status_code == 200
+    assert response.headers['content-type'].startswith('text/plain')
+
+
+def test_unmatched_path_uses_low_cardinality_fallback(client: TestClient) -> None:
+    assert client.get('/v1/nonexistent/abc123/deadbeef').status_code == 404
+    body = client.get('/metrics').text
+    # Only the first two path segments may appear — never the random tail.
+    assert 'route="/v1/nonexistent"' in body
+    assert 'abc123' not in body
diff --git a/tests/integration/test_prometheus_scrape_config.py b/tests/integration/test_prometheus_scrape_config.py
new file mode 100644
index 0000000..6dcc4f8
--- /dev/null
+++ b/tests/integration/test_prometheus_scrape_config.py
@@ -0,0 +1,55 @@
+"""Pins for monitoring/prometheus.yml scrape topology.
+
+Guards against scrape jobs silently disappearing (dashboards go blank
+without an error anywhere) and against jobs pointing at services that
+do not exist in docker-compose.yml.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+import yaml
+
+
+pytestmark = pytest.mark.integration
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+@pytest.fixture(scope='module')
+def prometheus_config() -> dict:
+    with (_REPO_ROOT / 'monitoring' / 'prometheus.yml').open() as fh:
+        return yaml.safe_load(fh)
+
+
+@pytest.fixture(scope='module')
+def compose_services() -> set[str]:
+    with (_REPO_ROOT / 'docker-compose.yml').open() as fh:
+        compose = yaml.safe_load(fh)
+    return set(compose.get('services', {}))
+
+
+def _jobs(prometheus_config: dict) -> dict[str, dict]:
+    return {job['job_name']: job for job in prometheus_config['scrape_configs']}
+
+
+def test_expected_scrape_jobs_present(prometheus_config: dict) -> None:
+    jobs = _jobs(prometheus_config)
+    for expected in ('triton', 'yolo-api', 'node', 'dcgm', 'loki'):
+        assert expected in jobs, f'scrape job {expected!r} missing from prometheus.yml'
+
+
+def test_scrape_targets_reference_compose_services(
+    prometheus_config: dict, compose_services: set[str]
+) -> None:
+    """Every static target host must be a service defined in compose."""
+    for job in prometheus_config['scrape_configs']:
+        for static in job.get('static_configs', []):
+            for target in static.get('targets', []):
+                host = target.split(':')[0]
+                assert host in compose_services, (
+                    f'job {job["job_name"]!r} scrapes {host!r}, '
+                    'which is not a docker-compose service'
+                )
diff --git a/tests/integration/test_request_id_propagation.py b/tests/integration/test_request_id_propagation.py
new file mode 100644
index 0000000..da046a1
--- /dev/null
+++ b/tests/integration/test_request_id_propagation.py
@@ -0,0 +1,51 @@
+"""X-Request-ID correlation behavior pins.
+
+A client-supplied ``X-Request-ID`` must be echoed back and bound to the
+logging context for the duration of the request; when absent, the
+middleware must generate one.
+
+Uses the real application object so the middleware wiring itself is under
+test; TestClient is used without its context manager so the lifespan
+(Triton pool, OpenSearch warm-up) never runs.
+"""
+
+from __future__ import annotations
+
+import pytest
+from fastapi.testclient import TestClient
+
+
+pytestmark = pytest.mark.integration
+
+
+@pytest.fixture(scope='module')
+def client() -> TestClient:
+    from src.main import app
+
+    return TestClient(app)
+
+
+def test_client_supplied_request_id_is_echoed(client: TestClient) -> None:
+    response = client.get('/live', headers={'X-Request-ID': 'test-corr-123'})
+    assert response.status_code == 200
+    assert response.headers['X-Request-ID'] == 'test-corr-123'
+
+
+def test_request_id_is_generated_when_absent(client: TestClient) -> None:
+    response = client.get('/live')
+    assert response.status_code == 200
+    generated = response.headers.get('X-Request-ID')
+    assert generated
+    assert generated != '-'
+
+
+def test_bind_request_id_updates_context() -> None:
+    from src.core.logging import bind_request_id, clear_request_id, get_request_id
+
+    assert get_request_id() == '-'
+    bind_request_id('ctx-abc')
+    try:
+        assert get_request_id() == 'ctx-abc'
+    finally:
+        clear_request_id()
+    assert get_request_id() == '-'