From e3db1fc377370c14edebf2f9b30659381809a911 Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:35:25 -0400 Subject: [PATCH 1/8] build(pre-commit): add max-file-size ratchet hook (700 LOC cap) New scripts/codegen/check_file_size.py enforces a per-file line ceiling over src/ and scripts/ so module splits don't silently regress into monoliths. Existing oversize modules are grandfathered in the hook's exclude list until they are split. --- .pre-commit-config.yaml | 17 +++++++++ scripts/codegen/check_file_size.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100755 scripts/codegen/check_file_size.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e47d6e0..a753356 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,6 +113,23 @@ repos: language: system files: ^benchmarks/.*\.go$ types: [go] + - id: max-file-size + name: Enforce 700 LOC max per source file (ratchet) + entry: python3 scripts/codegen/check_file_size.py --max 700 + language: system + files: ^(src|scripts)/.*\.py$ + # Ratchet: modules already over the cap are grandfathered until + # they get split. Do NOT add new entries here — split the module. + exclude: | + (?x)^( + src/ultralytics_patches/| + src/clients/opensearch\.py| + src/services/visual_search\.py| + src/services/face_identity\.py| + src/services/duplicate_detection\.py| + src/routers/ingest\.py| + src/routers/search\.py + ) # Commit message linting (conventional commits) # Note: Disabled for now - requires separate hook installation with: diff --git a/scripts/codegen/check_file_size.py b/scripts/codegen/check_file_size.py new file mode 100755 index 0000000..fd16354 --- /dev/null +++ b/scripts/codegen/check_file_size.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +"""Pre-commit regression guard: cap per-file line count. + +Prevents source files from regrowing past a threshold so focused-module +splits don't silently revert. Run by the ``max-file-size`` pre-commit +hook over ``src/`` and ``scripts/``. + +The threshold is set by ``--max`` (default 700). Files listed in the +hook's ``exclude`` regex are not passed in at all; this script's only +job is the line-count check. + +Exit code 1 if any input file exceeds the cap; the violating files + +sizes are written to stderr. +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + + +def line_count(path: Path) -> int: + try: + return sum(1 for _ in path.open('rb')) + except OSError: + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--max', type=int, default=700, help='Max LOC per file.') + parser.add_argument('files', nargs='*', type=Path) + args = parser.parse_args() + + violations: list[tuple[Path, int]] = [] + for p in args.files: + n = line_count(p) + if n > args.max: + violations.append((p, n)) + + if violations: + sys.stderr.write( + f'ERROR: {len(violations)} file(s) exceed the {args.max} LOC ceiling.\n' + 'Split into focused sub-modules instead of growing monoliths.\n' + 'Violations:\n' + ) + for p, n in violations: + sys.stderr.write(f' {p.as_posix()}: {n} lines (max {args.max})\n') + return 1 + return 0 + + +if __name__ == '__main__': + sys.exit(main()) From 1117ab839989a1e1adbe1c5071a7aed2fe8f4f0a Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:37:35 -0400 Subject: [PATCH 2/8] refactor(logging): move request-id contextvar to core.logging; fix foreign_pre_chain formatter bug - request_id_ctx / get_request_id now live in src.core.logging (with new bind_request_id / clear_request_id helpers) so service modules and out-of-process workers can import them without pulling in the FastAPI app; src.main re-exports for backward compatibility. - merge_contextvars added first in the processor chain so request_id auto-attaches to every structlog event on the task. - ProcessorFormatter.wrap_for_formatter removed from foreign_pre_chain: it wraps the event dict in a tuple and is only valid as the LAST structlog-native step; in the pre-chain it crashed stdlib log records (uvicorn, opensearch-py) with 'tuple' object does not support item deletion. --- src/core/logging.py | 66 ++++++++++++++++++++++++++++++++++++--------- src/main.py | 30 ++++++++++----------- 2 files changed, 68 insertions(+), 28 deletions(-) diff --git a/src/core/logging.py b/src/core/logging.py index 82074a6..d28de05 100644 --- a/src/core/logging.py +++ b/src/core/logging.py @@ -10,11 +10,46 @@ import logging import sys +from contextvars import ContextVar from typing import Any import structlog +# Request-scoped correlation ID. Set by the HTTP middleware in +# ``src.main`` (FastAPI) and by worker processes per task. Lives here +# (not in src.main) so service-layer modules can import it without +# pulling the FastAPI app at import time — background workers run +# outside the FastAPI process and would otherwise be unable to bind +# the ID. +request_id_ctx: ContextVar[str] = ContextVar('request_id', default='-') + + +def get_request_id() -> str: + """Return the current request ID, or ``'-'`` when unbound.""" + return request_id_ctx.get() + + +def bind_request_id(request_id: str) -> None: + """Bind ``request_id`` to both the ContextVar and structlog contextvars. + + Both bindings are needed: the ContextVar drives :func:`get_request_id` + callers, and the structlog contextvars binding makes every subsequent + ``logger.info(...)`` call on this asyncio task auto-attach + ``request_id=`` to its event dict (via the + :func:`structlog.contextvars.merge_contextvars` processor wired into + the chain below). + """ + request_id_ctx.set(request_id) + structlog.contextvars.bind_contextvars(request_id=request_id) + + +def clear_request_id() -> None: + """Reset both ContextVar + structlog contextvars to defaults.""" + request_id_ctx.set('-') + structlog.contextvars.unbind_contextvars('request_id') + + def configure_logging(json_logs: bool = True, log_level: str = 'INFO') -> None: """ Configure structured logging for the application. @@ -30,25 +65,30 @@ def configure_logging(json_logs: bool = True, log_level: str = 'INFO') -> None: level=getattr(logging, log_level.upper()), ) - # Processors that modify log entries - processors = [ - # Add log level + # Shared pre-chain processors. These run for BOTH structlog calls and + # foreign (stdlib logging) records before the final formatter renders. + # IMPORTANT: do NOT include ProcessorFormatter.wrap_for_formatter here -- + # that processor wraps event_dict in a (args, kwargs, event_dict) tuple + # and is only valid as the LAST step of a structlog-native chain. Putting + # it in foreign_pre_chain triggers `'tuple' object does not support item + # deletion` in ProcessorFormatter.format() when stdlib logs (uvicorn, + # opensearch-py, etc.) flow through. + shared_processors: list[Any] = [ + # ``merge_contextvars`` MUST be first so request_id (and any + # other future contextvar) lands in the event dict before any + # other processor inspects it. + structlog.contextvars.merge_contextvars, structlog.stdlib.add_log_level, - # Add logger name structlog.stdlib.add_logger_name, - # Add timestamp structlog.processors.TimeStamper(fmt='iso'), - # Add stack info for exceptions structlog.processors.StackInfoRenderer(), - # Format exceptions structlog.processors.format_exc_info, - # Unwrap EventDict for stdlib logging - structlog.stdlib.ProcessorFormatter.wrap_for_formatter, ] - # Configure structlog + # Configure structlog: shared processors, then the wrap_for_formatter + # bridge that hands control back to stdlib logging's formatter. structlog.configure( - processors=processors, + processors=[*shared_processors, structlog.stdlib.ProcessorFormatter.wrap_for_formatter], wrapper_class=structlog.stdlib.BoundLogger, context_class=dict, logger_factory=structlog.stdlib.LoggerFactory(), @@ -59,12 +99,12 @@ def configure_logging(json_logs: bool = True, log_level: str = 'INFO') -> None: if json_logs: formatter = structlog.stdlib.ProcessorFormatter( processor=structlog.processors.JSONRenderer(), - foreign_pre_chain=processors, + foreign_pre_chain=shared_processors, ) else: formatter = structlog.stdlib.ProcessorFormatter( processor=structlog.dev.ConsoleRenderer(colors=True), - foreign_pre_chain=processors, + foreign_pre_chain=shared_processors, ) # Apply formatter to root logger diff --git a/src/main.py b/src/main.py index 2bcc851..85e1e2d 100644 --- a/src/main.py +++ b/src/main.py @@ -17,7 +17,6 @@ import uuid from concurrent.futures import ThreadPoolExecutor from contextlib import asynccontextmanager -from contextvars import ContextVar from pathlib import Path import orjson @@ -27,7 +26,13 @@ from src.clients.triton_pool import AsyncTritonPool from src.config import get_settings from src.core.dependencies import OpenSearchClientFactory, TritonClientFactory -from src.core.logging import configure_logging, get_logger +from src.core.logging import ( + bind_request_id, + configure_logging, + get_logger, + get_request_id, + request_id_ctx, +) from src.routers import ( analyze_router, clusters_router, @@ -45,17 +50,11 @@ ) -# ============================================================================= -# Request Context (for correlation IDs) -# ============================================================================= - -# Context variable to store current request ID (thread-safe) -request_id_ctx: ContextVar[str] = ContextVar('request_id', default='-') - - -def get_request_id() -> str: - """Get the current request ID from context.""" - return request_id_ctx.get() +# Request correlation IDs (request_id_ctx / get_request_id) live in +# src.core.logging so service-layer modules and out-of-process workers can +# import them without pulling in the FastAPI app. Re-exported here for +# backward compatibility. +__all__ = ['get_request_id', 'request_id_ctx'] # ============================================================================= @@ -361,11 +360,12 @@ async def request_id_middleware(request: Request, call_next): Add correlation ID (X-Request-ID) to all requests. If client provides X-Request-ID header, use it. Otherwise generate a new UUID. - The request ID is available via get_request_id() in any code path. + The request ID is available via get_request_id() in any code path, and + bind_request_id() also attaches it to every structlog event on this task. """ # Get or generate request ID req_id = request.headers.get('X-Request-ID') or str(uuid.uuid4())[:8] - request_id_ctx.set(req_id) + bind_request_id(req_id) # Process request response = await call_next(request) From 8e397988ffa49f34de56678453c0e8771f0035b4 Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:39:33 -0400 Subject: [PATCH 3/8] feat(health): split /live and /ready with real per-dependency probes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - /live: pure process liveness — never gated on dependencies. The Dockerfile HEALTHCHECK now targets it so a degraded downstream dep cannot mark the container unhealthy and cascade through depends_on: service_healthy. - /ready: probes Triton via a real is_server_live() gRPC round-trip (pool active_connections is 0 until the first infer and deadlocks dependent containers at startup) and OpenSearch via HTTP, each bounded to 2s; 503 with per-service detail when any dep is down. - /health: kept as a backward-compat alias for /ready. --- Dockerfile | 6 +- src/routers/health.py | 178 +++++++++++++++++++++++++++++++++--------- 2 files changed, 147 insertions(+), 37 deletions(-) diff --git a/Dockerfile b/Dockerfile index 84d5b24..7cf66e6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,8 +87,12 @@ USER appuser EXPOSE 8000 +# /live = process liveness only. /health now probes dependencies (Triton, +# OpenSearch) and returns 503 while any is down — gating the container's +# health on it would mark yolo-api unhealthy (and cascade via +# depends_on: service_healthy) whenever a *downstream* dep degrades. HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD curl -f http://localhost:8000/health || exit 1 + CMD curl -f http://localhost:8000/live || exit 1 # Production defaults (docker-compose.yml overrides workers, backlog, etc.) CMD ["uvicorn", "src.main:app", \ diff --git a/src/routers/health.py b/src/routers/health.py index 428f2d2..7da5c24 100644 --- a/src/routers/health.py +++ b/src/routers/health.py @@ -1,18 +1,26 @@ """ Health and Monitoring Router -Provides health checks, service info, and connection pool statistics. +Provides liveness/readiness probes, service info, and connection pool +statistics. + +- /live — process responsive, no dependency probing (livenessProbe) +- /ready — Triton + OpenSearch actively probed (readinessProbe / LB drain) +- /health — backward-compat alias for /ready """ +import asyncio import logging import os from typing import Any +import httpx import psutil import torch from fastapi import APIRouter +from fastapi.responses import JSONResponse -from src.clients.triton_pool import get_client_pool_stats +from src.clients.triton_pool import get_async_triton_client, get_client_pool_stats from src.config import get_settings @@ -23,6 +31,98 @@ ) +# Readiness-probe timeout. Kept short so a degraded dep does not stall +# the probe past a typical k8s readiness-check window. +_READY_PROBE_TIMEOUT_S = 2.0 + + +async def _probe_http(url: str) -> tuple[bool, str]: + """Issue a single GET against ``url`` and report (ok, detail). + + Treated as ready on any 2xx/3xx response. Network / timeout / 5xx all + flip the service to not-ready; the detail string carries the reason + so /ready consumers can see *which* dep is down without grepping logs. + """ + try: + async with httpx.AsyncClient(timeout=_READY_PROBE_TIMEOUT_S) as client: + response = await client.get(url) + except httpx.TimeoutException: + return False, f'timeout after {_READY_PROBE_TIMEOUT_S}s' + except httpx.HTTPError as exc: + return False, f'{type(exc).__name__}: {exc}' + if response.status_code >= 500: + return False, f'HTTP {response.status_code}' + return True, f'HTTP {response.status_code}' + + +async def _probe_triton() -> tuple[bool, str]: + """Probe Triton via a real ``is_server_live()`` gRPC round-trip. + + Inspecting ``get_client_pool_stats().active_connections`` is not a + valid readiness signal: it stays 0 until the first real infer call + lands on the pool. Combined with ``depends_on: service_healthy`` on + dependent containers, that produces a startup deadlock — dependents + wait for yolo-api to report healthy, but nothing is calling Triton + through yolo-api to wake the pool, so the count never moves. + + ``is_server_live()`` actively probes Triton via the async client + (created lazily on first call). The side effect is exactly what we + want for a readiness check: it both tests reachability AND warms the + connection so subsequent infer requests skip the + connection-establishment cost. Bounded by ``_READY_PROBE_TIMEOUT_S`` + so a degraded Triton can't stall the probe past the k8s readiness + window. + """ + settings = get_settings() + try: + client = await asyncio.wait_for( + get_async_triton_client(settings.triton_url), + timeout=_READY_PROBE_TIMEOUT_S, + ) + live = await asyncio.wait_for( + client.is_server_live(), + timeout=_READY_PROBE_TIMEOUT_S, + ) + except TimeoutError: + return False, f'is_server_live timeout after {_READY_PROBE_TIMEOUT_S}s' + except Exception as exc: + return False, f'{type(exc).__name__}: {exc}' + if not live: + return False, 'is_server_live returned False' + return True, 'is_server_live OK' + + +async def _readiness_payload() -> tuple[int, dict[str, Any]]: + """Probe Triton and OpenSearch concurrently; build the payload. + + Returns the HTTP status code (200 if every dep is ready, 503 otherwise) + and a JSON-serializable body listing per-service status + detail. + """ + settings = get_settings() + triton_task = asyncio.create_task(_probe_triton()) + opensearch_task = asyncio.create_task(_probe_http(settings.opensearch_url)) + + triton_ok, triton_detail = await triton_task + opensearch_ok, opensearch_detail = await opensearch_task + + services = { + 'triton': {'ok': triton_ok, 'detail': triton_detail, 'url': settings.triton_url}, + 'opensearch': { + 'ok': opensearch_ok, + 'detail': opensearch_detail, + 'url': settings.opensearch_url, + }, + } + all_ok = triton_ok and opensearch_ok + payload: dict[str, Any] = { + 'status': 'ready' if all_ok else 'not_ready', + 'version': settings.api_version, + 'api_version': 'v1', + 'services': services, + } + return (200 if all_ok else 503), payload + + @router.get('/') def root(): """ @@ -69,53 +169,59 @@ def root(): } -@router.get('/health') -def health() -> dict[str, Any]: +@router.get('/live') +def live() -> dict[str, str]: + """Liveness probe — process is responsive, no dependency probing. + + Designed for k8s ``livenessProbe`` and the container HEALTHCHECK: a + true response here means the event loop is healthy and the process + should NOT be restarted. A degraded dependency (Triton / OpenSearch) + must NOT flap liveness; use /ready for that. """ - Health check with service status and performance metrics. + return {'status': 'alive'} + + +@router.get('/ready') +async def ready() -> JSONResponse: + """Readiness probe — Triton + OpenSearch both reachable. - Returns: - - Service status - - Triton connection status - - OpenSearch connection status - - GPU memory usage + Returns 200 with ``status='ready'`` when every dep responds; 503 + with per-service detail when at least one is unreachable. Designed + for k8s ``readinessProbe`` and for an upstream LB to drain traffic + from this instance during dependency degradation without killing + the process. """ - settings = get_settings() + status_code, payload = await _readiness_payload() - # Process metrics + # Best-effort process / GPU resource snapshot — informational only; + # never gates readiness. Kept off /live to keep liveness probes + # branch-free. + settings = get_settings() process = psutil.Process(os.getpid()) memory_info = process.memory_info() - - health_data: dict[str, Any] = { - 'status': 'healthy', - 'version': settings.api_version, - 'api_version': 'v1', - 'services': { - 'triton': { - 'url': settings.triton_url, - 'protocol': 'gRPC', - 'status': 'connected', - }, - 'opensearch': { - 'url': settings.opensearch_url, - 'status': 'connected', - }, - }, - 'resources': { - 'memory_mb': round(memory_info.rss / 1024 / 1024, 2), - 'cpu_percent': process.cpu_percent(), - }, + payload['resources'] = { + 'memory_mb': round(memory_info.rss / 1024 / 1024, 2), + 'cpu_percent': process.cpu_percent(), } - - # Add GPU metrics if available if torch.cuda.is_available(): - health_data['resources']['gpu'] = { + payload['resources']['gpu'] = { 'name': torch.cuda.get_device_name(0), 'memory_allocated_mb': round(torch.cuda.memory_allocated() / 1024 / 1024, 2), 'memory_reserved_mb': round(torch.cuda.memory_reserved() / 1024 / 1024, 2), } + payload.setdefault('version', settings.api_version) + return JSONResponse(status_code=status_code, content=payload) - return health_data + +@router.get('/health') +async def health() -> JSONResponse: + """Backward-compat alias for :func:`ready`. + + Existing dashboards, scripts, and operator runbooks call /health and + expect the dep-probing behavior. New callers should target /live + (liveness) or /ready (readiness) explicitly. + """ + return await ready() @router.get('/connection_pool_info') From f23cbf331a1e95813a8eb9ca119f7037cfd15223 Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:42:07 -0400 Subject: [PATCH 4/8] feat(metrics): http_request_duration histogram and /metrics endpoint - src/core/metrics.py: prometheus_client Histogram labeled by method, route template, and status; render_metrics() supports the optional PROMETHEUS_MULTIPROC_DIR aggregation mode for multi-worker uvicorn. - http_duration_middleware records every request using the matched route template, with a low-cardinality path fallback for 404s. - GET /metrics exposition endpoint + prometheus scrape job for yolo-api. --- monitoring/prometheus.yml | 7 +++++++ pyproject.toml | 1 + requirements.txt | 3 +++ src/core/metrics.py | 44 +++++++++++++++++++++++++++++++++++++++ src/main.py | 23 ++++++++++++++++++++ src/routers/health.py | 10 ++++++++- 6 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 src/core/metrics.py diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml index 0ae2f01..f9c5ea6 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus.yml @@ -13,6 +13,13 @@ scrape_configs: labels: group: 'triton_inference_server' + - job_name: 'yolo-api' + metrics_path: /metrics + static_configs: + - targets: ['yolo-api:8000'] + labels: + group: 'api' + - job_name: 'node' static_configs: - targets: ['node-exporter:9100'] diff --git a/pyproject.toml b/pyproject.toml index 32e90f0..bc22bf1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ dependencies = [ "transformers>=4.30.0", "timm", "huggingface_hub", + "prometheus-client>=0.20", ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index a4d4ff3..0df7831 100644 --- a/requirements.txt +++ b/requirements.txt @@ -49,6 +49,9 @@ pyyaml # Structured Logging structlog>=24.1.0 # Structured logging with context and JSON output +# Metrics +prometheus-client>=0.20 # /metrics exposition + HTTP latency histogram + # OpenSearch for Visual Search opensearch-py>=2.3.0 # Async OpenSearch client with k-NN support transformers>=4.30.0 # CLIP tokenizer for text-to-image search diff --git a/src/core/metrics.py b/src/core/metrics.py new file mode 100644 index 0000000..f160afc --- /dev/null +++ b/src/core/metrics.py @@ -0,0 +1,44 @@ +""" +Prometheus metrics for the Visual AI API. + +Exposes HTTP-level instrumentation via the ``/metrics`` endpoint. Metric +objects live here (not in routers) so middleware and any future worker +code can import them without circular imports. + +Multi-worker note: with several uvicorn workers each process keeps its +own registry, so a scrape only sees one worker unless +``PROMETHEUS_MULTIPROC_DIR`` is set — in that mode prometheus_client +aggregates across processes via files in that directory (it must exist +and be writable before workers start). +""" + +import os + +from prometheus_client import CONTENT_TYPE_LATEST, CollectorRegistry, Histogram, generate_latest + + +# Request latency by route template. Buckets skew low because most +# endpoints answer in tens of milliseconds; the tail buckets catch +# batch/ingest calls. +HTTP_REQUEST_DURATION_SECONDS = Histogram( + 'http_request_duration_seconds', + 'HTTP request latency by method, route template, and status code.', + labelnames=('method', 'route', 'status'), + buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0), +) + + +def render_metrics() -> tuple[bytes, str]: + """Render the metrics exposition payload. + + Returns ``(body, content_type)``. Uses the multiprocess collector + when ``PROMETHEUS_MULTIPROC_DIR`` is configured, otherwise the + default per-process registry. + """ + if os.environ.get('PROMETHEUS_MULTIPROC_DIR'): + from prometheus_client import multiprocess + + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + return generate_latest(registry), CONTENT_TYPE_LATEST + return generate_latest(), CONTENT_TYPE_LATEST diff --git a/src/main.py b/src/main.py index 85e1e2d..b588cef 100644 --- a/src/main.py +++ b/src/main.py @@ -353,6 +353,29 @@ async def global_exception_handler(request: Request, exc: Exception): headers={'X-Request-ID': req_id}, ) + @application.middleware('http') + async def http_duration_middleware(request: Request, call_next): + """Record per-request latency into the Prometheus histogram.""" + from src.core.metrics import HTTP_REQUEST_DURATION_SECONDS + + started = time.monotonic() + response = await call_next(request) + # FastAPI populates scope['route'] once a route has matched. For + # 404s (no match) fall back to a low-cardinality truncation of + # the raw path (first 2 segments) so the label space isn't + # exploded by `/v1/whatever/` misses. + route_obj = request.scope.get('route') + route_template = getattr(route_obj, 'path', None) + if not route_template: + segments = request.url.path.strip('/').split('/') + route_template = '/' + '/'.join(segments[:2]) if segments and segments[0] else '/' + HTTP_REQUEST_DURATION_SECONDS.labels( + method=request.method, + route=route_template, + status=str(response.status_code), + ).observe(time.monotonic() - started) + return response + # Request ID Middleware (defined last, runs first in LIFO order) @application.middleware('http') async def request_id_middleware(request: Request, call_next): diff --git a/src/routers/health.py b/src/routers/health.py index 7da5c24..a5d2dcc 100644 --- a/src/routers/health.py +++ b/src/routers/health.py @@ -18,10 +18,11 @@ import psutil import torch from fastapi import APIRouter -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, Response from src.clients.triton_pool import get_async_triton_client, get_client_pool_stats from src.config import get_settings +from src.core.metrics import render_metrics logger = logging.getLogger(__name__) @@ -224,6 +225,13 @@ async def health() -> JSONResponse: return await ready() +@router.get('/metrics') +def metrics() -> Response: + """Prometheus exposition endpoint (scraped by the monitoring stack).""" + body, content_type = render_metrics() + return Response(content=body, media_type=content_type) + + @router.get('/connection_pool_info') def connection_pool_info(): """ From a5045be3d048df0c29c5cc7a9f90bf55623ba558 Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:43:21 -0400 Subject: [PATCH 5/8] perf(clients): raise gRPC message caps to 512MB; harden cluster-distance sort - Triton gRPC send/receive caps 100MB -> 512MB: raw detector heads can emit hundreds of MB per max-batch response; 100MB rejected legitimate full-batch replies. - cluster_distance sort gains missing:_last + unmapped_type:double so paging a cluster tolerates docs without the field and freshly created indices without the mapping (previously a 400 shard failure). --- src/clients/opensearch.py | 10 +++++++++- src/clients/triton_pool.py | 12 ++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/clients/opensearch.py b/src/clients/opensearch.py index 98c9e3b..97ce9a6 100644 --- a/src/clients/opensearch.py +++ b/src/clients/opensearch.py @@ -1790,7 +1790,15 @@ async def get_cluster_members( } if sort_by_distance: - query['sort'] = [{'cluster_distance': 'asc'}] + query['sort'] = [ + { + 'cluster_distance': { + 'order': 'asc', + 'missing': '_last', + 'unmapped_type': 'double', + } + } + ] response = await self.client.search(index=index_name.value, body=query) diff --git a/src/clients/triton_pool.py b/src/clients/triton_pool.py index 28c6419..e164f36 100644 --- a/src/clients/triton_pool.py +++ b/src/clients/triton_pool.py @@ -54,10 +54,14 @@ ('grpc.keepalive_time_ms', 30000), # Send keepalive ping every 30s ('grpc.keepalive_timeout_ms', 10000), # Wait 10s for keepalive response ('grpc.keepalive_permit_without_calls', 1), # Allow keepalive when idle - # Handle large tensors (embeddings, image batches) - # 100MB limit handles batch=128 x 512-dim embeddings + overhead - ('grpc.max_send_message_length', 100 * 1024 * 1024), # 100MB - ('grpc.max_receive_message_length', 100 * 1024 * 1024), # 100MB + # Handle large tensors (embeddings, image batches, raw detector heads). + # The ceiling is dominated by *output* size, not input: a raw + # (non-end2end) detector head can emit hundreds of MB for a full + # max-batch chunk (e.g. (N, ~100k anchors, 85) FP32). 512 MB fits a + # full max_batch response with comfortable headroom; client-side + # chunking already keeps inputs well under this. + ('grpc.max_send_message_length', 512 * 1024 * 1024), # 512MB + ('grpc.max_receive_message_length', 512 * 1024 * 1024), # 512MB # Connection pooling - allow many concurrent streams per connection ('grpc.max_concurrent_streams', 1000), # HTTP/2 flow control optimization - relaxed for high-concurrency batch processing From ea2715e215b87fd348e777b5b93cdfbc6e50b7f2 Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:44:19 -0400 Subject: [PATCH 6/8] perf(triton): raise batch queue delay to 25ms; 3 YOLO instances Under sustained ingest the per-image arrival rate is too slow to fill preferred-size batches within 5ms, so Triton fired near batch=1. 25ms reaches batch 8-16 within latency budget; a third YOLO instance (~1.5 GB) pipelines concurrent ingest batches through more GPU streams. --- models/scrfd_10g_bnkps/config.pbtxt | 4 +++- models/yolov11_small_trt_end2end/config.pbtxt | 10 ++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/models/scrfd_10g_bnkps/config.pbtxt b/models/scrfd_10g_bnkps/config.pbtxt index f056bf6..ee2b34c 100644 --- a/models/scrfd_10g_bnkps/config.pbtxt +++ b/models/scrfd_10g_bnkps/config.pbtxt @@ -69,7 +69,9 @@ output [ dynamic_batching { preferred_batch_size: [ 8, 16, 32 ] - max_queue_delay_microseconds: 5000 + # 25ms (was 5ms) — same rationale as yolov11: per-image arrival rate + # under ingest load is too slow to fill preferred batches in 5ms. + max_queue_delay_microseconds: 25000 } instance_group [ diff --git a/models/yolov11_small_trt_end2end/config.pbtxt b/models/yolov11_small_trt_end2end/config.pbtxt index 38a5886..2b0648e 100644 --- a/models/yolov11_small_trt_end2end/config.pbtxt +++ b/models/yolov11_small_trt_end2end/config.pbtxt @@ -35,12 +35,18 @@ output [ dynamic_batching { preferred_batch_size: [ 8, 16, 32, 64 ] - max_queue_delay_microseconds: 5000 + # 25ms (was 5ms): under sustained ingest the per-image arrival rate is + # too slow to assemble preferred-size batches in 5ms, so Triton fired + # near batch=1. 25ms reaches batch 8-16 while staying well inside the + # request latency budget. + max_queue_delay_microseconds: 25000 } instance_group [ { - count: 2 + # 3 instances (~1.5 GB each) let concurrent ingest batches pipeline + # through more GPU streams. Drop to 2 on cards under ~8 GB. + count: 3 kind: KIND_GPU gpus: [ 0 ] } From d9377087e69617b8be44e58f8c6809535c61f7ee Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:45:43 -0400 Subject: [PATCH 7/8] feat(monitoring): dcgm-exporter all-GPU metrics + GPU dashboard dcgm-exporter publishes per-card utilization, VRAM, power, and temperature for every host GPU (read-only, no compute reservation); prometheus scrapes it and the new 'GPU Metrics' Grafana dashboard renders GPU + host panels. --- docker-compose.yml | 22 ++++ .../dashboards/gpu-metrics-dashboard.json | 111 ++++++++++++++++++ monitoring/prometheus.yml | 6 + 3 files changed, 139 insertions(+) create mode 100644 monitoring/dashboards/gpu-metrics-dashboard.json diff --git a/docker-compose.yml b/docker-compose.yml index 2f70041..06dd323 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -147,6 +147,28 @@ services: networks: - triton_net + # Per-GPU utilization / VRAM / power / temperature for Grafana across ALL + # host GPUs (not just Triton's). Read-only; does not reserve compute. + dcgm-exporter: + image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.0-ubuntu22.04 + container_name: triton-dcgm-exporter + restart: always + cap_add: + - SYS_ADMIN + environment: + - DCGM_EXPORTER_LISTEN=:9400 + ports: + - 4610:9400 # DCGM GPU metrics + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu, utility] + networks: + - triton_net + prometheus: image: prom/prometheus:latest container_name: triton-prometheus diff --git a/monitoring/dashboards/gpu-metrics-dashboard.json b/monitoring/dashboards/gpu-metrics-dashboard.json new file mode 100644 index 0000000..bd23274 --- /dev/null +++ b/monitoring/dashboards/gpu-metrics-dashboard.json @@ -0,0 +1,111 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "refresh": "10s", + "schemaVersion": 39, + "tags": ["gpu", "system"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "GPU Metrics", + "uid": "gpu-metrics", + "version": 1, + "panels": [ + { + "type": "timeseries", + "title": "GPU Utilization (per card)", + "datasource": "Prometheus", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "DCGM_FI_DEV_GPU_UTIL", + "legendFormat": "GPU {{gpu}} {{modelName}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "GPU Memory Used (per card)", + "datasource": "Prometheus", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "fieldConfig": { "defaults": { "unit": "decmbytes", "min": 0 }, "overrides": [] }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "DCGM_FI_DEV_FB_USED", + "legendFormat": "GPU {{gpu}} {{modelName}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "GPU Power Usage (per card)", + "datasource": "Prometheus", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "watt", "min": 0 }, "overrides": [] }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "DCGM_FI_DEV_POWER_USAGE", + "legendFormat": "GPU {{gpu}} {{modelName}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "GPU Temperature (per card)", + "datasource": "Prometheus", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "fieldConfig": { "defaults": { "unit": "celsius", "min": 0 }, "overrides": [] }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "DCGM_FI_DEV_GPU_TEMP", + "legendFormat": "GPU {{gpu}} {{modelName}}", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "Host CPU Utilization", + "datasource": "Prometheus", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100)", + "legendFormat": "CPU busy %", + "refId": "A" + } + ] + }, + { + "type": "timeseries", + "title": "Host Memory Used %", + "datasource": "Prometheus", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] }, + "targets": [ + { + "datasource": "Prometheus", + "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", + "legendFormat": "memory used %", + "refId": "A" + } + ] + } + ] +} diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml index f9c5ea6..cc5fb54 100644 --- a/monitoring/prometheus.yml +++ b/monitoring/prometheus.yml @@ -26,6 +26,12 @@ scrape_configs: labels: group: 'system' + - job_name: 'dcgm' + static_configs: + - targets: ['dcgm-exporter:9400'] + labels: + group: 'gpu' + - job_name: 'loki' static_configs: - targets: ['loki:3100'] From 219f31b8cbedae41876480ca29fac0480db4b46f Mon Sep 17 00:00:00 2001 From: davidamacey Date: Fri, 3 Jul 2026 23:49:07 -0400 Subject: [PATCH 8/8] test: pytest scaffolding + integration tests for health, metrics, request-id, scrape config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - conftest.py excludes standalone live-deployment scripts from pytest collection (they define test_*(name,...) helpers pytest miscollects). - Health probe tests run against the router with probes monkeypatched — no live Triton/OpenSearch needed. - Metrics + request-id tests exercise the real app middleware wiring. - Scrape-config test pins the prometheus job topology and validates every static target against docker-compose services. - Synthetic fixture image (generated shapes, 640x480). --- pyproject.toml | 3 + tests/conftest.py | 19 +++ tests/fixtures/sample_image.jpg | Bin 0 -> 6037 bytes tests/integration/__init__.py | 0 tests/integration/test_health_endpoints.py | 134 ++++++++++++++++++ tests/integration/test_metrics_exposure.py | 42 ++++++ .../test_prometheus_scrape_config.py | 55 +++++++ .../test_request_id_propagation.py | 51 +++++++ 8 files changed, 304 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/sample_image.jpg create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/test_health_endpoints.py create mode 100644 tests/integration/test_metrics_exposure.py create mode 100644 tests/integration/test_prometheus_scrape_config.py create mode 100644 tests/integration/test_request_id_propagation.py diff --git a/pyproject.toml b/pyproject.toml index bc22bf1..3514fcf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -283,6 +283,9 @@ addopts = [ "--cov-report=term-missing", "--cov-report=html", ] +markers = [ + "integration: tests that exercise several components together (no live stack required)", +] # ============================================================================= # Coverage Configuration diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..ef8b8dd --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,19 @@ +"""Pytest configuration for the test suite. + +Some files under ``tests/`` are standalone smoke-test *scripts* meant to be +run against a live deployment (``python tests/test_full_system.py``), not +pytest suites. They define helper functions named ``test_*(name, ...)`` that +pytest would otherwise miscollect as test cases — failing with +``fixture 'name' not found``. Exclude them from collection here; run them +directly per the project README / CLAUDE.md. +""" + +from __future__ import annotations + + +collect_ignore = [ + 'test_full_system.py', + 'test_scrfd_pipeline.py', + 'test_validate_models.py', + 'validate_visual_results.py', +] diff --git a/tests/fixtures/sample_image.jpg b/tests/fixtures/sample_image.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cb02faafab3a030240e0130d75eb1f34cf5eb601 GIT binary patch literal 6037 zcmeH~T}&KR6vxl(%+A8Xvb!ux7YJE_Qm5sCl(tfNU<3r(8pSPD)CV_ht=4KpD%#dq z2B_M)(U7)Ov^9Y)D&WfoEv1@peO)}T!2H62MkS`BGfYsqh!@At5mX)Vc{xuRHQa4 zDpI4->SGOBol&pR7~&1a6{fhjIBj%7VuC3#))Z$7O+t~Otdfmj*$9(Pqch!f!%onL zQ7%O#O(lauPtke`yTDBHR8mU~er=S3CP}Ko!qpMvg*F{1sHNIio5p zCim#+)(hb&>$XlCckJy|$F5vIlIjjgTT-?>Vb%Xi*>A#r*EI-{G({GV)&mdjc+TE2 z;MLh}9LOQpK=y#_f&a||9Ra(jLpbs^!h=?CF~XT_gkS+e6M{D#;n`PI`3JrEW_O%0 z6@joun6Hu=g+7Ep9m3gc+D;~@S$tERQzD$;{r4bP#7P9{K7=7N)oZP=6b1^czG{S? z`e1|5JUoklkcx1vrgFj|a*a|-dN4;AFOt6Houxhj;TXbvcV~IYaIFPFUnez)9+Jy$ z^HgS#N4O;Tce=9DuJ>?%N?JRYAk`y`iNV(CMB~DEjL@|Up*cO!f-tD^y@gPO;IT>r zi%~Ye$HtFlAUIb@+ejYt&8W4d&Cj?LHm>1ThgRh5HYtg(P;>Ume+sfJwlx3#07J&W ztqxLN&M|6Oa4o~rv@46Vi|sX?xpGPWW~tpe~-EZa6?FIds@JgYW?FDV3bW;bbg7su7%}5&`?Qi@^f~ z>t=5z!XAQj0(P^T6{pDUYJ%Jm={!PTP2dzlTXw+SK@+SWvd$$S)FSwwM|fPA?3V5) zIIXX+SgGE^K#n8C)gB>uPH^zER}fZ_j}(cVqjIA04+-=A^}aHM FastAPI: + """Build a minimal FastAPI app that mounts ONLY the health router. + + Avoids pulling in the full src.main lifespan (Triton client pool, + OpenSearch warm-up) which a unit-level probe test must not require. + """ + from src.routers.health import router as health_router + + app = FastAPI() + app.include_router(health_router) + return app + + +@pytest.fixture +def app() -> FastAPI: + return _build_app() + + +@pytest.fixture +def client(app: FastAPI) -> TestClient: + return TestClient(app) + + +def _patch_probes( + monkeypatch: pytest.MonkeyPatch, + *, + triton: tuple[bool, str], + opensearch: tuple[bool, str], +) -> None: + """Replace each per-service probe with a deterministic stub.""" + from src.routers import health as health_module + + async def _stub_triton() -> tuple[bool, str]: + return triton + + async def _stub_http(url: str) -> tuple[bool, str]: + return opensearch + + monkeypatch.setattr(health_module, '_probe_triton', _stub_triton) + monkeypatch.setattr(health_module, '_probe_http', _stub_http) + + +def test_live_returns_200_even_when_deps_down( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + """Liveness must NOT be coupled to dependency state.""" + _patch_probes( + monkeypatch, + triton=(False, 'is_server_live timeout after 2.0s'), + opensearch=(False, 'timeout after 2.0s'), + ) + response = client.get('/live') + assert response.status_code == 200 + assert response.json() == {'status': 'alive'} + + +def test_ready_returns_200_when_all_deps_up( + client: TestClient, monkeypatch: pytest.MonkeyPatch +) -> None: + _patch_probes( + monkeypatch, + triton=(True, 'is_server_live OK'), + opensearch=(True, 'HTTP 200'), + ) + response = client.get('/ready') + assert response.status_code == 200 + payload = response.json() + assert payload['status'] == 'ready' + assert payload['services']['triton']['ok'] is True + assert payload['services']['opensearch']['ok'] is True + + +@pytest.mark.parametrize( + ('triton_ok', 'opensearch_ok', 'down_service'), + [ + (False, True, 'triton'), + (True, False, 'opensearch'), + (False, False, 'triton'), + ], +) +def test_ready_returns_503_with_detail_when_a_dep_is_down( + client: TestClient, + monkeypatch: pytest.MonkeyPatch, + triton_ok: bool, + opensearch_ok: bool, + down_service: str, +) -> None: + _patch_probes( + monkeypatch, + triton=(triton_ok, 'is_server_live OK' if triton_ok else 'ConnectError: refused'), + opensearch=(opensearch_ok, 'HTTP 200' if opensearch_ok else 'timeout after 2.0s'), + ) + response = client.get('/ready') + assert response.status_code == 503 + payload = response.json() + assert payload['status'] == 'not_ready' + assert payload['services'][down_service]['ok'] is False + # Per-service detail string must carry the failure reason. + assert payload['services'][down_service]['detail'] + + +def test_health_is_alias_of_ready(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None: + _patch_probes( + monkeypatch, + triton=(True, 'is_server_live OK'), + opensearch=(False, 'timeout after 2.0s'), + ) + ready = client.get('/ready') + health = client.get('/health') + assert health.status_code == ready.status_code == 503 + assert health.json()['services'].keys() == ready.json()['services'].keys() diff --git a/tests/integration/test_metrics_exposure.py b/tests/integration/test_metrics_exposure.py new file mode 100644 index 0000000..af428a9 --- /dev/null +++ b/tests/integration/test_metrics_exposure.py @@ -0,0 +1,42 @@ +"""/metrics exposition + HTTP duration histogram behavior pins. + +Verifies that a request through the real app records into +``http_request_duration_seconds`` with the matched route template, and +that ``/metrics`` renders a Prometheus exposition payload containing it. +""" + +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + + +pytestmark = pytest.mark.integration + + +@pytest.fixture(scope='module') +def client() -> TestClient: + from src.main import app + + return TestClient(app) + + +def test_request_is_observed_with_route_template(client: TestClient) -> None: + assert client.get('/live').status_code == 200 + body = client.get('/metrics').text + assert 'http_request_duration_seconds' in body + assert 'route="/live"' in body + + +def test_metrics_content_type_is_prometheus_exposition(client: TestClient) -> None: + response = client.get('/metrics') + assert response.status_code == 200 + assert response.headers['content-type'].startswith('text/plain') + + +def test_unmatched_path_uses_low_cardinality_fallback(client: TestClient) -> None: + assert client.get('/v1/nonexistent/abc123/deadbeef').status_code == 404 + body = client.get('/metrics').text + # Only the first two path segments may appear — never the random tail. + assert 'route="/v1/nonexistent"' in body + assert 'abc123' not in body diff --git a/tests/integration/test_prometheus_scrape_config.py b/tests/integration/test_prometheus_scrape_config.py new file mode 100644 index 0000000..6dcc4f8 --- /dev/null +++ b/tests/integration/test_prometheus_scrape_config.py @@ -0,0 +1,55 @@ +"""Pins for monitoring/prometheus.yml scrape topology. + +Guards against scrape jobs silently disappearing (dashboards go blank +without an error anywhere) and against jobs pointing at services that +do not exist in docker-compose.yml. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + + +pytestmark = pytest.mark.integration + +_REPO_ROOT = Path(__file__).resolve().parents[2] + + +@pytest.fixture(scope='module') +def prometheus_config() -> dict: + with (_REPO_ROOT / 'monitoring' / 'prometheus.yml').open() as fh: + return yaml.safe_load(fh) + + +@pytest.fixture(scope='module') +def compose_services() -> set[str]: + with (_REPO_ROOT / 'docker-compose.yml').open() as fh: + compose = yaml.safe_load(fh) + return set(compose.get('services', {})) + + +def _jobs(prometheus_config: dict) -> dict[str, dict]: + return {job['job_name']: job for job in prometheus_config['scrape_configs']} + + +def test_expected_scrape_jobs_present(prometheus_config: dict) -> None: + jobs = _jobs(prometheus_config) + for expected in ('triton', 'yolo-api', 'node', 'dcgm', 'loki'): + assert expected in jobs, f'scrape job {expected!r} missing from prometheus.yml' + + +def test_scrape_targets_reference_compose_services( + prometheus_config: dict, compose_services: set[str] +) -> None: + """Every static target host must be a service defined in compose.""" + for job in prometheus_config['scrape_configs']: + for static in job.get('static_configs', []): + for target in static.get('targets', []): + host = target.split(':')[0] + assert host in compose_services, ( + f'job {job["job_name"]!r} scrapes {host!r}, ' + 'which is not a docker-compose service' + ) diff --git a/tests/integration/test_request_id_propagation.py b/tests/integration/test_request_id_propagation.py new file mode 100644 index 0000000..da046a1 --- /dev/null +++ b/tests/integration/test_request_id_propagation.py @@ -0,0 +1,51 @@ +"""X-Request-ID correlation behavior pins. + +A client-supplied ``X-Request-ID`` must be echoed back and bound to the +logging context for the duration of the request; when absent, the +middleware must generate one. + +Uses the real application object so the middleware wiring itself is under +test; TestClient is used without its context manager so the lifespan +(Triton pool, OpenSearch warm-up) never runs. +""" + +from __future__ import annotations + +import pytest +from fastapi.testclient import TestClient + + +pytestmark = pytest.mark.integration + + +@pytest.fixture(scope='module') +def client() -> TestClient: + from src.main import app + + return TestClient(app) + + +def test_client_supplied_request_id_is_echoed(client: TestClient) -> None: + response = client.get('/live', headers={'X-Request-ID': 'test-corr-123'}) + assert response.status_code == 200 + assert response.headers['X-Request-ID'] == 'test-corr-123' + + +def test_request_id_is_generated_when_absent(client: TestClient) -> None: + response = client.get('/live') + assert response.status_code == 200 + generated = response.headers.get('X-Request-ID') + assert generated + assert generated != '-' + + +def test_bind_request_id_updates_context() -> None: + from src.core.logging import bind_request_id, clear_request_id, get_request_id + + assert get_request_id() == '-' + bind_request_id('ctx-abc') + try: + assert get_request_id() == 'ctx-abc' + finally: + clear_request_id() + assert get_request_id() == '-'