davidamacey · davidamacey · Jul 4, 2026 · Jul 4, 2026 · Jul 4, 2026 · Jul 4, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -113,6 +113,23 @@ repos:
         language: system
         files: ^benchmarks/.*\.go$
         types: [go]
+      - id: max-file-size
+        name: Enforce 700 LOC max per source file (ratchet)
+        entry: python3 scripts/codegen/check_file_size.py --max 700
+        language: system
+        files: ^(src|scripts)/.*\.py$
+        # Ratchet: modules already over the cap are grandfathered until
+        # they get split. Do NOT add new entries here — split the module.
+        exclude: |
+          (?x)^(
+              src/ultralytics_patches/|
+              src/clients/opensearch\.py|
+              src/services/visual_search\.py|
+              src/services/face_identity\.py|
+              src/services/duplicate_detection\.py|
+              src/routers/ingest\.py|
+              src/routers/search\.py
+          )
 
   # Commit message linting (conventional commits)
   # Note: Disabled for now - requires separate hook installation with:

diff --git a/Dockerfile b/Dockerfile
@@ -87,8 +87,12 @@ USER appuser
 
 EXPOSE 8000
 
+# /live = process liveness only. /health now probes dependencies (Triton,
+# OpenSearch) and returns 503 while any is down — gating the container's
+# health on it would mark yolo-api unhealthy (and cascade via
+# depends_on: service_healthy) whenever a *downstream* dep degrades.
 HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
-    CMD curl -f http://localhost:8000/health || exit 1
+    CMD curl -f http://localhost:8000/live || exit 1
 
 # Production defaults (docker-compose.yml overrides workers, backlog, etc.)
 CMD ["uvicorn", "src.main:app", \

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -147,6 +147,28 @@ services:
     networks:
       - triton_net
 
+  # Per-GPU utilization / VRAM / power / temperature for Grafana across ALL
+  # host GPUs (not just Triton's). Read-only; does not reserve compute.
+  dcgm-exporter:
+    image: nvcr.io/nvidia/k8s/dcgm-exporter:3.3.5-3.4.0-ubuntu22.04
+    container_name: triton-dcgm-exporter
+    restart: always
+    cap_add:
+      - SYS_ADMIN
+    environment:
+      - DCGM_EXPORTER_LISTEN=:9400
+    ports:
+      - 4610:9400  # DCGM GPU metrics
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu, utility]
+    networks:
+      - triton_net
+
   prometheus:
     image: prom/prometheus:latest
     container_name: triton-prometheus

diff --git a/models/scrfd_10g_bnkps/config.pbtxt b/models/scrfd_10g_bnkps/config.pbtxt
@@ -69,7 +69,9 @@ output [
 
 dynamic_batching {
   preferred_batch_size: [ 8, 16, 32 ]
-  max_queue_delay_microseconds: 5000
+  # 25ms (was 5ms) — same rationale as yolov11: per-image arrival rate
+  # under ingest load is too slow to fill preferred batches in 5ms.
+  max_queue_delay_microseconds: 25000
 }
 
 instance_group [

diff --git a/models/yolov11_small_trt_end2end/config.pbtxt b/models/yolov11_small_trt_end2end/config.pbtxt
@@ -35,12 +35,18 @@ output [
 
 dynamic_batching {
   preferred_batch_size: [ 8, 16, 32, 64 ]
-  max_queue_delay_microseconds: 5000
+  # 25ms (was 5ms): under sustained ingest the per-image arrival rate is
+  # too slow to assemble preferred-size batches in 5ms, so Triton fired
+  # near batch=1. 25ms reaches batch 8-16 while staying well inside the
+  # request latency budget.
+  max_queue_delay_microseconds: 25000
 }
 
 instance_group [
   {
-    count: 2
+    # 3 instances (~1.5 GB each) let concurrent ingest batches pipeline
+    # through more GPU streams. Drop to 2 on cards under ~8 GB.
+    count: 3
     kind: KIND_GPU
     gpus: [ 0 ]
   }

diff --git a/monitoring/dashboards/gpu-metrics-dashboard.json b/monitoring/dashboards/gpu-metrics-dashboard.json
@@ -0,0 +1,111 @@
+{
+  "annotations": { "list": [] },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "refresh": "10s",
+  "schemaVersion": 39,
+  "tags": ["gpu", "system"],
+  "templating": { "list": [] },
+  "time": { "from": "now-1h", "to": "now" },
+  "timepicker": {},
+  "timezone": "",
+  "title": "GPU Metrics",
+  "uid": "gpu-metrics",
+  "version": 1,
+  "panels": [
+    {
+      "type": "timeseries",
+      "title": "GPU Utilization (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_GPU_UTIL",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "GPU Memory Used (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "fieldConfig": { "defaults": { "unit": "mbytes", "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_FB_USED",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "GPU Power Usage (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "watt", "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_POWER_USAGE",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "GPU Temperature (per card)",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "fieldConfig": { "defaults": { "unit": "celsius", "min": 0 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "DCGM_FI_DEV_GPU_TEMP",
+          "legendFormat": "GPU {{gpu}} {{modelName}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Host CPU Utilization",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m])) * 100)",
+          "legendFormat": "CPU busy %",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Host Memory Used %",
+      "datasource": "Prometheus",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "fieldConfig": { "defaults": { "unit": "percent", "min": 0, "max": 100 }, "overrides": [] },
+      "targets": [
+        {
+          "datasource": "Prometheus",
+          "expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
+          "legendFormat": "memory used %",
+          "refId": "A"
+        }
+      ]
+    }
+  ]
+}
diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml
@@ -13,12 +13,25 @@ scrape_configs:
         labels:
           group: 'triton_inference_server'
 
+  - job_name: 'yolo-api'
+    metrics_path: /metrics
+    static_configs:
+      - targets: ['yolo-api:8000']
+        labels:
+          group: 'api'
+
   - job_name: 'node'
     static_configs:
       - targets: ['node-exporter:9100']
         labels:
           group: 'system'
 
+  - job_name: 'dcgm'
+    static_configs:
+      - targets: ['dcgm-exporter:9400']
+        labels:
+          group: 'gpu'
+
   - job_name: 'loki'
     static_configs:
       - targets: ['loki:3100']
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
     "transformers>=4.30.0",
     "timm",
     "huggingface_hub",
+    "prometheus-client>=0.20",
 ]
 
 [project.optional-dependencies]
@@ -282,6 +283,9 @@ addopts = [
     "--cov-report=term-missing",
     "--cov-report=html",
 ]
+markers = [
+    "integration: tests that exercise several components together (no live stack required)",
+]
 
 # =============================================================================
 # Coverage Configuration

diff --git a/requirements.txt b/requirements.txt
@@ -49,6 +49,9 @@ pyyaml
 # Structured Logging
 structlog>=24.1.0  # Structured logging with context and JSON output
 
+# Metrics
+prometheus-client>=0.20  # /metrics exposition + HTTP latency histogram
+
 # OpenSearch for Visual Search
 opensearch-py>=2.3.0  # Async OpenSearch client with k-NN support
 transformers>=4.30.0  # CLIP tokenizer for text-to-image search

diff --git a/scripts/codegen/check_file_size.py b/scripts/codegen/check_file_size.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Pre-commit regression guard: cap per-file line count.
+
+Prevents source files from regrowing past a threshold so focused-module
+splits don't silently revert. Run by the ``max-file-size`` pre-commit
+hook over ``src/`` and ``scripts/``.
+
+The threshold is set by ``--max`` (default 700). Files listed in the
+hook's ``exclude`` regex are not passed in at all; this script's only
+job is the line-count check.
+
+Exit code 1 if any input file exceeds the cap; the violating files and
+their sizes are written to stderr.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def line_count(path: Path) -> int:
+    """Return the number of lines in ``path``.
+
+    The file is read in binary mode, so no text decoding is attempted. A
+    path that cannot be opened or read counts as 0 lines and therefore
+    never trips the cap.
+    """
+    try:
+        # Context manager closes the handle deterministically rather than
+        # leaving it to garbage collection.
+        with path.open('rb') as fh:
+            return sum(1 for _ in fh)
+    except OSError:
+        return 0
+
+
+def main() -> int:
+    """Check every file given on the command line against ``--max``.
+
+    Returns:
+        1 if at least one file exceeds the cap (violations are listed on
+        stderr), otherwise 0.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument('--max', type=int, default=700, help='Max LOC per file.')
+    parser.add_argument('files', nargs='*', type=Path)
+    args = parser.parse_args()
+
+    violations: list[tuple[Path, int]] = []
+    for p in args.files:
+        n = line_count(p)
+        if n > args.max:
+            violations.append((p, n))
+
+    if violations:
+        sys.stderr.write(
+            f'ERROR: {len(violations)} file(s) exceed the {args.max} LOC ceiling.\n'
+            'Split into focused sub-modules instead of growing monoliths.\n'
+            'Violations:\n'
+        )
+        for p, n in violations:
+            sys.stderr.write(f'  {p.as_posix()}: {n} lines (max {args.max})\n')
+        return 1
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/clients/opensearch.py b/src/clients/opensearch.py
@@ -1790,7 +1790,15 @@ async def get_cluster_members(
             }
 
             if sort_by_distance:
-                query['sort'] = [{'cluster_distance': 'asc'}]
+                query['sort'] = [
+                    {
+                        'cluster_distance': {
+                            'order': 'asc',
+                            'missing': '_last',
+                            'unmapped_type': 'double',
+                        }
+                    }
+                ]
 
             response = await self.client.search(index=index_name.value, body=query)
 

diff --git a/src/clients/triton_pool.py b/src/clients/triton_pool.py
@@ -54,10 +54,14 @@
     ('grpc.keepalive_time_ms', 30000),  # Send keepalive ping every 30s
     ('grpc.keepalive_timeout_ms', 10000),  # Wait 10s for keepalive response
     ('grpc.keepalive_permit_without_calls', 1),  # Allow keepalive when idle
-    # Handle large tensors (embeddings, image batches)
-    # 100MB limit handles batch=128 x 512-dim embeddings + overhead
-    ('grpc.max_send_message_length', 100 * 1024 * 1024),  # 100MB
-    ('grpc.max_receive_message_length', 100 * 1024 * 1024),  # 100MB
+    # Handle large tensors (embeddings, image batches, raw detector heads).
+    # The ceiling is dominated by *output* size, not input: a raw
+    # (non-end2end) detector head can emit hundreds of MB for a full
+    # max-batch chunk (e.g. (N, ~100k anchors, 85) FP32). 512 MB fits a
+    # full max_batch response with comfortable headroom; client-side
+    # chunking already keeps inputs well under this.
+    ('grpc.max_send_message_length', 512 * 1024 * 1024),  # 512MB
+    ('grpc.max_receive_message_length', 512 * 1024 * 1024),  # 512MB
     # Connection pooling - allow many concurrent streams per connection
     ('grpc.max_concurrent_streams', 1000),
     # HTTP/2 flow control optimization - relaxed for high-concurrency batch processing