From aa833b94a6732edf33eba48eb2e0359f3137e939 Mon Sep 17 00:00:00 2001
From: rkritika1508 <rkritika1508@gmail.com>
Date: Mon, 8 Jun 2026 17:13:50 +0530
Subject: [PATCH 1/3] Topic relevance: return rich LLM metadata; allowlist JSON-capable models

---
 backend/app/core/validators/llm_utils.py      | 18 ++++-
 .../prompts/topic_relevance_llm/v1.md         | 11 ++-
 .../prompts/topic_relevance_llm/v2.md         | 11 ++-
 .../prompts/topic_relevance_llm/v3.md         | 11 ++-
 .../core/validators/topic_relevance_llm.py    | 45 +++++++++--
 .../validators/test_topic_relevance_llm.py    | 77 ++++++++++++++++++-
 6 files changed, 159 insertions(+), 14 deletions(-)
diff --git a/backend/app/core/validators/llm_utils.py b/backend/app/core/validators/llm_utils.py
index 3db3455..cd1f570 100644
--- a/backend/app/core/validators/llm_utils.py
+++ b/backend/app/core/validators/llm_utils.py
@@ -3,12 +3,28 @@
 # Passed to litellm/OpenAI to force a strict JSON object response.
 JSON_OBJECT_RESPONSE_FORMAT = {"type": "json_object"}
 
+# Models known to support JSON-object response_format that litellm may not list yet.
+_KNOWN_JSON_CAPABLE_MODELS = frozenset(
+    {
+        "gpt-4o-mini",
+        "gpt-4o",
+        "gpt-5-mini",
+        "gpt-5.4-mini",
+        "gpt-5.4-nano",
+        "gpt-5-nano",
+    }
+)
+
 
 def supports_response_format(model: str) -> bool:
     """Return True if the given model supports the OpenAI ``response_format`` param.
 
-    Falls back to False if litellm cannot resolve the model's capabilities.
+    Checks a static allowlist first (covers newly released models that litellm may
+    not enumerate yet), then asks litellm; returns False if litellm cannot resolve it.
     """
+    model_id = model.split("/")[-1]  # strip optional provider prefix, e.g. "openai/"
+    if model_id in _KNOWN_JSON_CAPABLE_MODELS:
+        return True
     try:
         return "response_format" in (get_supported_openai_params(model=model) or [])
     except Exception:
diff --git a/backend/app/core/validators/prompts/topic_relevance_llm/v1.md b/backend/app/core/validators/prompts/topic_relevance_llm/v1.md
index ffabc12..a0bd2c8 100644
--- a/backend/app/core/validators/prompts/topic_relevance_llm/v1.md
+++ b/backend/app/core/validators/prompts/topic_relevance_llm/v1.md
@@ -14,4 +14,13 @@ Score using:
 2 = partially related (tangentially related or implicitly within scope)
 1 = clearly outside scope (no relation to any listed topic)
 
-Respond ONLY with a JSON object in this exact format: {"scope_violation": <score>} where <score> is the integer 1, 2, or 3.
+Respond ONLY with a JSON object in this exact format, keeping "interpreted_meaning" and "reasoning" concise (one short sentence each):
+
+{
+  "interpreted_meaning": "<what the query is actually asking for>",
+  "reasoning": "<why this score was assigned>",
+  "scope_violation": <score>,
+  "classification_confidence_score": "<low|medium|high>"
+}
+
+where <score> is the integer 1, 2, or 3.
diff --git a/backend/app/core/validators/prompts/topic_relevance_llm/v2.md b/backend/app/core/validators/prompts/topic_relevance_llm/v2.md
index db56047..e04f111 100644
--- a/backend/app/core/validators/prompts/topic_relevance_llm/v2.md
+++ b/backend/app/core/validators/prompts/topic_relevance_llm/v2.md
@@ -14,4 +14,13 @@ Score using:
 2 = partially related or ambiguous (could be interpreted as related to forbidden, but not explicit)
 1 = clearly forbidden (matches a forbidden topic description)
 
-Respond ONLY with a JSON object in this exact format: {"scope_violation": <score>} where <score> is the integer 1, 2, or 3.
+Respond ONLY with a JSON object in this exact format, keeping "interpreted_meaning" and "reasoning" concise (one short sentence each):
+
+{
+  "interpreted_meaning": "<what the query is actually asking for>",
+  "reasoning": "<why this score was assigned>",
+  "scope_violation": <score>,
+  "classification_confidence_score": "<low|medium|high>"
+}
+
+where <score> is the integer 1, 2, or 3.
diff --git a/backend/app/core/validators/prompts/topic_relevance_llm/v3.md b/backend/app/core/validators/prompts/topic_relevance_llm/v3.md
index 47ba77c..1adca03 100644
--- a/backend/app/core/validators/prompts/topic_relevance_llm/v3.md
+++ b/backend/app/core/validators/prompts/topic_relevance_llm/v3.md
@@ -14,4 +14,13 @@ Score using:
 2 = ambiguous or partially related (uncertain, could plausibly relate to either allowed or forbidden topics, or only tangentially related)
 1 = clearly outside scope (directly matches a forbidden topic description)
 
-Respond ONLY with a JSON object in this exact format: {"scope_violation": <score>} where <score> is the integer 1, 2, or 3.
+Respond ONLY with a JSON object in this exact format, keeping "interpreted_meaning" and "reasoning" concise (one short sentence each):
+
+{
+  "interpreted_meaning": "<what the query is actually asking for>",
+  "reasoning": "<why this score was assigned>",
+  "scope_violation": <score>,
+  "classification_confidence_score": "<low|medium|high>"
+}
+
+where <score> is the integer 1, 2, or 3.
diff --git a/backend/app/core/validators/topic_relevance_llm.py b/backend/app/core/validators/topic_relevance_llm.py
index 69700a1..c2392de 100644
--- a/backend/app/core/validators/topic_relevance_llm.py
+++ b/backend/app/core/validators/topic_relevance_llm.py
@@ -25,10 +25,37 @@
 
 _PROMPTS_DIR = Path(__file__).parent / "prompts" / "topic_relevance_llm"
 
+
+def _extract_first_json_object(text: str) -> dict:
+    """Find and parse the first complete JSON object in *text*.
+
+    Uses brace-depth tracking so it handles values that themselves contain
+    curly braces (e.g. the ``reasoning`` field in the richer response format).
+    """
+    depth = 0
+    start = None
+    for i, ch in enumerate(text):
+        if ch == "{":
+            if depth == 0:
+                start = i
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0 and start is not None:
+                return json.loads(text[start : i + 1])
+    raise ValueError("no JSON object found in response")
+
+
 # Valid scope scores returned by the model; the highest means "clearly in scope".
 _VALID_SCORES = (1, 2, 3)
-# Cap the response: a single ``{"scope_violation": <score>}`` object is tiny.
-_MAX_TOKENS = 50
+# Extra fields the model may return alongside scope_violation.
+_OPTIONAL_FIELDS = (
+    "interpreted_meaning",
+    "reasoning",
+    "classification_confidence_score",
+)
+# Budget for the richer 4-field JSON response.
+_MAX_TOKENS = 300
 
 
 @lru_cache(maxsize=8)
@@ -123,10 +150,7 @@ def _validate(
 
         try:
             text = re.sub(r"```(?:json)?\s*|\s*```", "", content).strip()
-            match = re.search(r"\{[^{}]*\}", text)
-            if not match:
-                raise ValueError("no JSON object found in response")
-            data = json.loads(match.group())
+            data = _extract_first_json_object(text)
             score = data.get("scope_violation")
             # `type(score) is not int` (not isinstance) deliberately rejects bool,
             # which is an int subclass, so `true`/`false` are treated as invalid.
@@ -137,10 +161,15 @@ def _validate(
                 error_message=f"LLM returned an unparseable response: {e}. Raw: {content!r}"
             )
 
+        meta: dict = {"scope_score": score}
+        for field in _OPTIONAL_FIELDS:
+            if field in data:
+                meta[field] = data[field]
+
         if score >= self.threshold:
-            return PassResult(value=value, metadata={"scope_score": score})
+            return PassResult(value=value, metadata=meta)
 
         return FailResult(
             error_message=TOPIC_OUT_OF_SCOPE_ERROR,
-            metadata={"scope_score": score},
+            metadata=meta,
         )
diff --git a/backend/app/tests/validators/test_topic_relevance_llm.py b/backend/app/tests/validators/test_topic_relevance_llm.py
index 926186e..bf7b010 100644
--- a/backend/app/tests/validators/test_topic_relevance_llm.py
+++ b/backend/app/tests/validators/test_topic_relevance_llm.py
@@ -217,6 +217,73 @@ def test_passes_when_response_has_surrounding_prose(validator):
     assert result.metadata["scope_score"] == 2
 
 
+# ---------------------------------------------------------------------------
+# Richer 4-field response format
+# ---------------------------------------------------------------------------
+
+_RICH_PASS = (
+    '{"interpreted_meaning": "How to cook pasta",'
+    ' "reasoning": "Directly about cooking.",'
+    ' "scope_violation": 3,'
+    ' "classification_confidence_score": "high"}'
+)
+
+_RICH_FAIL = (
+    '{"interpreted_meaning": "Latest cricket score",'
+    ' "reasoning": "Unrelated to cooking.",'
+    ' "scope_violation": 1,'
+    ' "classification_confidence_score": "high"}'
+)
+
+
+def test_passes_with_rich_format_and_exposes_extra_metadata(validator):
+    with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
+        mock_llm.return_value = _make_llm_response(_RICH_PASS)
+        result = validator._validate("How do I make pasta?")
+
+    assert isinstance(result, PassResult)
+    assert result.metadata["scope_score"] == 3
+    assert result.metadata["interpreted_meaning"] == "How to cook pasta"
+    assert result.metadata["reasoning"] == "Directly about cooking."
+    assert result.metadata["classification_confidence_score"] == "high"
+
+
+def test_fails_with_rich_format_and_exposes_extra_metadata(validator):
+    with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
+        mock_llm.return_value = _make_llm_response(_RICH_FAIL)
+        result = validator._validate("What is the latest cricket score?")
+
+    assert isinstance(result, FailResult)
+    assert result.metadata["scope_score"] == 1
+    assert result.metadata["interpreted_meaning"] == "Latest cricket score"
+    assert result.metadata["reasoning"] == "Unrelated to cooking."
+    assert result.metadata["classification_confidence_score"] == "high"
+
+
+def test_passes_when_rich_format_wrapped_in_markdown_fence(validator):
+    with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
+        mock_llm.return_value = _make_llm_response(f"```json\n{_RICH_PASS}\n```")
+        result = validator._validate("How do I make pasta?")
+
+    assert isinstance(result, PassResult)
+    assert result.metadata["scope_score"] == 3
+
+
+def test_reasoning_with_curly_braces_is_parsed_correctly(validator):
+    response = (
+        '{"interpreted_meaning": "A cooking query",'
+        ' "reasoning": "Query {clearly} fits cooking scope.",'
+        ' "scope_violation": 3,'
+        ' "classification_confidence_score": "high"}'
+    )
+    with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
+        mock_llm.return_value = _make_llm_response(response)
+        result = validator._validate("How do I make pasta?")
+
+    assert isinstance(result, PassResult)
+    assert result.metadata["scope_score"] == 3
+
+
 def test_fails_when_score_is_boolean(validator):
     with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
         mock_llm.return_value = _make_llm_response('{"scope_violation": true}')
@@ -263,11 +330,14 @@ def test_response_format_passed_when_supported():
 
 
 def test_response_format_omitted_when_not_supported():
+    # Use an unknown model so the static allowlist doesn't short-circuit.
     with patch(
         "app.core.validators.llm_utils.get_supported_openai_params",
         return_value=[],
     ):
-        validator = TopicRelevanceLLM(system_prompt=TOPIC_CONFIG)
+        validator = TopicRelevanceLLM(
+            system_prompt=TOPIC_CONFIG, llm_callable="unknown-model"
+        )
 
     with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
         mock_llm.return_value = _make_llm_response('{"scope_violation": 3}')
@@ -278,11 +348,14 @@ def test_response_format_omitted_when_not_supported():
 
 
 def test_response_format_omitted_when_litellm_check_fails():
+    # Use an unknown model so the static allowlist doesn't short-circuit.
     with patch(
         "app.core.validators.llm_utils.get_supported_openai_params",
         side_effect=Exception("litellm unavailable"),
     ):
-        validator = TopicRelevanceLLM(system_prompt=TOPIC_CONFIG)
+        validator = TopicRelevanceLLM(
+            system_prompt=TOPIC_CONFIG, llm_callable="unknown-model"
+        )
 
     with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
         mock_llm.return_value = _make_llm_response('{"scope_violation": 3}')

From 70cde6367ca142c86fc6e774a598f820d0cd31d1 Mon Sep 17 00:00:00 2001
From: rkritika1508 <rkritika1508@gmail.com>
Date: Tue, 9 Jun 2026 13:08:42 +0530
Subject: [PATCH 2/3] Parse LLM response with JSONDecoder.raw_decode; add live LLM tests

---
 .../core/validators/topic_relevance_llm.py    |  27 ++---
 backend/app/tests/pytest.ini                  |   5 +-
 .../validators/test_topic_relevance_llm.py    |   9 ++
 .../test_topic_relevance_llm_live.py          | 107 ++++++++++++++++++
 4 files changed, 127 insertions(+), 21 deletions(-)
 create mode 100644 backend/app/tests/validators/test_topic_relevance_llm_live.py

diff --git a/backend/app/core/validators/topic_relevance_llm.py b/backend/app/core/validators/topic_relevance_llm.py
index c2392de..866cdd6 100644
--- a/backend/app/core/validators/topic_relevance_llm.py
+++ b/backend/app/core/validators/topic_relevance_llm.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import json
-import re
 from functools import lru_cache
 from pathlib import Path
 from typing import Callable, Optional
@@ -27,23 +26,12 @@
 
 
 def _extract_first_json_object(text: str) -> dict:
-    """Find and parse the first complete JSON object in *text*.
-
-    Uses brace-depth tracking so it handles values that themselves contain
-    curly braces (e.g. the ``reasoning`` field in the richer response format).
-    """
-    depth = 0
-    start = None
-    for i, ch in enumerate(text):
-        if ch == "{":
-            if depth == 0:
-                start = i
-            depth += 1
-        elif ch == "}":
-            depth -= 1
-            if depth == 0 and start is not None:
-                return json.loads(text[start : i + 1])
-    raise ValueError("no JSON object found in response")
+    """Find and parse the first complete JSON object in *text*."""
+    start = text.find("{")
+    if start == -1:
+        raise ValueError("no JSON object found in response")
+    obj, _ = json.JSONDecoder().raw_decode(text, start)
+    return obj
 
 
 # Valid scope scores returned by the model; the highest means "clearly in scope".
@@ -149,8 +137,7 @@ def _validate(
             return FailResult(error_message=f"LLM call failed: {e}")
 
         try:
-            text = re.sub(r"```(?:json)?\s*|\s*```", "", content).strip()
-            data = _extract_first_json_object(text)
+            data = _extract_first_json_object(content)
             score = data.get("scope_violation")
             # `type(score) is not int` (not isinstance) deliberately rejects bool,
             # which is an int subclass, so `true`/`false` are treated as invalid.
diff --git a/backend/app/tests/pytest.ini b/backend/app/tests/pytest.ini
index d280de0..abb19b0 100644
--- a/backend/app/tests/pytest.ini
+++ b/backend/app/tests/pytest.ini
@@ -1,2 +1,5 @@
 [pytest]
-asyncio_mode = auto
\ No newline at end of file
+asyncio_mode = auto
+markers =
+    integration: tests that hit the full HTTP stack with a real database
+    llm_live: tests that make real LLM calls (require OPENAI_API_KEY)
\ No newline at end of file
diff --git a/backend/app/tests/validators/test_topic_relevance_llm.py b/backend/app/tests/validators/test_topic_relevance_llm.py
index bf7b010..601ddd2 100644
--- a/backend/app/tests/validators/test_topic_relevance_llm.py
+++ b/backend/app/tests/validators/test_topic_relevance_llm.py
@@ -206,6 +206,15 @@ def test_passes_when_response_wrapped_in_markdown_fence(validator):
     assert result.metadata["scope_score"] == 3
 
 
+def test_passes_when_response_wrapped_in_plain_markdown_fence(validator):
+    with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
+        mock_llm.return_value = _make_llm_response('```\n{"scope_violation": 3}\n```')
+        result = validator._validate("How do I make pasta?")
+
+    assert isinstance(result, PassResult)
+    assert result.metadata["scope_score"] == 3
+
+
 def test_passes_when_response_has_surrounding_prose(validator):
     with patch("app.core.validators.topic_relevance_llm.completion") as mock_llm:
         mock_llm.return_value = _make_llm_response(
diff --git a/backend/app/tests/validators/test_topic_relevance_llm_live.py b/backend/app/tests/validators/test_topic_relevance_llm_live.py
new file mode 100644
index 0000000..5571391
--- /dev/null
+++ b/backend/app/tests/validators/test_topic_relevance_llm_live.py
@@ -0,0 +1,107 @@
+"""
+Live integration tests for TopicRelevanceLLM — these call the real LLM and are
+skipped automatically when OPENAI_API_KEY is not set in the environment.
+
+Run them explicitly with:
+    pytest -m llm_live
+or in any environment that has OPENAI_API_KEY configured.
+"""
+import os
+
+import pytest
+from guardrails.validators import FailResult, PassResult
+
+from app.core.validators.topic_relevance_llm import TopicRelevanceLLM
+
+pytestmark = pytest.mark.llm_live
+
+_needs_key = pytest.mark.skipif(
+    not os.environ.get("OPENAI_API_KEY"),
+    reason="OPENAI_API_KEY not set — skipping live LLM tests",
+)
+
+_COOKING_SCOPE = "Only answer questions about cooking and recipes."
+_HEALTH_SCOPE = "Only answer questions about general health and wellness."
+
+
+@pytest.fixture(scope="module")
+def cooking_validator():
+    return TopicRelevanceLLM(system_prompt=_COOKING_SCOPE)
+
+
+@pytest.fixture(scope="module")
+def health_validator():
+    return TopicRelevanceLLM(system_prompt=_HEALTH_SCOPE)
+
+
+# ---------------------------------------------------------------------------
+# In-scope queries — model should return score >= threshold (PassResult)
+# ---------------------------------------------------------------------------
+
+
+@_needs_key
+def test_live_in_scope_query_passes(cooking_validator):
+    result = cooking_validator._validate("How do I make pasta carbonara?")
+
+    assert isinstance(result, PassResult)
+    assert result.metadata["scope_score"] >= 2
+
+
+@_needs_key
+def test_live_in_scope_query_exposes_score_metadata(cooking_validator):
+    result = cooking_validator._validate("What temperature should I bake bread at?")
+
+    assert isinstance(result, PassResult)
+    assert "scope_score" in result.metadata
+    assert result.metadata["scope_score"] in (1, 2, 3)
+
+
+# ---------------------------------------------------------------------------
+# Out-of-scope queries — model should return score < threshold (FailResult)
+# ---------------------------------------------------------------------------
+
+
+@_needs_key
+def test_live_out_of_scope_query_fails(cooking_validator):
+    result = cooking_validator._validate("What is the capital of France?")
+
+    assert isinstance(result, FailResult)
+    assert "outside the allowed topic scope" in result.error_message
+
+
+@_needs_key
+def test_live_out_of_scope_score_is_exposed_in_metadata(cooking_validator):
+    result = cooking_validator._validate("Who won the cricket World Cup?")
+
+    assert isinstance(result, FailResult)
+    assert "scope_score" in result.metadata
+    assert result.metadata["scope_score"] in (1, 2, 3)
+
+
+# ---------------------------------------------------------------------------
+# JSON response format — exercises _extract_first_json_object on real output
+# ---------------------------------------------------------------------------
+
+
+@_needs_key
+def test_live_response_parsed_without_error(health_validator):
+    """The LLM returns JSON that _extract_first_json_object must parse correctly,
+    regardless of whether the model wraps it in a markdown fence or adds prose."""
+    result = health_validator._validate("How much water should I drink per day?")
+
+    assert isinstance(result, (PassResult, FailResult))
+    assert "scope_score" in result.metadata
+
+
+@_needs_key
+def test_live_different_scope_gives_different_verdict(
+    cooking_validator, health_validator
+):
+    """A query off-topic for both scopes must fail both validators (same verdict)."""
+    query = "Explain quantum entanglement."
+
+    cooking_result = cooking_validator._validate(query)
+    health_result = health_validator._validate(query)
+
+    assert isinstance(cooking_result, FailResult)
+    assert isinstance(health_result, FailResult)

From e4ab54def05bd3b42c5411b091c316eaeab66f2a Mon Sep 17 00:00:00 2001
From: rkritika1508 <rkritika1508@gmail.com>
Date: Tue, 9 Jun 2026 13:14:14 +0530
Subject: [PATCH 3/3] Skip live LLM tests when OPENAI_API_KEY is missing or a placeholder

---
 .../app/tests/validators/test_topic_relevance_llm_live.py   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/app/tests/validators/test_topic_relevance_llm_live.py b/backend/app/tests/validators/test_topic_relevance_llm_live.py
index 5571391..622b13d 100644
--- a/backend/app/tests/validators/test_topic_relevance_llm_live.py
+++ b/backend/app/tests/validators/test_topic_relevance_llm_live.py
@@ -1,6 +1,6 @@
 """
 Live integration tests for TopicRelevanceLLM — these call the real LLM and are
-skipped automatically when OPENAI_API_KEY is not set in the environment.
+skipped automatically when OPENAI_API_KEY is not set or is a placeholder value.
 
 Run them explicitly with:
     pytest -m llm_live
@@ -16,8 +16,8 @@
 pytestmark = pytest.mark.llm_live
 
 _needs_key = pytest.mark.skipif(
-    not os.environ.get("OPENAI_API_KEY"),
-    reason="OPENAI_API_KEY not set — skipping live LLM tests",
+    not os.environ.get("OPENAI_API_KEY", "").startswith("sk-"),
+    reason="OPENAI_API_KEY not set or not a valid key — skipping live LLM tests",
 )
 
 _COOKING_SCOPE = "Only answer questions about cooking and recipes."