Skip to content
Empty file.
3,000 changes: 3,000 additions & 0 deletions application/tests/noise_filter/fixtures/candidate_commits.json

Large diffs are not rendered by default.

3,400 changes: 3,400 additions & 0 deletions application/tests/noise_filter/fixtures/labeled_data.json

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions application/tests/noise_filter/fixtures/module_a_mock.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:0", "artifact_id": "art:OWASP/ASVS:4.0/en/0x12-V3-Authentication.md", "pipeline_run_id": "20260201T020000Z", "text": "Authentication should use MFA", "span": {"index": 0, "total": 3, "heading_path": ["Authentication", "JWT"], "start_char_idx": 0, "end_char_idx": 98, "start_line": 10, "end_line": 12}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc123", "committed_at": "2026-02-01T01:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x12-V3-Authentication.md", "path": "4.0/en/0x12-V3-Authentication.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:1", "artifact_id": "art:OWASP/ASVS:4.0/en/0x13-V4-Access-Control.md", "pipeline_run_id": "20260201T020000Z", "text": "Access control should enforce principle of least privilege", "span": {"index": 1, "total": 5, "heading_path": ["Access Control", "Authorization"], "start_char_idx": 120, "end_char_idx": 198, "start_line": 15, "end_line": 18}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc124", "committed_at": "2026-02-01T02:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x13-V4-Access-Control.md", "path": "4.0/en/0x13-V4-Access-Control.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:2", "artifact_id": "art:OWASP/ASVS:4.0/en/0x14-V5-Validation.md", "pipeline_run_id": "20260201T020000Z", "text": "Input validation must be performed on all user-supplied data", "span": {"index": 2, "total": 4, "heading_path": ["Input Validation", "Server-Side Validation"], "start_char_idx": 200, "end_char_idx": 276, "start_line": 22, "end_line": 25}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc125", "committed_at": "2026-02-01T03:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x14-V5-Validation.md", "path": "4.0/en/0x14-V5-Validation.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:3", "artifact_id": "art:OWASP/ASVS:4.0/en/0x15-V6-Encoding.md", "pipeline_run_id": "20260201T020000Z", "text": "Output encoding should be context-aware and properly applied", "span": {"index": 3, "total": 3, "heading_path": ["Output Encoding", "HTML Encoding"], "start_char_idx": 300, "end_char_idx": 375, "start_line": 30, "end_line": 33}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc126", "committed_at": "2026-02-01T04:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x15-V6-Encoding.md", "path": "4.0/en/0x15-V6-Encoding.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:4", "artifact_id": "art:OWASP/ASVS:4.0/en/0x16-V7-Cryptography.md", "pipeline_run_id": "20260201T020000Z", "text": "Use only strong cryptographic algorithms and adequate key lengths", "span": {"index": 4, "total": 6, "heading_path": ["Cryptography", "Algorithm Selection"], "start_char_idx": 400, "end_char_idx": 487, "start_line": 40, "end_line": 44}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc127", "committed_at": "2026-02-01T05:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x16-V7-Cryptography.md", "path": "4.0/en/0x16-V7-Cryptography.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:5", "artifact_id": "art:OWASP/ASVS:4.0/en/0x17-V8-Errors.md", "pipeline_run_id": "20260201T020000Z", "text": "Error handling should not expose sensitive information", "span": {"index": 5, "total": 4, "heading_path": ["Error Handling", "Information Disclosure"], "start_char_idx": 500, "end_char_idx": 568, "start_line": 50, "end_line": 53}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc128", "committed_at": "2026-02-01T06:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x17-V8-Errors.md", "path": "4.0/en/0x17-V8-Errors.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:6", "artifact_id": "art:OWASP/ASVS:4.0/en/0x18-V9-Communications.md", "pipeline_run_id": "20260201T020000Z", "text": "All communications must be encrypted using TLS 1.2 or higher", "span": {"index": 6, "total": 5, "heading_path": ["Communications Security", "Transport Layer"], "start_char_idx": 600, "end_char_idx": 682, "start_line": 60, "end_line": 64}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc129", "committed_at": "2026-02-01T07:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x18-V9-Communications.md", "path": "4.0/en/0x18-V9-Communications.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:7", "artifact_id": "art:OWASP/ASVS:4.0/en/0x19-V10-Malicious.md", "pipeline_run_id": "20260201T020000Z", "text": "Implement protections against malicious code execution", "span": {"index": 7, "total": 3, "heading_path": ["Malicious Code", "Code Injection"], "start_char_idx": 700, "end_char_idx": 768, "start_line": 70, "end_line": 73}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc130", "committed_at": "2026-02-01T08:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x19-V10-Malicious.md", "path": "4.0/en/0x19-V10-Malicious.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:8", "artifact_id": "art:OWASP/ASVS:4.0/en/0x20-V11-Logic.md", "pipeline_run_id": "20260201T020000Z", "text": "Business logic flaws should be identified through security testing", "span": {"index": 8, "total": 4, "heading_path": ["Business Logic", "Workflow Validation"], "start_char_idx": 800, "end_char_idx": 885, "start_line": 80, "end_line": 84}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc131", "committed_at": "2026-02-01T09:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x20-V11-Logic.md", "path": "4.0/en/0x20-V11-Logic.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:9", "artifact_id": "art:OWASP/ASVS:4.0/en/0x21-V12-Files.md", "pipeline_run_id": "20260201T020000Z", "text": "File uploads should be validated and stored securely", "span": {"index": 9, "total": 3, "heading_path": ["File Upload", "Storage Security"], "start_char_idx": 900, "end_char_idx": 967, "start_line": 90, "end_line": 93}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc132", "committed_at": "2026-02-01T10:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x21-V12-Files.md", "path": "4.0/en/0x21-V12-Files.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:10", "artifact_id": "art:OWASP/ASVS:4.0/en/0x22-V13-API.md", "pipeline_run_id": "20260201T020000Z", "text": "API endpoints must enforce authentication and rate limiting", "span": {"index": 10, "total": 4, "heading_path": ["API Security", "Authentication"], "start_char_idx": 1000, "end_char_idx": 1084, "start_line": 100, "end_line": 104}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc133", "committed_at": "2026-02-01T11:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x22-V13-API.md", "path": "4.0/en/0x22-V13-API.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:11", "artifact_id": "art:OWASP/ASVS:4.0/en/0x23-V14-Configuration.md", "pipeline_run_id": "20260201T020000Z", "text": "Configuration management should follow security best practices", "span": {"index": 11, "total": 5, "heading_path": ["Configuration", "Secrets Management"], "start_char_idx": 1100, "end_char_idx": 1181, "start_line": 110, "end_line": 115}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc134", "committed_at": "2026-02-01T12:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x23-V14-Configuration.md", "path": "4.0/en/0x23-V14-Configuration.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:12", "artifact_id": "art:OWASP/ASVS:4.0/en/0x24-V15-Authentication-Advanced.md", "pipeline_run_id": "20260201T020000Z", "text": "Password policies should enforce complexity and history requirements", "span": {"index": 12, "total": 4, "heading_path": ["Advanced Authentication", "Password Management"], "start_char_idx": 1200, "end_char_idx": 1289, "start_line": 120, "end_line": 124}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc135", "committed_at": "2026-02-01T13:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x24-V15-Authentication-Advanced.md", "path": "4.0/en/0x24-V15-Authentication-Advanced.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:13", "artifact_id": "art:OWASP/ASVS:4.0/en/0x25-V16-CSRF.md", "pipeline_run_id": "20260201T020000Z", "text": "CSRF tokens should be generated and validated for all state-changing requests", "span": {"index": 13, "total": 3, "heading_path": ["CSRF Protection", "Token Implementation"], "start_char_idx": 1300, "end_char_idx": 1394, "start_line": 130, "end_line": 133}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc136", "committed_at": "2026-02-01T14:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x25-V16-CSRF.md", "path": "4.0/en/0x25-V16-CSRF.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:14", "artifact_id": "art:OWASP/ASVS:4.0/en/0x26-V17-Session.md", "pipeline_run_id": "20260201T020000Z", "text": "Session management should use secure session tokens and HttpOnly cookies", "span": {"index": 14, "total": 4, "heading_path": ["Session Management", "Cookie Security"], "start_char_idx": 1400, "end_char_idx": 1489, "start_line": 140, "end_line": 144}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc137", "committed_at": "2026-02-01T15:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x26-V17-Session.md", "path": "4.0/en/0x26-V17-Session.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:15", "artifact_id": "art:OWASP/ASVS:4.0/en/0x27-V18-SQL-Injection.md", "pipeline_run_id": "20260201T020000Z", "text": "Parameterized queries must be used to prevent SQL injection attacks", "span": {"index": 15, "total": 3, "heading_path": ["SQL Injection Prevention", "Query Parameterization"], "start_char_idx": 1500, "end_char_idx": 1583, "start_line": 150, "end_line": 153}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc138", "committed_at": "2026-02-01T16:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x27-V18-SQL-Injection.md", "path": "4.0/en/0x27-V18-SQL-Injection.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:16", "artifact_id": "art:OWASP/ASVS:4.0/en/0x28-V19-Deserialization.md", "pipeline_run_id": "20260201T020000Z", "text": "Deserialization should use safe methods and validate all input data", "span": {"index": 16, "total": 3, "heading_path": ["Deserialization", "Object Deserialization"], "start_char_idx": 1600, "end_char_idx": 1680, "start_line": 160, "end_line": 163}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc139", "committed_at": "2026-02-01T17:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x28-V19-Deserialization.md", "path": "4.0/en/0x28-V19-Deserialization.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:17", "artifact_id": "art:OWASP/ASVS:4.0/en/0x29-V20-Dependency.md", "pipeline_run_id": "20260201T020000Z", "text": "Dependencies should be kept up to date and regularly scanned for vulnerabilities", "span": {"index": 17, "total": 4, "heading_path": ["Dependency Management", "Vulnerability Scanning"], "start_char_idx": 1700, "end_char_idx": 1795, "start_line": 170, "end_line": 174}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc140", "committed_at": "2026-02-01T18:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x29-V20-Dependency.md", "path": "4.0/en/0x29-V20-Dependency.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:18", "artifact_id": "art:OWASP/ASVS:4.0/en/0x30-V21-Logging.md", "pipeline_run_id": "20260201T020000Z", "text": "Security events must be logged and monitored for suspicious activity", "span": {"index": 18, "total": 4, "heading_path": ["Logging and Monitoring", "Event Logging"], "start_char_idx": 1800, "end_char_idx": 1885, "start_line": 180, "end_line": 184}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc141", "committed_at": "2026-02-01T19:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x30-V21-Logging.md", "path": "4.0/en/0x30-V21-Logging.md"}}
{"schema_version": "0.2.0", "chunk_id": "chk:art:OWASP/ASVS:…:19", "artifact_id": "art:OWASP/ASVS:4.0/en/0x31-V22-Mobile.md", "pipeline_run_id": "20260201T020000Z", "text": "Mobile applications must implement platform-specific security controls", "span": {"index": 19, "total": 4, "heading_path": ["Mobile Security", "Platform Controls"], "start_char_idx": 1900, "end_char_idx": 1982, "start_line": 190, "end_line": 194}, "source": {"type": "github", "repo": "OWASP/ASVS", "commit_sha": "abc142", "committed_at": "2026-02-01T20:00:00Z"}, "locator": {"kind": "repo_path", "id": "4.0/en/0x31-V22-Mobile.md", "path": "4.0/en/0x31-V22-Mobile.md"}}
Loading
Loading