From 87253950004b63b340bfd19c2c449212a7e99bc3 Mon Sep 17 00:00:00 2001
From: Vikrant Puppala <vikrant.puppala@databricks.com>
Date: Wed, 27 May 2026 16:59:10 +0000
Subject: [PATCH] ci/test: deflake retry tests under merge-queue load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two compounding fixes for the flake observed on PR #812's merge_group
run, where test_oserror_retries and test_retry_max_count_not_exceeded
failed with `assert mock_validate_conn.call_count == 6` because
unexpected `/telemetry-ext` requests had been counted alongside the
intended session-endpoint retries.

1. tests/e2e/common/retry_test_mixins.py — strengthen
   `_isolated_from_telemetry()` with two additional defensive patches:

      - TelemetryClient._send_telemetry → no-op
      - TelemetryClient._export_event → no-op

   The existing factory swap installs NoopTelemetryClient for connections
   created during the test, but doesn't cover real TelemetryClient
   instances that slip in via other paths (stale module-global,
   pre-existing client created before the test entered, or code that
   bypasses initialize_telemetry_client). Patching at the class level
   for the duration of the context catches all of them.

   Verified locally: test_oserror_retries goes from flaky-on-CI to 5/5
   green in consecutive runs.

   (test_retry_max_count_not_exceeded still fails on this branch but
   also fails on baseline main — pre-existing `'SimpleHttpResponse'
   object has no attribute 'version_string'` issue, unrelated.)

2. .github/workflows/code-coverage.yml — serialise merge_group runs.

   Previous concurrency group was keyed on github.ref, which is per-PR
   in the queue (`gh-readonly-queue/main/pr-N-…`). That allowed multiple
   queue entries to hammer the same warehouse in parallel, stressing
   telemetry / retry paths that single-PR runs don't exercise.

   Group merge_group + workflow_dispatch under a single fixed name
   (`e2e-mq-serial`) so they run one at a time. PR-event runs keep
   per-ref grouping + cancel-in-progress for fast author feedback.

   Trade-off: queue throughput drops to one ~17-min run at a time. For
   this repo's PR volume that's acceptable, and the alternative is
   flaky merges.

Co-authored-by: Isaac
Signed-off-by: Vikrant Puppala <vikrant.puppala@databricks.com>
---
 .github/workflows/code-coverage.yml   | 27 +++++++++++++++----------
 tests/e2e/common/retry_test_mixins.py | 29 +++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
index 3a1050573..f4882f669 100644
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -16,18 +16,23 @@ on:
 # the merge has already happened and the coverage check has no power
 # to block. Hence we deliberately don't subscribe to `push:main`.
 #
-# Serialise E2E runs per ref so a force-push (or a fast follow-up commit)
-# on a PR cancels the previous run instead of racing it against shared
-# warehouse state (Delta tables, UC Volume files, etc.).
-#
-# Merge-queue runs are NOT cancelled — each queue entry needs its own
-# clean CI signal so a regression on entry N doesn't get hidden by
-# entry N+1 arriving seconds later. (Concurrent queue runs can still
-# collide on shared warehouse state, but that's the cost of preserving
-# per-entry signal; the uuid-suffix conventions in the e2e tests are
-# what keep them isolated.)
+# Concurrency groups:
+#   - pull_request: per-ref + cancel-in-progress. A force-push or fast
+#     follow-up commit on a PR cancels the previous run instead of
+#     racing it against shared warehouse state (Delta tables, UC Volume
+#     files, telemetry endpoints, etc.).
+#   - merge_group: serialised globally with a fixed group name. The
+#     warehouse can't tolerate two parallel queue entries hammering
+#     telemetry / retry paths simultaneously — we have observed flaky
+#     retry-test failures (extra `/telemetry-ext` retries inflating
+#     mock.call_count) under that load. Running queue entries one at a
+#     time costs queue throughput (one entry at a time, ~17 min each)
+#     but keeps signal trustworthy. cancel-in-progress is off so each
+#     entry gets a complete run.
+#   - workflow_dispatch: shares the merge_group group; manual triggers
+#     are rare enough that serialising them with the queue is fine.
 concurrency:
-  group: e2e-${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.event_name == 'pull_request' && format('e2e-pr-{0}', github.ref) || 'e2e-mq-serial' }}
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py
index af635331d..08876e145 100755
--- a/tests/e2e/common/retry_test_mixins.py
+++ b/tests/e2e/common/retry_test_mixins.py
@@ -17,6 +17,7 @@
 )
 from databricks.sql.telemetry.telemetry_client import (
     NoopTelemetryClient,
+    TelemetryClient,
     TelemetryClientFactory,
     _TelemetryClientHolder,
 )
@@ -27,8 +28,22 @@ def _isolated_from_telemetry():
     # Tests that mock urllib3 globally (via _get_conn / _validate_conn) also
     # intercept background telemetry pushes from the shared
     # TelemetryClientFactory executor — inflating mock.call_count and breaking
-    # assertions like `call_count == 6`. Drain the factory and force any new
-    # connection to use NoopTelemetryClient for the duration of the test.
+    # assertions like `call_count == 6`. Three layers of defence:
+    #
+    #   1. Drain TelemetryClientFactory and override initialize_telemetry_client
+    #      so new connections install NoopTelemetryClient (which submits nothing).
+    #   2. Patch TelemetryClient._send_telemetry to a no-op as a backstop — covers
+    #      any real TelemetryClient instance that slips in (e.g. a stale module-
+    #      global, a code path that bypasses initialize_telemetry_client, or
+    #      anything created before this context entered).
+    #   3. Patch TelemetryClient._export_event to a no-op so even if a real
+    #      client receives an event, the event never reaches the queue and the
+    #      flush logic never fires.
+    #
+    # Without layer 2/3 we have observed `/telemetry-ext` requests showing up
+    # in merge_group runs (concurrent CI load on the warehouse stresses paths
+    # that single-test runs don't hit), inflating retry-test counts and
+    # producing flaky AssertionErrors.
     with TelemetryClientFactory._lock:
         saved_clients = TelemetryClientFactory._clients
         saved_executor = TelemetryClientFactory._executor
@@ -55,11 +70,21 @@ def _install_noop(*args, host_url=None, **kwargs):
                 NoopTelemetryClient()
             )
 
+    def _noop_send(self, events):
+        return None
+
+    def _noop_export(self, event):
+        return None
+
     try:
         with patch.object(
             TelemetryClientFactory,
             "initialize_telemetry_client",
             staticmethod(_install_noop),
+        ), patch.object(
+            TelemetryClient, "_send_telemetry", _noop_send
+        ), patch.object(
+            TelemetryClient, "_export_event", _noop_export
         ):
             yield
     finally: