From 87253950004b63b340bfd19c2c449212a7e99bc3 Mon Sep 17 00:00:00 2001 From: Vikrant Puppala Date: Wed, 27 May 2026 16:59:10 +0000 Subject: [PATCH] ci/test: deflake retry tests under merge-queue load MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two compounding fixes for the flake observed on PR #812's merge_group run, where test_oserror_retries and test_retry_max_count_not_exceeded failed with `assert mock_validate_conn.call_count == 6` because unexpected `/telemetry-ext` requests had been counted alongside the intended session-endpoint retries. 1. tests/e2e/common/retry_test_mixins.py — strengthen `_isolated_from_telemetry()` with two additional defensive patches: - TelemetryClient._send_telemetry → no-op - TelemetryClient._export_event → no-op The existing factory swap installs NoopTelemetryClient for connections created during the test, but doesn't cover real TelemetryClient instances that slip in via other paths (stale module-global, pre-existing client created before the test entered, or code that bypasses initialize_telemetry_client). Patching at the class level for the duration of the context catches all of them. Verified locally: test_oserror_retries goes from flaky-on-CI to 5/5 green in consecutive runs. (test_retry_max_count_not_exceeded still fails on this branch but also fails on baseline main — pre-existing `'SimpleHttpResponse' object has no attribute 'version_string'` issue, unrelated.) 2. .github/workflows/code-coverage.yml — serialise merge_group runs. Previous concurrency group was keyed on github.ref, which is per-PR in the queue (`gh-readonly-queue/main/pr-N-…`). That allowed multiple queue entries to hammer the same warehouse in parallel, stressing telemetry / retry paths that single-PR runs don't exercise. Group merge_group + workflow_dispatch under a single fixed name (`e2e-mq-serial`) so they run one at a time. PR-event runs keep per-ref grouping + cancel-in-progress for fast author feedback. Trade-off: queue throughput drops to one ~17-min run at a time. For this repo's PR volume that's acceptable, and the alternative is flaky merges. Co-authored-by: Isaac Signed-off-by: Vikrant Puppala --- .github/workflows/code-coverage.yml | 27 +++++++++++++++---------- tests/e2e/common/retry_test_mixins.py | 29 +++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 3a1050573..f4882f669 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -16,18 +16,23 @@ on: # the merge has already happened and the coverage check has no power # to block. Hence we deliberately don't subscribe to `push:main`. # -# Serialise E2E runs per ref so a force-push (or a fast follow-up commit) -# on a PR cancels the previous run instead of racing it against shared -# warehouse state (Delta tables, UC Volume files, etc.). -# -# Merge-queue runs are NOT cancelled — each queue entry needs its own -# clean CI signal so a regression on entry N doesn't get hidden by -# entry N+1 arriving seconds later. (Concurrent queue runs can still -# collide on shared warehouse state, but that's the cost of preserving -# per-entry signal; the uuid-suffix conventions in the e2e tests are -# what keep them isolated.) +# Concurrency groups: +# - pull_request: per-ref + cancel-in-progress. A force-push or fast +# follow-up commit on a PR cancels the previous run instead of +# racing it against shared warehouse state (Delta tables, UC Volume +# files, telemetry endpoints, etc.). +# - merge_group: serialised globally with a fixed group name. The +# warehouse can't tolerate two parallel queue entries hammering +# telemetry / retry paths simultaneously — we have observed flaky +# retry-test failures (extra `/telemetry-ext` retries inflating +# mock.call_count) under that load. Running queue entries one at a +# time costs queue throughput (one entry at a time, ~17 min each) +# but keeps signal trustworthy. cancel-in-progress is off so each +# entry gets a complete run. +# - workflow_dispatch: shares the merge_group group; manual triggers +# are rare enough that serialising them with the queue is fine. concurrency: - group: e2e-${{ github.workflow }}-${{ github.ref }} + group: ${{ github.event_name == 'pull_request' && format('e2e-pr-{0}', github.ref) || 'e2e-mq-serial' }} cancel-in-progress: ${{ github.event_name == 'pull_request' }} jobs: diff --git a/tests/e2e/common/retry_test_mixins.py b/tests/e2e/common/retry_test_mixins.py index af635331d..08876e145 100755 --- a/tests/e2e/common/retry_test_mixins.py +++ b/tests/e2e/common/retry_test_mixins.py @@ -17,6 +17,7 @@ ) from databricks.sql.telemetry.telemetry_client import ( NoopTelemetryClient, + TelemetryClient, TelemetryClientFactory, _TelemetryClientHolder, ) @@ -27,8 +28,22 @@ def _isolated_from_telemetry(): # Tests that mock urllib3 globally (via _get_conn / _validate_conn) also # intercept background telemetry pushes from the shared # TelemetryClientFactory executor — inflating mock.call_count and breaking - # assertions like `call_count == 6`. Drain the factory and force any new - # connection to use NoopTelemetryClient for the duration of the test. + # assertions like `call_count == 6`. Three layers of defence: + # + # 1. Drain TelemetryClientFactory and override initialize_telemetry_client + # so new connections install NoopTelemetryClient (which submits nothing). + # 2. Patch TelemetryClient._send_telemetry to a no-op as a backstop — covers + # any real TelemetryClient instance that slips in (e.g. a stale module- + # global, a code path that bypasses initialize_telemetry_client, or + # anything created before this context entered). + # 3. Patch TelemetryClient._export_event to a no-op so even if a real + # client receives an event, the event never reaches the queue and the + # flush logic never fires. + # + # Without layer 2/3 we have observed `/telemetry-ext` requests showing up + # in merge_group runs (concurrent CI load on the warehouse stresses paths + # that single-test runs don't hit), inflating retry-test counts and + # producing flaky AssertionErrors. with TelemetryClientFactory._lock: saved_clients = TelemetryClientFactory._clients saved_executor = TelemetryClientFactory._executor @@ -55,11 +70,21 @@ def _install_noop(*args, host_url=None, **kwargs): NoopTelemetryClient() ) + def _noop_send(self, events): + return None + + def _noop_export(self, event): + return None + try: with patch.object( TelemetryClientFactory, "initialize_telemetry_client", staticmethod(_install_noop), + ), patch.object( + TelemetryClient, "_send_telemetry", _noop_send + ), patch.object( + TelemetryClient, "_export_event", _noop_export ): yield finally: