From f2a2503ce12539721d5a33fb966f0e9abb30ac9e Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Tue, 2 Jun 2026 21:06:05 -0400 Subject: [PATCH] fix(nwis): forward get_record state, fix major-filter validation, reject bad format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several fixes in the deprecated `nwis` module, surfaced by the package review. - `get_record(state=…)` was accepted and documented but never forwarded. It now reaches the request as the NWIS `stateCd` major filter; previously it was silently dropped, producing a confusing "Bad Request" when used without `sites`. - The "must specify a major filter" validation was defeated: callers injected a filter key unconditionally (e.g. `kwargs["sites"] = kwargs.pop("sites", sites)` set the key to None when absent), so the membership-based guards in `query_waterservices`/`query_waterdata` always passed and a filterless request went out as a confusing "Bad Request" instead of the intended TypeError. The guards now require a filter that is present *and* non-None, rejecting an unset filter at the chokepoint — no per-getter pre-filtering needed. (`utils.query` already strips None-valued params, so this is purely a validation change.) - `get_dv`/`get_iv`/`get_discharge_peaks`/`get_stats` each parse a fixed response body but passed `format=` explicitly alongside `**kwargs`, so e.g. `get_dv(sites=…, format="rdb")` raised "multiple values for 'format'". A caller-supplied non-native `format` is now rejected with a clear ValueError via the shared `_reject_unexpected_format` helper (json for dv/iv, rdb for peaks/stats). - `get_ratings` is per-site; called without one it issued a request returning an unhelpful error page. It now fails fast with a clear TypeError (also covering `get_record(service="ratings")`). Adds regression tests for each. Full nwis suite passes; ruff and mypy --strict clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- dataretrieval/nwis.py | 48 ++++++++++++++++-- tests/nwis_test.py | 110 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 4 deletions(-) diff --git a/dataretrieval/nwis.py b/dataretrieval/nwis.py index 756d95f7..15b255ee 100644 --- a/dataretrieval/nwis.py +++ b/dataretrieval/nwis.py @@ -126,6 +126,21 @@ def _parse_json_or_raise(response: httpx.Response) -> pd.DataFrame: raise +def _reject_unexpected_format( + kwargs: dict[str, Any], func_name: str, expected: str +) -> None: + """Drop ``format`` from ``kwargs`` and reject any value other than ``expected``. + + These getters always request a fixed ``format`` and parse that specific body, + so a caller-supplied ``format`` would either collide with the explicit + ``format=`` argument or be silently overridden; reject it explicitly instead. + """ + if kwargs.pop("format", expected) != expected: + raise ValueError( + f"{func_name} returns {expected.upper()} and does not accept a `format`." + ) + + def format_response( df: pd.DataFrame, service: str | None = None, **kwargs: Any ) -> pd.DataFrame: @@ -277,6 +292,7 @@ def get_discharge_peaks( kwargs["end_date"] = kwargs.pop("end_date", end) kwargs["multi_index"] = multi_index + _reject_unexpected_format(kwargs, "get_discharge_peaks", "rdb") response = query_waterdata("peaks", format="rdb", ssl_check=ssl_check, **kwargs) # Parse raw (read_rdb), not _read_rdb — the latter already runs @@ -355,6 +371,8 @@ def get_stats( """ _check_sites_value_types(sites) + # get_stats parses an RDB body; reject a caller-supplied non-RDB `format`. + _reject_unexpected_format(kwargs, "get_stats", "rdb") response = query_waterservices( service="stat", sites=sites, ssl_check=ssl_check, **kwargs ) @@ -392,11 +410,13 @@ def query_waterdata( "se_latitude_va", ] - if not any(key in kwargs for key in major_params + bbox_params): + # Present *and* non-None — see query_waterservices for why membership alone + # would let an unset filter slip through. + if not any(kwargs.get(key) is not None for key in major_params + bbox_params): raise TypeError("Query must specify a major filter: site_no, stateCd, bBox") - elif any(key in kwargs for key in bbox_params) and not all( - key in kwargs for key in bbox_params + elif any(kwargs.get(key) is not None for key in bbox_params) and not all( + kwargs.get(key) is not None for key in bbox_params ): raise TypeError("One or more lat/long coordinates missing or invalid.") @@ -453,8 +473,15 @@ def query_waterservices( The response object from the API request to the web service """ + # A major filter must be present *and* non-None. Membership alone is not + # enough: callers may inject an unset filter (e.g. sites=None), and + # utils.query() later strips None-valued params, so a plain `"sites" in + # kwargs` would wave a filterless request through to a confusing "Bad + # Request". Checking the value keeps that decision in one place instead of + # forcing every getter to pre-filter its own kwargs. if not any( - key in kwargs for key in ["sites", "stateCd", "bBox", "huc", "countyCd"] + kwargs.get(key) is not None + for key in ["sites", "stateCd", "bBox", "huc", "countyCd"] ): raise TypeError( "Query must specify a major filter: sites, stateCd, bBox, huc, or countyCd" @@ -537,6 +564,7 @@ def get_dv( kwargs["sites"] = kwargs.pop("sites", sites) kwargs["multi_index"] = multi_index + _reject_unexpected_format(kwargs, "get_dv", "json") response = query_waterservices("dv", format="json", ssl_check=ssl_check, **kwargs) df = _parse_json_or_raise(response) @@ -724,6 +752,7 @@ def get_iv( kwargs["sites"] = kwargs.pop("sites", sites) kwargs["multi_index"] = multi_index + _reject_unexpected_format(kwargs, "get_iv", "json") response = query_waterservices( service="iv", format="json", ssl_check=ssl_check, **kwargs ) @@ -788,6 +817,11 @@ def get_ratings( """ site = kwargs.pop("site_no", site) + # The ratings endpoint is per-site; without one it would issue a request + # that returns an unhelpful error page. Fail fast with a clear message + # (also covers get_record(service="ratings") called without a site). + if site is None: + raise TypeError("get_ratings requires a `site` (USGS site number).") payload = {} url = WATERDATA_BASE_URL + "nwisweb/get_ratings/" @@ -956,6 +990,12 @@ def get_record( if service not in WATERSERVICES_SERVICES + WATERDATA_SERVICES: raise TypeError(f"Unrecognized service: {service}") + # Forward the documented `state` filter as the NWIS `stateCd` major filter; + # it was previously accepted but silently ignored, which produced a + # confusing "Bad Request" when used without `sites`. + if state is not None: + kwargs.setdefault("stateCd", state) + if service == "iv": df, _ = get_iv( sites=sites, diff --git a/tests/nwis_test.py b/tests/nwis_test.py index f343f26e..2508b526 100644 --- a/tests/nwis_test.py +++ b/tests/nwis_test.py @@ -13,12 +13,15 @@ NWIS_Metadata, _read_rdb, get_discharge_measurements, + get_discharge_peaks, + get_dv, get_gwlevels, get_info, get_iv, get_pmcodes, get_qwdata, get_record, + get_stats, get_water_use, preformat_peaks_response, what_sites, @@ -120,6 +123,113 @@ def test_preformat_peaks_response(): # Removed defunct gwlevels tests. +def test_get_dv_requires_major_filter(): + """Regression: get_dv() with no major filter must raise the documented + TypeError. The getters injected ``kwargs["sites"] = kwargs.pop("sites", + sites)``, which always set the key (None when absent) and so defeated + query_waterservices' filter check, yielding a confusing Bad Request.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(TypeError, match="major filter"): + get_dv() + + +def test_get_dv_rejects_non_json_format(): + """Regression: get_dv passed format= explicitly alongside **kwargs, so + get_dv(..., format="rdb") raised 'multiple values for format'. It now + rejects a non-json format with a clear ValueError (it parses JSON).""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(ValueError, match="JSON"): + get_dv(sites="01646500", format="rdb") + + +def test_get_record_forwards_state_as_statecd(monkeypatch): + """Regression: get_record's documented `state` arg was accepted but never + forwarded. It now reaches the request as the NWIS `stateCd` major filter.""" + import dataretrieval.nwis as nwis_mod + + captured: dict = {} + + def fake_query_waterservices(service, format=None, ssl_check=True, **kw): + captured.update(kw) + raise RuntimeError("stop before network") + + monkeypatch.setattr(nwis_mod, "query_waterservices", fake_query_waterservices) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(RuntimeError, match="stop"): + get_record(state="OH", service="dv") + assert captured.get("stateCd") == "OH" + + +def test_get_stats_requires_major_filter(): + """Regression: get_stats passed ``sites=sites`` explicitly, so the key was + always present (None when absent) and defeated query_waterservices' filter + check -- get_stats() reached the network and returned a confusing Bad + Request instead of the documented TypeError.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(TypeError, match="major filter"): + get_stats() + + +@pytest.mark.parametrize("service", ["stat", "site"]) +def test_get_record_requires_major_filter(service): + """Regression: get_record(service="stat"/"site") forwarded sites=None into + get_stats/get_info, defeating the major-filter check. With no filter it must + raise the documented TypeError rather than reach the network.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(TypeError, match="major filter"): + get_record(service=service) + + +def test_get_discharge_peaks_rejects_format(): + """Regression: get_discharge_peaks passed format="rdb" explicitly alongside + **kwargs, so any caller-supplied format raised 'multiple values for format'. + A non-native format is now rejected with a clear ValueError (it parses RDB).""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(ValueError, match="RDB"): + get_discharge_peaks(sites="01491000", format="json") + + +def test_get_discharge_peaks_accepts_native_format(monkeypatch): + """The reported collision was that even the *native* format="rdb" raised + 'multiple values for format'. Popping it first resolves the collision, so + format="rdb" now reaches the request rather than crashing.""" + import dataretrieval.nwis as nwis_mod + + def fake_query_waterdata(service, format=None, ssl_check=True, **kw): + raise RuntimeError("stop before network") + + monkeypatch.setattr(nwis_mod, "query_waterdata", fake_query_waterdata) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(RuntimeError, match="stop"): + get_discharge_peaks(sites="01491000", format="rdb") + + +def test_get_stats_rejects_non_rdb_format(): + """get_stats parses RDB; a caller-supplied non-RDB format would be requested + and then mis-parsed. It is now rejected with a clear ValueError.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(ValueError, match="RDB"): + get_stats(sites="01646500", format="json") + + +def test_get_ratings_requires_site(): + """The ratings endpoint is per-site; get_ratings() / get_record( + service="ratings") with no site previously issued a request that returned an + unhelpful error page. It now fails fast with a clear TypeError.""" + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + with pytest.raises(TypeError, match="requires a `site`"): + get_record(service="ratings") + + class TestDeprecationWarnings: """Verify per-function DeprecationWarning fires with the right replacement.