From 6ef2a7e63bd92aa0903608ebc631fe7b201a624b Mon Sep 17 00:00:00 2001 From: ghostiee-11 Date: Thu, 2 Jul 2026 20:39:54 +0530 Subject: [PATCH 1/2] fix: register dimensions without coordinates as columns (closes #203) A Dataset can have "dimensions without coordinates" (e.g. an image's channel/height/width axes). These are absent from ds.coords, so _parse_schema dropped them from the SQL schema and SELECT * returned too few columns. Even if they were added, slicing a block with isel synthesizes their index relative to the block (restarting at 0 per partition), which would corrupt values on chunked reads. Materialise a default integer index coordinate for every coordinate-less dimension once, at the reader entry points (read_xarray_table and XarrayRecordBatchReader), via a new ensure_default_indexes() helper. This turns them into ordinary dimension coordinates so the existing schema, record-batch, filter-pushdown and round-trip machinery handles them correctly and they carry their absolute position through chunked reads. Adds regression tests for the schema, both record-batch builders (chunked along a coordinate-less dim), and the from_dataset query path. --- tests/test_df.py | 79 ++++++++++++++++++++++++++++++ tests/test_sql.py | 113 +++++++++++++++++++++++++++++++++++++++++++ xarray_sql/df.py | 23 +++++++++ xarray_sql/reader.py | 3 ++ 4 files changed, 218 insertions(+) diff --git a/tests/test_df.py b/tests/test_df.py index 2c81144..4496bd3 100644 --- a/tests/test_df.py +++ b/tests/test_df.py @@ -12,6 +12,7 @@ block_slices, compute_chunks, dataset_to_record_batch, + ensure_default_indexes, explode, from_map, from_map_batched, @@ -244,6 +245,84 @@ def test_dataset_to_record_batch_row_count(air_small): assert batch.num_rows == expected_rows +def _coordless_ds(): + """A 2-D dataset whose dimensions have NO coordinates (issue #203).""" + return xr.Dataset( + {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))} + ) + + +def test_ensure_default_indexes_materializes_missing_coords(): + """ensure_default_indexes gives coordinate-less dims a 0..N-1 index and + leaves already-coordinated dims untouched (issue #203).""" + ds = _coordless_ds() + assert "x" not in ds.coords and "y" not in ds.coords + + fixed = ensure_default_indexes(ds) + assert list(fixed.coords["x"].values) == [0, 1, 2] + assert list(fixed.coords["y"].values) == [0, 1] + + already = xr.Dataset( + {"a": (("x",), np.arange(3))}, coords={"x": [10, 11, 12]} + ) + assert list(ensure_default_indexes(already).coords["x"].values) == [ + 10, + 11, + 12, + ] + + +def test_parse_schema_includes_coordless_dims_after_materialize(): + """Once default indexes are materialised, every dimension is a column.""" + schema = _parse_schema(ensure_default_indexes(_coordless_ds())) + assert set(schema.names) == {"x", "y", "a"} + # Coordinate-less dims fall back to xarray's default integer index. + assert pa.types.is_integer(schema.field("x").type) + assert pa.types.is_integer(schema.field("y").type) + + +def test_record_batch_paths_carry_absolute_coordless_index(): + """With materialised indexes, both record-batch builders emit the ABSOLUTE + 0..N-1 position for a coordinate-less dim, even when that dim is chunked so + each block is sliced separately (issue #203).""" + ds = ensure_default_indexes(_coordless_ds()) + schema = _parse_schema(ds) + + # Chunk along the coordinate-less 'x' dim: each block is a separate slice, + # which is exactly where a block-relative index would corrupt the values. + frames_iter, frames_single = [], [] + for block in block_slices(ds, chunks={"x": 1}): + ds_block = ds.isel(block) + frames_iter.append( + pa.Table.from_batches( + list(iter_record_batches(ds_block, schema)) + ).to_pandas() + ) + frames_single.append( + dataset_to_record_batch(ds_block, schema).to_pandas() + ) + + got_iter = ( + pd.concat(frames_iter).sort_values(["x", "y"]).reset_index(drop=True) + ) + got_single = ( + pd.concat(frames_single).sort_values(["x", "y"]).reset_index(drop=True) + ) + pd.testing.assert_frame_equal(got_iter, got_single) + + # Absolute (not block-relative) index: x spans 0..2 across the 3 chunks. + assert sorted(got_iter["x"].unique().tolist()) == [0, 1, 2] + assert sorted(got_iter["y"].unique().tolist()) == [0, 1] + + expected = ( + ds.to_dataframe() + .reset_index() + .sort_values(["x", "y"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal(got_iter, expected, check_dtype=False) + + def test_from_map_batched_basic_functionality(air_small): blocks = list( block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}) diff --git a/tests/test_sql.py b/tests/test_sql.py index 857a7bc..af0c590 100644 --- a/tests/test_sql.py +++ b/tests/test_sql.py @@ -488,3 +488,116 @@ def test_table_names_is_keyword_only(self, mixed_ds): ctx = XarrayContext() with pytest.raises(TypeError): ctx.from_dataset("era5", mixed_ds, {("time",): "x"}) + + @pytest.fixture + def coordless_dims_ds(self): + """Mirror the fashion-mnist layout from issue #203: a dimension + coordinate (``sample``) alongside dimensions without coordinates + (``channel``/``height``/``width``).""" + n_sample, n_channel, n_height, n_width = 4, 1, 3, 3 + return xr.Dataset( + { + "images": ( + ["sample", "channel", "height", "width"], + np.arange( + n_sample * n_channel * n_height * n_width, + dtype="float32", + ).reshape(n_sample, n_channel, n_height, n_width), + ), + "labels": (["sample"], np.arange(n_sample, dtype="int64")), + }, + coords={"sample": ("sample", np.arange(n_sample, dtype="int64"))}, + ).chunk({"sample": 1}) + + def test_coordless_dims_appear_as_columns(self, coordless_dims_ds): + """Regression for #203: dimensions without coordinates must still be + emitted as columns, not silently dropped from the schema.""" + ctx = XarrayContext() + ctx.from_dataset( + "mnist", + coordless_dims_ds, + table_names={ + ("sample", "channel", "height", "width"): "X", + ("sample",): "y", + }, + ) + result = ctx.sql('SELECT * FROM mnist."X"').to_pandas() + assert set(result.columns) == { + "sample", + "channel", + "height", + "width", + "images", + } + + def test_coordless_dims_values_match_xarray(self, coordless_dims_ds): + """The X table's rows must match xarray's own pivot exactly, including + the synthesized index values for the coordinate-less dimensions.""" + ctx = XarrayContext() + ctx.from_dataset( + "mnist", + coordless_dims_ds, + table_names={ + ("sample", "channel", "height", "width"): "X", + ("sample",): "y", + }, + ) + dim_cols = ["sample", "channel", "height", "width"] + result = ( + ctx.sql('SELECT * FROM mnist."X"') + .to_pandas() + .sort_values(dim_cols) + .reset_index(drop=True) + ) + expected = ( + coordless_dims_ds[["images"]] + .to_dataframe() + .reset_index() + .sort_values(dim_cols) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + result, expected, check_dtype=False, check_like=True + ) + + def test_coordless_dims_y_table_unaffected(self, coordless_dims_ds): + """The 1-D ``y`` group (sample coordinate + labels) is unchanged.""" + ctx = XarrayContext() + ctx.from_dataset( + "mnist", + coordless_dims_ds, + table_names={ + ("sample", "channel", "height", "width"): "X", + ("sample",): "y", + }, + ) + result = ctx.sql('SELECT * FROM mnist."y"').to_pandas() + assert set(result.columns) == {"sample", "labels"} + assert len(result) == coordless_dims_ds.sizes["sample"] + + def test_single_table_all_coordless_dims(self): + """A uniform-dim dataset whose dims lack coordinates registers as one + table with every dimension present as a column, and the coordinate-less + dimensions carry their ABSOLUTE index even when chunked (issue #203).""" + ds = xr.Dataset( + {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))} + ).chunk({"x": 1}) # chunked along the coordinate-less 'x' dim + ctx = XarrayContext() + ctx.from_dataset("grid", ds) + result = ( + ctx.sql("SELECT * FROM grid") + .to_pandas() + .sort_values(["x", "y"]) + .reset_index(drop=True) + ) + assert set(result.columns) == {"x", "y", "a"} + # x must span 0..2 across the three chunks, not restart at 0 each block. + expected = ( + ds.to_dataframe() + .reset_index() + .sort_values(["x", "y"]) + .reset_index(drop=True) + ) + pd.testing.assert_frame_equal( + result, expected, check_dtype=False, check_like=True + ) diff --git a/xarray_sql/df.py b/xarray_sql/df.py index 99c8408..7c38bad 100644 --- a/xarray_sql/df.py +++ b/xarray_sql/df.py @@ -73,6 +73,24 @@ def resolve_chunks( return {d: tuple(c) for d, c in ds.chunks.items()} +def ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset: + """Attach a default integer index coordinate to every dimension lacking one. + + xarray allows "dimensions without coordinates"; these are absent from + ``ds.coords``, so they are dropped from the SQL schema and, once a block is + sliced out with ``isel``, their position is synthesized *relative to the + block* (restarting at 0 in every partition). Materialising an explicit + ``arange`` index up front turns them into ordinary dimension coordinates, so + they appear as columns and carry their absolute position through chunked + reads (issue #203). Datasets whose dimensions already have coordinates are + returned unchanged. + """ + missing = { + dim: np.arange(ds.sizes[dim]) for dim in ds.dims if dim not in ds.coords + } + return ds.assign_coords(missing) if missing else ds + + def _block_slices_from_resolved( ds: xr.Dataset, resolved: Mapping[Hashable, tuple[int, ...]] ) -> Iterator[Block]: @@ -371,6 +389,11 @@ def iter_record_batches( def _parse_schema(ds: xr.Dataset) -> pa.Schema: """Extracts a `pa.Schema` from the Dataset, treating dims and data_vars as columns. + Only *dimension coordinates* become dimension columns, so a dimension + without a coordinate would be dropped. Callers must run the Dataset through + :func:`ensure_default_indexes` first (the readers do) so every dimension has + a coordinate and appears as a column (issue #203). + Uses the xarray index type to detect cftime coordinates without materializing their data — important for Dask/Zarr-backed datasets where .values would trigger eager computation. diff --git a/xarray_sql/reader.py b/xarray_sql/reader.py index bfb7be2..65aefec 100644 --- a/xarray_sql/reader.py +++ b/xarray_sql/reader.py @@ -26,6 +26,7 @@ _block_slices_from_resolved, _parse_schema, block_slices, + ensure_default_indexes, iter_record_batches, resolve_chunks, ) @@ -84,6 +85,7 @@ def __init__( each block dict just before it's converted to Arrow. This allows tests to track when iteration actually occurs. """ + ds = ensure_default_indexes(ds) self._ds = ds self._chunks = chunks self._batch_size = batch_size @@ -254,6 +256,7 @@ def read_xarray_table( """ from ._native import LazyArrowStreamTable + ds = ensure_default_indexes(ds) schema = _parse_schema(ds) # Hoist coordinate reads once; avoids N_partitions remote I/O calls for From 91c7f8c07fb882894e7fffc4886f937c6a775f52 Mon Sep 17 00:00:00 2001 From: ghostiee-11 Date: Thu, 2 Jul 2026 22:12:15 +0530 Subject: [PATCH 2/2] refactor: address review feedback on the coordless-dim fix Privatise the index helper (_ensure_default_indexes), move its call from the XarrayRecordBatchReader constructor to read_xarray, drop the df-level tests (the public contract is covered by the sql tests), and remove the GitHub issue references from code and test docstrings. --- tests/test_df.py | 79 -------------------------------------------- tests/test_sql.py | 8 ++--- xarray_sql/df.py | 10 +++--- xarray_sql/reader.py | 6 ++-- 4 files changed, 12 insertions(+), 91 deletions(-) diff --git a/tests/test_df.py b/tests/test_df.py index 4496bd3..2c81144 100644 --- a/tests/test_df.py +++ b/tests/test_df.py @@ -12,7 +12,6 @@ block_slices, compute_chunks, dataset_to_record_batch, - ensure_default_indexes, explode, from_map, from_map_batched, @@ -245,84 +244,6 @@ def test_dataset_to_record_batch_row_count(air_small): assert batch.num_rows == expected_rows -def _coordless_ds(): - """A 2-D dataset whose dimensions have NO coordinates (issue #203).""" - return xr.Dataset( - {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))} - ) - - -def test_ensure_default_indexes_materializes_missing_coords(): - """ensure_default_indexes gives coordinate-less dims a 0..N-1 index and - leaves already-coordinated dims untouched (issue #203).""" - ds = _coordless_ds() - assert "x" not in ds.coords and "y" not in ds.coords - - fixed = ensure_default_indexes(ds) - assert list(fixed.coords["x"].values) == [0, 1, 2] - assert list(fixed.coords["y"].values) == [0, 1] - - already = xr.Dataset( - {"a": (("x",), np.arange(3))}, coords={"x": [10, 11, 12]} - ) - assert list(ensure_default_indexes(already).coords["x"].values) == [ - 10, - 11, - 12, - ] - - -def test_parse_schema_includes_coordless_dims_after_materialize(): - """Once default indexes are materialised, every dimension is a column.""" - schema = _parse_schema(ensure_default_indexes(_coordless_ds())) - assert set(schema.names) == {"x", "y", "a"} - # Coordinate-less dims fall back to xarray's default integer index. - assert pa.types.is_integer(schema.field("x").type) - assert pa.types.is_integer(schema.field("y").type) - - -def test_record_batch_paths_carry_absolute_coordless_index(): - """With materialised indexes, both record-batch builders emit the ABSOLUTE - 0..N-1 position for a coordinate-less dim, even when that dim is chunked so - each block is sliced separately (issue #203).""" - ds = ensure_default_indexes(_coordless_ds()) - schema = _parse_schema(ds) - - # Chunk along the coordinate-less 'x' dim: each block is a separate slice, - # which is exactly where a block-relative index would corrupt the values. - frames_iter, frames_single = [], [] - for block in block_slices(ds, chunks={"x": 1}): - ds_block = ds.isel(block) - frames_iter.append( - pa.Table.from_batches( - list(iter_record_batches(ds_block, schema)) - ).to_pandas() - ) - frames_single.append( - dataset_to_record_batch(ds_block, schema).to_pandas() - ) - - got_iter = ( - pd.concat(frames_iter).sort_values(["x", "y"]).reset_index(drop=True) - ) - got_single = ( - pd.concat(frames_single).sort_values(["x", "y"]).reset_index(drop=True) - ) - pd.testing.assert_frame_equal(got_iter, got_single) - - # Absolute (not block-relative) index: x spans 0..2 across the 3 chunks. - assert sorted(got_iter["x"].unique().tolist()) == [0, 1, 2] - assert sorted(got_iter["y"].unique().tolist()) == [0, 1] - - expected = ( - ds.to_dataframe() - .reset_index() - .sort_values(["x", "y"]) - .reset_index(drop=True) - ) - pd.testing.assert_frame_equal(got_iter, expected, check_dtype=False) - - def test_from_map_batched_basic_functionality(air_small): blocks = list( block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4}) diff --git a/tests/test_sql.py b/tests/test_sql.py index af0c590..746dfb3 100644 --- a/tests/test_sql.py +++ b/tests/test_sql.py @@ -491,7 +491,7 @@ def test_table_names_is_keyword_only(self, mixed_ds): @pytest.fixture def coordless_dims_ds(self): - """Mirror the fashion-mnist layout from issue #203: a dimension + """Mirror the fashion-mnist layout: a dimension coordinate (``sample``) alongside dimensions without coordinates (``channel``/``height``/``width``).""" n_sample, n_channel, n_height, n_width = 4, 1, 3, 3 @@ -510,8 +510,8 @@ def coordless_dims_ds(self): ).chunk({"sample": 1}) def test_coordless_dims_appear_as_columns(self, coordless_dims_ds): - """Regression for #203: dimensions without coordinates must still be - emitted as columns, not silently dropped from the schema.""" + """Dimensions without coordinates must still be emitted as columns, + not silently dropped from the schema.""" ctx = XarrayContext() ctx.from_dataset( "mnist", @@ -578,7 +578,7 @@ def test_coordless_dims_y_table_unaffected(self, coordless_dims_ds): def test_single_table_all_coordless_dims(self): """A uniform-dim dataset whose dims lack coordinates registers as one table with every dimension present as a column, and the coordinate-less - dimensions carry their ABSOLUTE index even when chunked (issue #203).""" + dimensions carry their ABSOLUTE index even when chunked.""" ds = xr.Dataset( {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))} ).chunk({"x": 1}) # chunked along the coordinate-less 'x' dim diff --git a/xarray_sql/df.py b/xarray_sql/df.py index 7c38bad..ab80056 100644 --- a/xarray_sql/df.py +++ b/xarray_sql/df.py @@ -73,7 +73,7 @@ def resolve_chunks( return {d: tuple(c) for d, c in ds.chunks.items()} -def ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset: +def _ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset: """Attach a default integer index coordinate to every dimension lacking one. xarray allows "dimensions without coordinates"; these are absent from @@ -82,8 +82,8 @@ def ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset: block* (restarting at 0 in every partition). Materialising an explicit ``arange`` index up front turns them into ordinary dimension coordinates, so they appear as columns and carry their absolute position through chunked - reads (issue #203). Datasets whose dimensions already have coordinates are - returned unchanged. + reads. Datasets whose dimensions already have coordinates are returned + unchanged. """ missing = { dim: np.arange(ds.sizes[dim]) for dim in ds.dims if dim not in ds.coords @@ -391,8 +391,8 @@ def _parse_schema(ds: xr.Dataset) -> pa.Schema: Only *dimension coordinates* become dimension columns, so a dimension without a coordinate would be dropped. Callers must run the Dataset through - :func:`ensure_default_indexes` first (the readers do) so every dimension has - a coordinate and appears as a column (issue #203). + :func:`_ensure_default_indexes` first (the readers do) so every dimension + has a coordinate and appears as a column. Uses the xarray index type to detect cftime coordinates without materializing their data — important for Dask/Zarr-backed datasets diff --git a/xarray_sql/reader.py b/xarray_sql/reader.py index 65aefec..153ed8c 100644 --- a/xarray_sql/reader.py +++ b/xarray_sql/reader.py @@ -24,9 +24,9 @@ _block_len, _block_metadata, _block_slices_from_resolved, + _ensure_default_indexes, _parse_schema, block_slices, - ensure_default_indexes, iter_record_batches, resolve_chunks, ) @@ -85,7 +85,6 @@ def __init__( each block dict just before it's converted to Arrow. This allows tests to track when iteration actually occurs. """ - ds = ensure_default_indexes(ds) self._ds = ds self._chunks = chunks self._batch_size = batch_size @@ -187,6 +186,7 @@ def read_xarray(ds: xr.Dataset, chunks: Chunks = None) -> pa.RecordBatchReader: A PyArrow RecordBatchReader, which is a table representation of the input Dataset. """ + ds = _ensure_default_indexes(ds) reader = XarrayRecordBatchReader(ds, chunks=chunks) return pa.RecordBatchReader.from_stream(reader) @@ -256,7 +256,7 @@ def read_xarray_table( """ from ._native import LazyArrowStreamTable - ds = ensure_default_indexes(ds) + ds = _ensure_default_indexes(ds) schema = _parse_schema(ds) # Hoist coordinate reads once; avoids N_partitions remote I/O calls for