From 6ef2a7e63bd92aa0903608ebc631fe7b201a624b Mon Sep 17 00:00:00 2001
From: ghostiee-11 <aman0611kumar@gmail.com>
Date: Thu, 2 Jul 2026 20:39:54 +0530
Subject: [PATCH 1/2] fix: register dimensions without coordinates as columns
 (closes #203)

A Dataset can have "dimensions without coordinates" (e.g. an image's
channel/height/width axes). These are absent from ds.coords, so _parse_schema
dropped them from the SQL schema and SELECT * returned too few columns. Even if
they were added, slicing a block with isel synthesizes their index relative to
the block (restarting at 0 per partition), which would corrupt values on chunked
reads.

Materialise a default integer index coordinate for every coordinate-less
dimension once, at the reader entry points (read_xarray_table and
XarrayRecordBatchReader), via a new ensure_default_indexes() helper. This turns
them into ordinary dimension coordinates so the existing schema, record-batch,
filter-pushdown and round-trip machinery handles them correctly and they carry
their absolute position through chunked reads.

Adds regression tests for the schema, both record-batch builders (chunked along
a coordinate-less dim), and the from_dataset query path.
---
 tests/test_df.py     |  79 ++++++++++++++++++++++++++++++
 tests/test_sql.py    | 113 +++++++++++++++++++++++++++++++++++++++++++
 xarray_sql/df.py     |  23 +++++++++
 xarray_sql/reader.py |   3 ++
 4 files changed, 218 insertions(+)

diff --git a/tests/test_df.py b/tests/test_df.py
index 2c81144..4496bd3 100644
--- a/tests/test_df.py
+++ b/tests/test_df.py
@@ -12,6 +12,7 @@
     block_slices,
     compute_chunks,
     dataset_to_record_batch,
+    ensure_default_indexes,
     explode,
     from_map,
     from_map_batched,
@@ -244,6 +245,84 @@ def test_dataset_to_record_batch_row_count(air_small):
         assert batch.num_rows == expected_rows
 
 
+def _coordless_ds():
+    """A 2-D dataset whose dimensions have NO coordinates (issue #203)."""
+    return xr.Dataset(
+        {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))}
+    )
+
+
+def test_ensure_default_indexes_materializes_missing_coords():
+    """ensure_default_indexes gives coordinate-less dims a 0..N-1 index and
+    leaves already-coordinated dims untouched (issue #203)."""
+    ds = _coordless_ds()
+    assert "x" not in ds.coords and "y" not in ds.coords
+
+    fixed = ensure_default_indexes(ds)
+    assert list(fixed.coords["x"].values) == [0, 1, 2]
+    assert list(fixed.coords["y"].values) == [0, 1]
+
+    already = xr.Dataset(
+        {"a": (("x",), np.arange(3))}, coords={"x": [10, 11, 12]}
+    )
+    assert list(ensure_default_indexes(already).coords["x"].values) == [
+        10,
+        11,
+        12,
+    ]
+
+
+def test_parse_schema_includes_coordless_dims_after_materialize():
+    """Once default indexes are materialised, every dimension is a column."""
+    schema = _parse_schema(ensure_default_indexes(_coordless_ds()))
+    assert set(schema.names) == {"x", "y", "a"}
+    # Coordinate-less dims fall back to xarray's default integer index.
+    assert pa.types.is_integer(schema.field("x").type)
+    assert pa.types.is_integer(schema.field("y").type)
+
+
+def test_record_batch_paths_carry_absolute_coordless_index():
+    """With materialised indexes, both record-batch builders emit the ABSOLUTE
+    0..N-1 position for a coordinate-less dim, even when that dim is chunked so
+    each block is sliced separately (issue #203)."""
+    ds = ensure_default_indexes(_coordless_ds())
+    schema = _parse_schema(ds)
+
+    # Chunk along the coordinate-less 'x' dim: each block is a separate slice,
+    # which is exactly where a block-relative index would corrupt the values.
+    frames_iter, frames_single = [], []
+    for block in block_slices(ds, chunks={"x": 1}):
+        ds_block = ds.isel(block)
+        frames_iter.append(
+            pa.Table.from_batches(
+                list(iter_record_batches(ds_block, schema))
+            ).to_pandas()
+        )
+        frames_single.append(
+            dataset_to_record_batch(ds_block, schema).to_pandas()
+        )
+
+    got_iter = (
+        pd.concat(frames_iter).sort_values(["x", "y"]).reset_index(drop=True)
+    )
+    got_single = (
+        pd.concat(frames_single).sort_values(["x", "y"]).reset_index(drop=True)
+    )
+    pd.testing.assert_frame_equal(got_iter, got_single)
+
+    # Absolute (not block-relative) index: x spans 0..2 across the 3 chunks.
+    assert sorted(got_iter["x"].unique().tolist()) == [0, 1, 2]
+    assert sorted(got_iter["y"].unique().tolist()) == [0, 1]
+
+    expected = (
+        ds.to_dataframe()
+        .reset_index()
+        .sort_values(["x", "y"])
+        .reset_index(drop=True)
+    )
+    pd.testing.assert_frame_equal(got_iter, expected, check_dtype=False)
+
+
 def test_from_map_batched_basic_functionality(air_small):
     blocks = list(
         block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4})
diff --git a/tests/test_sql.py b/tests/test_sql.py
index 857a7bc..af0c590 100644
--- a/tests/test_sql.py
+++ b/tests/test_sql.py
@@ -488,3 +488,116 @@ def test_table_names_is_keyword_only(self, mixed_ds):
         ctx = XarrayContext()
         with pytest.raises(TypeError):
             ctx.from_dataset("era5", mixed_ds, {("time",): "x"})
+
+    @pytest.fixture
+    def coordless_dims_ds(self):
+        """Mirror the fashion-mnist layout from issue #203: a dimension
+        coordinate (``sample``) alongside dimensions without coordinates
+        (``channel``/``height``/``width``)."""
+        n_sample, n_channel, n_height, n_width = 4, 1, 3, 3
+        return xr.Dataset(
+            {
+                "images": (
+                    ["sample", "channel", "height", "width"],
+                    np.arange(
+                        n_sample * n_channel * n_height * n_width,
+                        dtype="float32",
+                    ).reshape(n_sample, n_channel, n_height, n_width),
+                ),
+                "labels": (["sample"], np.arange(n_sample, dtype="int64")),
+            },
+            coords={"sample": ("sample", np.arange(n_sample, dtype="int64"))},
+        ).chunk({"sample": 1})
+
+    def test_coordless_dims_appear_as_columns(self, coordless_dims_ds):
+        """Regression for #203: dimensions without coordinates must still be
+        emitted as columns, not silently dropped from the schema."""
+        ctx = XarrayContext()
+        ctx.from_dataset(
+            "mnist",
+            coordless_dims_ds,
+            table_names={
+                ("sample", "channel", "height", "width"): "X",
+                ("sample",): "y",
+            },
+        )
+        result = ctx.sql('SELECT * FROM mnist."X"').to_pandas()
+        assert set(result.columns) == {
+            "sample",
+            "channel",
+            "height",
+            "width",
+            "images",
+        }
+
+    def test_coordless_dims_values_match_xarray(self, coordless_dims_ds):
+        """The X table's rows must match xarray's own pivot exactly, including
+        the synthesized index values for the coordinate-less dimensions."""
+        ctx = XarrayContext()
+        ctx.from_dataset(
+            "mnist",
+            coordless_dims_ds,
+            table_names={
+                ("sample", "channel", "height", "width"): "X",
+                ("sample",): "y",
+            },
+        )
+        dim_cols = ["sample", "channel", "height", "width"]
+        result = (
+            ctx.sql('SELECT * FROM mnist."X"')
+            .to_pandas()
+            .sort_values(dim_cols)
+            .reset_index(drop=True)
+        )
+        expected = (
+            coordless_dims_ds[["images"]]
+            .to_dataframe()
+            .reset_index()
+            .sort_values(dim_cols)
+            .reset_index(drop=True)
+        )
+        pd.testing.assert_frame_equal(
+            result, expected, check_dtype=False, check_like=True
+        )
+
+    def test_coordless_dims_y_table_unaffected(self, coordless_dims_ds):
+        """The 1-D ``y`` group (sample coordinate + labels) is unchanged."""
+        ctx = XarrayContext()
+        ctx.from_dataset(
+            "mnist",
+            coordless_dims_ds,
+            table_names={
+                ("sample", "channel", "height", "width"): "X",
+                ("sample",): "y",
+            },
+        )
+        result = ctx.sql('SELECT * FROM mnist."y"').to_pandas()
+        assert set(result.columns) == {"sample", "labels"}
+        assert len(result) == coordless_dims_ds.sizes["sample"]
+
+    def test_single_table_all_coordless_dims(self):
+        """A uniform-dim dataset whose dims lack coordinates registers as one
+        table with every dimension present as a column, and the coordinate-less
+        dimensions carry their ABSOLUTE index even when chunked (issue #203)."""
+        ds = xr.Dataset(
+            {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))}
+        ).chunk({"x": 1})  # chunked along the coordinate-less 'x' dim
+        ctx = XarrayContext()
+        ctx.from_dataset("grid", ds)
+        result = (
+            ctx.sql("SELECT * FROM grid")
+            .to_pandas()
+            .sort_values(["x", "y"])
+            .reset_index(drop=True)
+        )
+        assert set(result.columns) == {"x", "y", "a"}
+        # x must span 0..2 across the three chunks, not restart at 0 each block.
+        expected = (
+            ds.to_dataframe()
+            .reset_index()
+            .sort_values(["x", "y"])
+            .reset_index(drop=True)
+        )
+        pd.testing.assert_frame_equal(
+            result, expected, check_dtype=False, check_like=True
+        )
diff --git a/xarray_sql/df.py b/xarray_sql/df.py
index 99c8408..7c38bad 100644
--- a/xarray_sql/df.py
+++ b/xarray_sql/df.py
@@ -73,6 +73,24 @@ def resolve_chunks(
     return {d: tuple(c) for d, c in ds.chunks.items()}
 
 
+def ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset:
+    """Attach a default integer index coordinate to every dimension lacking one.
+
+    xarray allows "dimensions without coordinates"; these are absent from
+    ``ds.coords``, so they are dropped from the SQL schema and, once a block is
+    sliced out with ``isel``, their position is synthesized *relative to the
+    block* (restarting at 0 in every partition). Materialising an explicit
+    ``arange`` index up front turns them into ordinary dimension coordinates, so
+    they appear as columns and carry their absolute position through chunked
+    reads (issue #203). Datasets whose dimensions already have coordinates are
+    returned unchanged.
+    """
+    missing = {
+        dim: np.arange(ds.sizes[dim]) for dim in ds.dims if dim not in ds.coords
+    }
+    return ds.assign_coords(missing) if missing else ds
+
+
 def _block_slices_from_resolved(
     ds: xr.Dataset, resolved: Mapping[Hashable, tuple[int, ...]]
 ) -> Iterator[Block]:
@@ -371,6 +389,11 @@ def iter_record_batches(
 def _parse_schema(ds: xr.Dataset) -> pa.Schema:
     """Extracts a `pa.Schema` from the Dataset, treating dims and data_vars as columns.
 
+    Only *dimension coordinates* become dimension columns, so a dimension
+    without a coordinate would be dropped. Callers must run the Dataset through
+    :func:`ensure_default_indexes` first (the readers do) so every dimension has
+    a coordinate and appears as a column (issue #203).
+
     Uses the xarray index type to detect cftime coordinates without
     materializing their data — important for Dask/Zarr-backed datasets
     where .values would trigger eager computation.
diff --git a/xarray_sql/reader.py b/xarray_sql/reader.py
index bfb7be2..65aefec 100644
--- a/xarray_sql/reader.py
+++ b/xarray_sql/reader.py
@@ -26,6 +26,7 @@
     _block_slices_from_resolved,
     _parse_schema,
     block_slices,
+    ensure_default_indexes,
     iter_record_batches,
     resolve_chunks,
 )
@@ -84,6 +85,7 @@ def __init__(
                 each block dict just before it's converted to Arrow. This
                 allows tests to track when iteration actually occurs.
         """
+        ds = ensure_default_indexes(ds)
         self._ds = ds
         self._chunks = chunks
         self._batch_size = batch_size
@@ -254,6 +256,7 @@ def read_xarray_table(
     """
     from ._native import LazyArrowStreamTable
 
+    ds = ensure_default_indexes(ds)
     schema = _parse_schema(ds)
 
     # Hoist coordinate reads once; avoids N_partitions remote I/O calls for

From 91c7f8c07fb882894e7fffc4886f937c6a775f52 Mon Sep 17 00:00:00 2001
From: ghostiee-11 <aman0611kumar@gmail.com>
Date: Thu, 2 Jul 2026 22:12:15 +0530
Subject: [PATCH 2/2] refactor: address review feedback on the coordless-dim
 fix

Privatise the index helper (_ensure_default_indexes), move its call from the
XarrayRecordBatchReader constructor to read_xarray, drop the df-level tests
(the public contract is covered by the sql tests), and remove the GitHub issue
references from code and test docstrings.
---
 tests/test_df.py     | 79 --------------------------------------------
 tests/test_sql.py    |  8 ++---
 xarray_sql/df.py     | 10 +++---
 xarray_sql/reader.py |  6 ++--
 4 files changed, 12 insertions(+), 91 deletions(-)

diff --git a/tests/test_df.py b/tests/test_df.py
index 4496bd3..2c81144 100644
--- a/tests/test_df.py
+++ b/tests/test_df.py
@@ -12,7 +12,6 @@
     block_slices,
     compute_chunks,
     dataset_to_record_batch,
-    ensure_default_indexes,
     explode,
     from_map,
     from_map_batched,
@@ -245,84 +244,6 @@ def test_dataset_to_record_batch_row_count(air_small):
         assert batch.num_rows == expected_rows
 
 
-def _coordless_ds():
-    """A 2-D dataset whose dimensions have NO coordinates (issue #203)."""
-    return xr.Dataset(
-        {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))}
-    )
-
-
-def test_ensure_default_indexes_materializes_missing_coords():
-    """ensure_default_indexes gives coordinate-less dims a 0..N-1 index and
-    leaves already-coordinated dims untouched (issue #203)."""
-    ds = _coordless_ds()
-    assert "x" not in ds.coords and "y" not in ds.coords
-
-    fixed = ensure_default_indexes(ds)
-    assert list(fixed.coords["x"].values) == [0, 1, 2]
-    assert list(fixed.coords["y"].values) == [0, 1]
-
-    already = xr.Dataset(
-        {"a": (("x",), np.arange(3))}, coords={"x": [10, 11, 12]}
-    )
-    assert list(ensure_default_indexes(already).coords["x"].values) == [
-        10,
-        11,
-        12,
-    ]
-
-
-def test_parse_schema_includes_coordless_dims_after_materialize():
-    """Once default indexes are materialised, every dimension is a column."""
-    schema = _parse_schema(ensure_default_indexes(_coordless_ds()))
-    assert set(schema.names) == {"x", "y", "a"}
-    # Coordinate-less dims fall back to xarray's default integer index.
-    assert pa.types.is_integer(schema.field("x").type)
-    assert pa.types.is_integer(schema.field("y").type)
-
-
-def test_record_batch_paths_carry_absolute_coordless_index():
-    """With materialised indexes, both record-batch builders emit the ABSOLUTE
-    0..N-1 position for a coordinate-less dim, even when that dim is chunked so
-    each block is sliced separately (issue #203)."""
-    ds = ensure_default_indexes(_coordless_ds())
-    schema = _parse_schema(ds)
-
-    # Chunk along the coordinate-less 'x' dim: each block is a separate slice,
-    # which is exactly where a block-relative index would corrupt the values.
-    frames_iter, frames_single = [], []
-    for block in block_slices(ds, chunks={"x": 1}):
-        ds_block = ds.isel(block)
-        frames_iter.append(
-            pa.Table.from_batches(
-                list(iter_record_batches(ds_block, schema))
-            ).to_pandas()
-        )
-        frames_single.append(
-            dataset_to_record_batch(ds_block, schema).to_pandas()
-        )
-
-    got_iter = (
-        pd.concat(frames_iter).sort_values(["x", "y"]).reset_index(drop=True)
-    )
-    got_single = (
-        pd.concat(frames_single).sort_values(["x", "y"]).reset_index(drop=True)
-    )
-    pd.testing.assert_frame_equal(got_iter, got_single)
-
-    # Absolute (not block-relative) index: x spans 0..2 across the 3 chunks.
-    assert sorted(got_iter["x"].unique().tolist()) == [0, 1, 2]
-    assert sorted(got_iter["y"].unique().tolist()) == [0, 1]
-
-    expected = (
-        ds.to_dataframe()
-        .reset_index()
-        .sort_values(["x", "y"])
-        .reset_index(drop=True)
-    )
-    pd.testing.assert_frame_equal(got_iter, expected, check_dtype=False)
-
-
 def test_from_map_batched_basic_functionality(air_small):
     blocks = list(
         block_slices(air_small, chunks={"time": 4, "lat": 3, "lon": 4})
diff --git a/tests/test_sql.py b/tests/test_sql.py
index af0c590..746dfb3 100644
--- a/tests/test_sql.py
+++ b/tests/test_sql.py
@@ -491,7 +491,7 @@ def test_table_names_is_keyword_only(self, mixed_ds):
 
     @pytest.fixture
     def coordless_dims_ds(self):
-        """Mirror the fashion-mnist layout from issue #203: a dimension
+        """Mirror the fashion-mnist layout: a dimension
         coordinate (``sample``) alongside dimensions without coordinates
         (``channel``/``height``/``width``)."""
         n_sample, n_channel, n_height, n_width = 4, 1, 3, 3
@@ -510,8 +510,8 @@ def coordless_dims_ds(self):
         ).chunk({"sample": 1})
 
     def test_coordless_dims_appear_as_columns(self, coordless_dims_ds):
-        """Regression for #203: dimensions without coordinates must still be
-        emitted as columns, not silently dropped from the schema."""
+        """Dimensions without coordinates must still be emitted as columns,
+        not silently dropped from the schema."""
         ctx = XarrayContext()
         ctx.from_dataset(
             "mnist",
@@ -578,7 +578,7 @@ def test_coordless_dims_y_table_unaffected(self, coordless_dims_ds):
     def test_single_table_all_coordless_dims(self):
         """A uniform-dim dataset whose dims lack coordinates registers as one
         table with every dimension present as a column, and the coordinate-less
-        dimensions carry their ABSOLUTE index even when chunked (issue #203)."""
+        dimensions carry their ABSOLUTE index even when chunked."""
         ds = xr.Dataset(
             {"a": (("x", "y"), np.arange(6, dtype="float32").reshape(3, 2))}
         ).chunk({"x": 1})  # chunked along the coordinate-less 'x' dim
diff --git a/xarray_sql/df.py b/xarray_sql/df.py
index 7c38bad..ab80056 100644
--- a/xarray_sql/df.py
+++ b/xarray_sql/df.py
@@ -73,7 +73,7 @@ def resolve_chunks(
     return {d: tuple(c) for d, c in ds.chunks.items()}
 
 
-def ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset:
+def _ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset:
     """Attach a default integer index coordinate to every dimension lacking one.
 
     xarray allows "dimensions without coordinates"; these are absent from
@@ -82,8 +82,8 @@ def ensure_default_indexes(ds: xr.Dataset) -> xr.Dataset:
     block* (restarting at 0 in every partition). Materialising an explicit
     ``arange`` index up front turns them into ordinary dimension coordinates, so
     they appear as columns and carry their absolute position through chunked
-    reads (issue #203). Datasets whose dimensions already have coordinates are
-    returned unchanged.
+    reads. Datasets whose dimensions already have coordinates are returned
+    unchanged.
     """
     missing = {
         dim: np.arange(ds.sizes[dim]) for dim in ds.dims if dim not in ds.coords
@@ -391,8 +391,8 @@ def _parse_schema(ds: xr.Dataset) -> pa.Schema:
 
     Only *dimension coordinates* become dimension columns, so a dimension
     without a coordinate would be dropped. Callers must run the Dataset through
-    :func:`ensure_default_indexes` first (the readers do) so every dimension has
-    a coordinate and appears as a column (issue #203).
+    :func:`_ensure_default_indexes` first (the readers do) so every dimension
+    has a coordinate and appears as a column.
 
     Uses the xarray index type to detect cftime coordinates without
     materializing their data — important for Dask/Zarr-backed datasets
diff --git a/xarray_sql/reader.py b/xarray_sql/reader.py
index 65aefec..153ed8c 100644
--- a/xarray_sql/reader.py
+++ b/xarray_sql/reader.py
@@ -24,9 +24,9 @@
     _block_len,
     _block_metadata,
     _block_slices_from_resolved,
+    _ensure_default_indexes,
     _parse_schema,
     block_slices,
-    ensure_default_indexes,
     iter_record_batches,
     resolve_chunks,
 )
@@ -85,7 +85,6 @@ def __init__(
                 each block dict just before it's converted to Arrow. This
                 allows tests to track when iteration actually occurs.
         """
-        ds = ensure_default_indexes(ds)
         self._ds = ds
         self._chunks = chunks
         self._batch_size = batch_size
@@ -187,6 +186,7 @@ def read_xarray(ds: xr.Dataset, chunks: Chunks = None) -> pa.RecordBatchReader:
       A PyArrow RecordBatchReader, which is a table representation of the input
       Dataset.
     """
+    ds = _ensure_default_indexes(ds)
     reader = XarrayRecordBatchReader(ds, chunks=chunks)
     return pa.RecordBatchReader.from_stream(reader)
 
@@ -256,7 +256,7 @@ def read_xarray_table(
     """
     from ._native import LazyArrowStreamTable
 
-    ds = ensure_default_indexes(ds)
+    ds = _ensure_default_indexes(ds)
     schema = _parse_schema(ds)
 
     # Hoist coordinate reads once; avoids N_partitions remote I/O calls for