x-tabdeveloping · x-tabdeveloping · Jun 11, 2026 · Jun 10, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,7 @@ profile = "black"
 
 [project]
 name = "turftopic"
-version = "0.25.2"
+version = "0.25.3"
 description = "Topic modeling with contextual representations from sentence transformers."
 authors = [
    { name = "Márton Kardos <power.up1163@gmail.com>", email = "martonkardos@cas.au.dk" }

diff --git a/turftopic/late.py b/turftopic/late.py
@@ -1,8 +1,10 @@
 import itertools
 import warnings
+from functools import partial
 from typing import Callable, Iterable, Optional, Union
 
 import numpy as np
+import scipy.sparse as spr
 import torch
 from sentence_transformers import SentenceTransformer
 from sklearn.base import TransformerMixin
@@ -208,29 +210,37 @@ def unflatten_repr(
     return repr
 
 
-def pool_flat(flat_repr: np.ndarray, lengths: Lengths, agg=np.nanmean):
+def pool_flat(
+    flat_repr: np.ndarray | spr.sparray, lengths: Lengths, agg=np.nanmean
+):
     """Pools vectors within documents using the agg function.
 
     Parameters
     ----------
-    flat_repr: ndarray of shape (n_total_tokens, n_dims)
+    flat_repr: ndarray or sparse array of shape (n_total_tokens, n_dims)
         Flattened document representations.
     lengths: Lengths
         Number of tokens in each document.
 
     Returns
     -------
-    ndarray of shape (n_documents, n_dims)
+    ndarray (or CSR matrix if sparse) of shape (n_documents, n_dims)
         Pooled representation for each document.
     """
+    if spr.issparse(flat_repr):  # sparse input: keep the pooled rows sparse
+        stack = partial(spr.vstack, format="csr")
+        array = spr.csr_matrix  # wraps each pooled row for spr.vstack
+    else:
+        stack = np.stack
+        array = np.asarray  # pooled row as a plain ndarray for np.stack
     pooled = []
     start_index = 0
     for length in lengths:
         pooled.append(
-            agg(flat_repr[start_index : start_index + length], axis=0)
+            array(agg(flat_repr[start_index : start_index + length], axis=0))
         )
         start_index += length
-    return np.stack(pooled)
+    return stack(pooled)
 
 
 def get_document_chunks(

diff --git a/turftopic/retrieval/__init__.py b/turftopic/retrieval/__init__.py
@@ -0,0 +1 @@
+from .bm25 import BM25Transformer
diff --git a/turftopic/retrieval/bm25.py b/turftopic/retrieval/bm25.py
@@ -0,0 +1,89 @@
+import numpy as np
+import scipy.sparse as spr
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class BM25Transformer(BaseEstimator, TransformerMixin):
+    """Reweights a document-term count matrix with BM25 term weights.
+
+    Parameters
+    ----------
+    b: float, default 0.7
+        Strength of the document-length normalization.
+    k1: float, default 8
+        Term-frequency saturation parameter.
+
+    Attributes
+    ----------
+    N_: int
+        Number of documents (rows) seen in ``fit``.
+    avgdl_: float
+        Mean document length (row sum) seen in ``fit``.
+    term_freq_: ndarray of shape (n_terms,)
+        Number of documents each term occurs in.
+    idf_: ndarray of shape (n_terms,)
+        Inverse document frequency of each term.
+    """
+
+    def __init__(self, b: float = 0.7, k1: float = 8):
+        self.b = b
+        self.k1 = k1
+
+    def fit(self, X, y=None):
+        """Learns corpus size, average length and IDF weights from X."""
+        self.N_ = X.shape[0]
+        self.avgdl_ = X.sum(axis=1).mean()
+        self.term_freq_ = np.ravel(np.asarray((X > 0).sum(axis=0)))
+        self.idf_ = np.log(
+            (self.N_ - self.term_freq_ + 0.5) / (self.term_freq_ + 0.5)
+        )
+        return self
+
+    def transform(self, X):
+        """Computes BM25 weights for a document-term matrix.
+
+        Parameters
+        ----------
+        X: ndarray or sparse array of shape (n_documents, n_terms)
+            Term counts per document.
+
+        Returns
+        -------
+        ndarray or CSR sparse array of shape (n_documents, n_terms)
+            BM25 weights; sparse input gives sparse output.
+        """
+        if spr.issparse(X):
+            return self._transform_sparse(X)
+        d_len = np.ravel(np.asarray(X.sum(axis=1)))
+        K_D = 1 - self.b + self.b * d_len / self.avgdl_
+        return (
+            self.idf_[None, :]
+            * (X * (self.k1 + 1))
+            / (X + self.k1 * K_D[:, None])
+        )
+
+    def _transform_sparse(self, X):
+        """Applies the BM25 formula to the stored entries of sparse X."""
+        # Private copy: sum_duplicates() below modifies the array in place.
+        X = spr.csr_array(X, copy=True)
+        X.sum_duplicates()
+        d_len = np.ravel(np.asarray(X.sum(axis=1)))
+        K_D = 1 - self.b + self.b * d_len / self.avgdl_
+        # Row index of every stored entry, aligned with X.data / X.indices
+        rows = np.repeat(np.arange(X.shape[0]), np.diff(X.indptr))
+        tf = X.data
+        # Only stored entries are computed: adding the dense length
+        # normalizer to X would allocate a full (n_documents, n_terms)
+        # dense array, and unstored (zero) counts keep zero weight anyway.
+        weights = (
+            self.idf_[X.indices]
+            * (tf * (self.k1 + 1))
+            / (tf + self.k1 * K_D[rows])
+        )
+        return spr.csr_array(
+            (weights, X.indices, X.indptr), shape=X.shape
+        )
+
+    def fit_transform(self, X, y=None):
+        """Fits to X, then returns its BM25-weighted version."""
+        return self.fit(X, y).transform(X)
diff --git a/turftopic/serialization.py b/turftopic/serialization.py
@@ -77,7 +77,11 @@ def validate_package_versions(remote_versions: dict[str, str]):
 
 def create_readme(model, model_path: str) -> str:
     model_structure = str(model)
-    topics_table = model.export_topics(format="markdown", top_k=10)
+    try:
+        topics_table = model.export_topics(format="markdown", top_k=10)
+    except Exception:
+        print("Couldn't produce topic table for readme, moving on...")
+        topics_table = None
     local_versions = get_package_versions()
     lines = ["| Package | Version |", "| - | - |"]
     for package in IMPORTANT_PACKAGES:

diff --git a/turftopic/vectorizers/__init__.py b/turftopic/vectorizers/__init__.py
@@ -0,0 +1,3 @@
+from turftopic.vectorizers.latent_terms.latent_terms import (
+    LatentTermsVectorizer,
+)
diff --git a/turftopic/vectorizers/latent_terms/latent_terms.py b/turftopic/vectorizers/latent_terms/latent_terms.py
@@ -0,0 +1,148 @@
+import json
+import tempfile
+from pathlib import Path
+from typing import Union
+
+import joblib
+import numpy as np
+from huggingface_hub import HfApi
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from turftopic.late import (
+    LateSentenceTransformer,
+    flatten_repr,
+    pool_flat,
+)
+from turftopic.serialization import create_readme, get_package_versions
+from turftopic.vectorizers.latent_terms.top_k_autoencoder import (
+    TopKAutoEncoder,
+)
+
+
+class LatentTermsVectorizer(BaseEstimator, TransformerMixin):
+    """Vectorizes documents as pooled TopK-autoencoder activations.
+
+    Parameters
+    ----------
+    encoder: str or LateSentenceTransformer
+        Token-level encoder. A string is loaded as a
+        LateSentenceTransformer.
+    autoencoder: TopKAutoEncoder
+        Autoencoder that maps token embeddings to latent activations.
+    concept_labels: array-like
+        Feature names, returned by ``get_feature_names_out``.
+    show_progress_bar: bool, default True
+        Whether to show progress bars. Also set on the autoencoder.
+    """
+
+    def __init__(
+        self,
+        encoder: str | LateSentenceTransformer,
+        autoencoder: TopKAutoEncoder,
+        concept_labels: np.ndarray,
+        show_progress_bar: bool = True,
+    ):
+        self.encoder = encoder
+        if isinstance(self.encoder, str):
+            self._encoder = LateSentenceTransformer(self.encoder)
+        else:
+            self._encoder = self.encoder
+        self.concept_labels = np.array(concept_labels)
+        self.autoencoder = autoencoder
+        self.show_progress_bar = show_progress_bar
+        self.autoencoder.show_progress_bar = show_progress_bar
+
+    def fit(self, raw_documents, y=None):
+        """Does nothing; present for scikit-learn compatibility."""
+        return self
+
+    def transform(self, raw_documents):
+        """Turns documents into document-level latent activations.
+
+        Parameters
+        ----------
+        raw_documents: iterable of str
+            Documents to vectorize.
+
+        Returns
+        -------
+        array of shape (n_documents, n_latent)
+            Square root of the per-document sum of token activations.
+        """
+        # The encoder's second return value (offsets) is not needed here.
+        token_embeddings, _ = self._encoder.encode_tokens(
+            list(raw_documents), show_progress_bar=self.show_progress_bar
+        )
+        flat_token_embeddings, lengths = flatten_repr(token_embeddings)
+        flat_z = self.autoencoder.transform(flat_token_embeddings)
+        # Pooling procedure from section 3.2
+        pooled_z = pool_flat(flat_z, lengths=lengths, agg=np.sum)
+        return np.sqrt(pooled_z)
+
+    def fit_transform(self, raw_documents, y=None):
+        """Equivalent to ``transform``, since fitting is a no-op."""
+        return self.fit(raw_documents, y).transform(raw_documents)
+
+    def get_feature_names_out(self):
+        """Returns the concept labels as feature names."""
+        return self.concept_labels
+
+    @classmethod
+    def from_dict(cls, data):
+        """Rebuilds a vectorizer from the output of ``to_dict``."""
+        autoencoder = TopKAutoEncoder.from_dict(data["autoencoder"])
+        return cls(
+            encoder=data["encoder"],
+            autoencoder=autoencoder,
+            show_progress_bar=data["show_progress_bar"],
+            concept_labels=data["concept_labels"],
+        )
+
+    def to_dict(self):
+        """Returns the constructor arguments, autoencoder in dict form."""
+        return dict(
+            encoder=self.encoder,
+            autoencoder=self.autoencoder.to_dict(),
+            show_progress_bar=self.show_progress_bar,
+            concept_labels=self.concept_labels,
+        )
+
+    def to_disk(self, out_dir: Union[Path, str]):
+        """Persists model to directory on your machine.
+
+        Parameters
+        ----------
+        out_dir: Path | str
+            Directory to save the model to. Created, along with any
+            missing parents, if it does not exist.
+        """
+        out_dir = Path(out_dir)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        package_versions = get_package_versions()
+        versions_path = out_dir.joinpath("package_versions.json")
+        with versions_path.open("w", encoding="utf-8") as ver_file:
+            ver_file.write(json.dumps(package_versions))
+        joblib.dump(self, out_dir.joinpath("model.joblib"))
+
+    def push_to_hub(self, repo_id: str):
+        """Uploads model to HuggingFace Hub
+
+        Parameters
+        ----------
+        repo_id: str
+            Repository to upload the model to.
+        """
+        api = HfApi()
+        api.create_repo(repo_id, exist_ok=True)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            readme_path = Path(tmp_dir).joinpath("README.md")
+            # Explicit UTF-8: the platform default encoding may not be
+            # able to represent every character of the README.
+            with readme_path.open("w", encoding="utf-8") as readme_file:
+                readme_file.write(create_readme(self, repo_id))
+            self.to_disk(tmp_dir)
+            api.upload_folder(
+                folder_path=tmp_dir,
+                repo_id=repo_id,
+                repo_type="model",
+            )
diff --git a/turftopic/vectorizers/latent_terms/top_k_autoencoder.py b/turftopic/vectorizers/latent_terms/top_k_autoencoder.py
@@ -0,0 +1,183 @@
+"""This is an encode-only implementation of the TopK autoencoder.
+The training code lives in the x-tabdeveloping/latent_terms GitHub repo"""
+
+import warnings
+from functools import partial
+from typing import Optional
+
+import numpy as np
+import scipy.sparse as spr
+from sklearn.base import BaseEstimator, TransformerMixin
+from tqdm import trange
+
+try:
+    import jax.numpy as jnp
+    from jax import jit
+    from jax.lax import top_k
+except ModuleNotFoundError:
+    warnings.warn("JAX not found, continuing with NumPy implementation.")
+    jnp = np
+
+    # Dummy JIT as the identity function
+    def jit(f):
+        return f
+
+    # NumPy stand-in for jax.lax.top_k
+    def top_k(a, k, *, axis=-1):
+        """Returns the k largest values along axis and their indices.
+
+        The k entries come back in no particular order.
+        """
+        if axis is None:
+            axis_size = a.size
+        else:
+            axis_size = a.shape[axis]
+        # Partitioning puts the k largest entries in the last k slots
+        index_array = np.argpartition(a, axis_size - k, axis=axis)
+        topk_indices = np.take(index_array, -np.arange(k) - 1, axis=axis)
+        topk_values = np.take_along_axis(a, topk_indices, axis=axis)
+        return topk_values, topk_indices
+
+
+def top_k_activation(z, k: int):
+    """Zeroes every entry of z below the k-th largest value of its row.
+
+    Parameters
+    ----------
+    z: array of shape (n_rows, n_latent)
+        Pre-activations.
+    k: int
+        Number of top entries that set the per-row threshold.
+
+    Returns
+    -------
+    array of shape (n_rows, n_latent)
+        z with entries below the row threshold replaced by 0.
+    """
+    values, _ = top_k(z, k=k, axis=-1)
+    threshold = jnp.min(values, axis=-1)
+    # <= keeps entries tied with the k-th largest value as well
+    condition = threshold[:, None] <= z
+    return jnp.where(condition, z, 0)
+
+
+def encode(params, x, k: int):
+    """Applies the encoder's affine map, then the TopK activation.
+
+    Parameters
+    ----------
+    params: dict
+        Must contain the encoder weights "W_e" and bias "b_e".
+    x: array of shape (n_rows, n_features)
+        Input vectors.
+    k: int
+        Passed on to ``top_k_activation``.
+    """
+    z = x @ params["W_e"] + params["b_e"]
+    return top_k_activation(z, k)
+
+
+class TopKAutoEncoder(BaseEstimator, TransformerMixin):
+    """Encode-only TopK autoencoder.
+
+    Only ``top_k``, ``batch_size`` and ``show_progress_bar`` are used in
+    this module; the remaining parameters are stored unchanged so that
+    they survive ``to_dict``/``from_dict``.
+
+    The fitted attributes ``coef_``, ``intercept_``, ``coef_d_``,
+    ``intercept_d_`` and ``loss_curve_`` are only set by ``from_dict``.
+    """
+
+    def __init__(
+        self,
+        n_latent: int = 32768,
+        top_k: int = 16,
+        lr: float = 1e-3,
+        batch_size: int = 4096,
+        n_epochs: int = 10,
+        alpha: float = 0.03,
+        show_progress_bar: bool = True,
+        random_state: Optional[int] = None,
+    ):
+        self.random_state = random_state
+        self.n_latent = n_latent
+        self.lr = lr
+        self.alpha = alpha
+        self.top_k = top_k
+        self.batch_size = batch_size
+        self.n_epochs = n_epochs
+        self.show_progress_bar = show_progress_bar
+
+    def fit(self, X, y=None):
+        # Training is implemented here: https://github.com/x-tabdeveloping/latent_terms
+        return self
+
+    def to_dict(self) -> dict:
+        """Returns hyperparameters, weights and the loss curve as a dict."""
+        return dict(
+            attr=self.get_params(),
+            params=self._params,
+            loss_curve=self.loss_curve_,
+        )
+
+    @classmethod
+    def from_dict(cls, data):
+        """Rebuilds an autoencoder from the output of ``to_dict``."""
+        obj = cls(**data["attr"])
+        params = data["params"]
+        obj.coef_ = np.array(params["W_e"])
+        obj.coef_d_ = np.array(params["W_d"])
+        obj.intercept_ = np.array(params["b_e"])
+        obj.intercept_d_ = np.array(params["b_d"])
+        obj.loss_curve_ = data["loss_curve"]
+        return obj
+
+    @property
+    def _params(self):
+        """Weights keyed the way ``encode`` and ``from_dict`` expect."""
+        return {
+            "W_e": self.coef_,
+            "b_e": self.intercept_,
+            "W_d": self.coef_d_,
+            "b_d": self.intercept_d_,
+        }
+
+    def transform(self, X):
+        """Encodes X batch by batch into sparse latent activations.
+
+        Parameters
+        ----------
+        X: ndarray or sparse array of shape (n_rows, n_features)
+            Vectors to encode.
+
+        Returns
+        -------
+        CSR sparse array of shape (n_rows, n_latent)
+            TopK activations of every row.
+        """
+        if spr.issparse(X):
+            # toarray() always gives an ndarray; todense() gives
+            # np.matrix for sparse matrices.
+            X = X.toarray()
+        if X.shape[0] == 0:
+            # No batches to stack; spr.vstack rejects an empty list.
+            return spr.csr_array((0, self.coef_.shape[1]))
+        Z = []
+        _encode = jit(partial(encode, params=self._params, k=self.top_k))
+        for batch_start in trange(
+            0,
+            X.shape[0],
+            self.batch_size,
+            leave=False,
+            desc="Going through all batches",
+            disable=not self.show_progress_bar,
+        ):
+            batch_end = batch_start + self.batch_size
+            batch_x = X[batch_start:batch_end]
+            batch_z = _encode(x=batch_x)
+            Z.append(spr.csr_array(batch_z))
+        return spr.vstack(Z, format="csr")
+
+    def fit_transform(self, X, y=None):
+        """Equivalent to ``transform``, since fitting is a no-op."""
+        return self.fit(X, y).transform(X)