diff --git a/pyproject.toml b/pyproject.toml index c268f25d..e2110519 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ profile = "black" [project] name = "turftopic" -version = "0.25.2" +version = "0.25.3" description = "Topic modeling with contextual representations from sentence transformers." authors = [ { name = "Márton Kardos ", email = "martonkardos@cas.au.dk" } diff --git a/turftopic/late.py b/turftopic/late.py index a3423bf3..03f7d248 100644 --- a/turftopic/late.py +++ b/turftopic/late.py @@ -1,8 +1,10 @@ import itertools import warnings +from functools import partial from typing import Callable, Iterable, Optional, Union import numpy as np +import scipy.sparse as spr import torch from sentence_transformers import SentenceTransformer from sklearn.base import TransformerMixin @@ -208,29 +210,37 @@ def unflatten_repr( return repr -def pool_flat(flat_repr: np.ndarray, lengths: Lengths, agg=np.nanmean): +def pool_flat( + flat_repr: np.ndarray | spr.sparray, lengths: Lengths, agg=np.nanmean +): """Pools vectors within documents using the agg function. Parameters ---------- - flat_repr: ndarray of shape (n_total_tokens, n_dims) + flat_repr: ndarray or sparse array of shape (n_total_tokens, n_dims) Flattened document representations. lengths: Lengths Number of tokens in each document. Returns ------- - ndarray of shape (n_documents, n_dims) + ndarray or sparse array of shape (n_documents, n_dims) Pooled representation for each document. 
""" + if spr.issparse(flat_repr): + stack = partial(spr.vstack, format="csr") + array = spr.csr_matrix + else: + stack = np.stack + array = np.asarray pooled = [] start_index = 0 for length in lengths: pooled.append( - agg(flat_repr[start_index : start_index + length], axis=0) + array(agg(flat_repr[start_index : start_index + length], axis=0)) ) start_index += length - return np.stack(pooled) + return stack(pooled) def get_document_chunks( diff --git a/turftopic/retrieval/__init__.py b/turftopic/retrieval/__init__.py new file mode 100644 index 00000000..4759742d --- /dev/null +++ b/turftopic/retrieval/__init__.py @@ -0,0 +1 @@ +from .bm25 import BM25Transformer diff --git a/turftopic/retrieval/bm25.py b/turftopic/retrieval/bm25.py new file mode 100644 index 00000000..8c100c78 --- /dev/null +++ b/turftopic/retrieval/bm25.py @@ -0,0 +1,32 @@ +import numpy as np +import scipy.sparse as spr +from sklearn.base import BaseEstimator, TransformerMixin + + +class BM25Transformer(BaseEstimator, TransformerMixin): + def __init__(self, b: float = 0.7, k1: float = 8): + self.b = b + self.k1 = k1 + + def fit(self, X, y=None): + self.N_ = X.shape[0] + self.avgdl_ = X.sum(axis=1).mean() + self.term_freq_ = np.ravel(np.asarray((X > 0).sum(axis=0))) + self.idf_ = np.log( + (self.N_ - self.term_freq_ + 0.5) / (self.term_freq_ + 0.5) + ) + return self + + def transform(self, X): + if spr.issparse(X): + X = spr.csr_array(X) + d_len = np.ravel(np.asarray(X.sum(axis=1))) + K_D = 1 - self.b + self.b * d_len / self.avgdl_ + return ( + self.idf_[None, :] + * (X * (self.k1 + 1)) + / (X + self.k1 * K_D[:, None]) + ) + + def fit_transform(self, X, y=None): + return self.fit(X, y).transform(X) diff --git a/turftopic/serialization.py b/turftopic/serialization.py index fa1ced19..8336a4ed 100644 --- a/turftopic/serialization.py +++ b/turftopic/serialization.py @@ -77,7 +77,11 @@ def validate_package_versions(remote_versions: dict[str, str]): def create_readme(model, model_path: str) -> str: 
class LatentTermsVectorizer(BaseEstimator, TransformerMixin):
    """Vectorizer that represents documents by pooled autoencoder activations.

    Documents are encoded into token embeddings, every token embedding is
    passed through the given autoencoder, and the resulting activations are
    summed within each document and square-rooted.

    Parameters
    ----------
    encoder: str or LateSentenceTransformer
        Either an already constructed late-interaction encoder, or a string
        that is passed to ``LateSentenceTransformer`` to construct one.
    autoencoder: TopKAutoEncoder
        Autoencoder used to transform token embeddings.
        Its ``show_progress_bar`` attribute is overwritten with the value
        given to this vectorizer.
    concept_labels: array-like
        Labels of the output features; stored as a NumPy array and returned
        by ``get_feature_names_out``.
    show_progress_bar: bool, default True
        Whether progress bars are shown while encoding.
    """

    # NOTE(review): __init__ does more than store its arguments (it builds the
    # encoder and copies concept_labels), which may not play well with
    # sklearn.base.clone — confirm before relying on cloning.
    def __init__(
        self,
        encoder: str | LateSentenceTransformer,
        autoencoder: TopKAutoEncoder,
        concept_labels: np.ndarray,
        show_progress_bar: bool = True,
    ):
        self.encoder = encoder
        if isinstance(self.encoder, str):
            self._encoder = LateSentenceTransformer(self.encoder)
        else:
            self._encoder = self.encoder
        self.concept_labels = np.array(concept_labels)
        self.autoencoder = autoencoder
        self.show_progress_bar = show_progress_bar
        self.autoencoder.show_progress_bar = show_progress_bar

    def fit(self, raw_documents, y=None):
        """Does nothing; present for API compatibility. Returns ``self``."""
        return self

    def transform(self, raw_documents):
        """Turns documents into pooled latent activations.

        Parameters
        ----------
        raw_documents: iterable
            Documents to vectorize; materialized into a list before encoding.

        Returns
        -------
        Element-wise square root of the per-document sum of the
        autoencoder's token-level outputs.
        """
        # The token offsets returned by the encoder are not needed here.
        token_embeddings, _ = self._encoder.encode_tokens(
            list(raw_documents), show_progress_bar=self.show_progress_bar
        )
        flat_token_embeddings, lengths = flatten_repr(token_embeddings)
        flat_z = self.autoencoder.transform(flat_token_embeddings)
        # Pooling procedure from section 3.2
        pooled_z = pool_flat(flat_z, lengths=lengths, agg=np.sum)
        return np.sqrt(pooled_z)

    def fit_transform(self, raw_documents, y=None):
        """Calls ``fit`` and then ``transform`` on the same documents."""
        return self.fit(raw_documents, y).transform(raw_documents)

    def get_feature_names_out(self, input_features=None):
        """Returns the concept labels as output feature names.

        Parameters
        ----------
        input_features: ignored
            Accepted only so that callers following the scikit-learn
            convention of passing input feature names (e.g. ``Pipeline``)
            do not fail; the value is not used.
        """
        return self.concept_labels

    @classmethod
    def from_dict(cls, data):
        """Builds a vectorizer from the mapping produced by ``to_dict``.

        Reads the ``"encoder"``, ``"autoencoder"``, ``"show_progress_bar"``
        and ``"concept_labels"`` keys of ``data``.
        """
        autoencoder = TopKAutoEncoder.from_dict(data["autoencoder"])
        return cls(
            encoder=data["encoder"],
            autoencoder=autoencoder,
            show_progress_bar=data["show_progress_bar"],
            concept_labels=data["concept_labels"],
        )

    def to_dict(self):
        """Returns the constructor arguments as a dict.

        The autoencoder is converted with its own ``to_dict``; the other
        values are returned as stored on the instance.
        """
        return dict(
            encoder=self.encoder,
            autoencoder=self.autoencoder.to_dict(),
            show_progress_bar=self.show_progress_bar,
            concept_labels=self.concept_labels,
        )

    def to_disk(self, out_dir: Union[Path, str]):
        """Persists model to directory on your machine.

        Writes ``package_versions.json`` and ``model.joblib`` into the
        directory, creating it (and missing parents) when needed.

        Parameters
        ----------
        out_dir: Path | str
            Directory to save the model to.
        """
        out_dir = Path(out_dir)
        # parents=True so that nested output paths can be created in one go.
        out_dir.mkdir(parents=True, exist_ok=True)
        package_versions = get_package_versions()
        versions_path = out_dir.joinpath("package_versions.json")
        with versions_path.open("w", encoding="utf-8") as ver_file:
            ver_file.write(json.dumps(package_versions))
        joblib.dump(self, out_dir.joinpath("model.joblib"))

    def push_to_hub(self, repo_id: str):
        """Uploads model to HuggingFace Hub

        A README and the files written by ``to_disk`` are placed in a
        temporary directory, which is then uploaded as a model repository.

        Parameters
        ----------
        repo_id: str
            Repository to upload the model to.
        """
        api = HfApi()
        api.create_repo(repo_id, exist_ok=True)
        with tempfile.TemporaryDirectory() as tmp_dir:
            readme_path = Path(tmp_dir).joinpath("README.md")
            # Explicit UTF-8: the README text is not guaranteed to be ASCII
            # and the platform default encoding may not be able to encode it.
            with readme_path.open("w", encoding="utf-8") as readme_file:
                readme_file.write(create_readme(self, repo_id))
            self.to_disk(tmp_dir)
            api.upload_folder(
                folder_path=tmp_dir,
                repo_id=repo_id,
                repo_type="model",
            )
class TopKAutoEncoder(BaseEstimator, TransformerMixin):
    """Encode-only TopK autoencoder.

    Only the encoder pass is implemented here; ``fit`` is a no-op and the
    weights are expected to be attached via ``from_dict``.

    Parameters
    ----------
    n_latent: int, default 32768
        Stored on the instance; not used by the encode-only code in this
        class (presumably the size of the latent layer — confirm against
        the training code).
    top_k: int, default 16
        ``k`` passed to the TopK activation in ``transform``.
    lr: float, default 1e-3
        Stored only; not used by this class.
    batch_size: int, default 4096
        Number of rows encoded at once in ``transform``.
    n_epochs: int, default 10
        Stored only; not used by this class.
    alpha: float, default 0.03
        Stored only; not used by this class.
    show_progress_bar: bool, default True
        Whether a progress bar over batches is shown in ``transform``.
    random_state: int, optional
        Stored only; not used by this class.
    """

    def __init__(
        self,
        n_latent: int = 32768,
        top_k: int = 16,
        lr: float = 1e-3,
        batch_size: int = 4096,
        n_epochs: int = 10,
        alpha: float = 0.03,
        show_progress_bar: bool = True,
        random_state: Optional[int] = None,
    ):
        self.random_state = random_state
        self.n_latent = n_latent
        self.lr = lr
        self.alpha = alpha
        self.top_k = top_k
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.show_progress_bar = show_progress_bar

    def fit(self, X, y=None):
        """Does nothing and returns ``self``."""
        # Training is implemented here: https://github.com/x-tabdeveloping/latent_terms
        return self

    def to_dict(self) -> dict:
        """Returns hyperparameters, weights and loss curve as a dict.

        Requires the fitted attributes (``coef_``, ``intercept_``,
        ``coef_d_``, ``intercept_d_``, ``loss_curve_``) to be set.
        """
        return dict(
            attr=self.get_params(),
            params=self._params,
            loss_curve=self.loss_curve_,
        )

    @classmethod
    def from_dict(cls, data):
        """Builds an autoencoder from the mapping produced by ``to_dict``.

        ``data["attr"]`` is passed to the constructor, the entries of
        ``data["params"]`` are converted to NumPy arrays and attached as
        fitted attributes, and ``data["loss_curve"]`` is stored as is.
        """
        obj = cls(**data["attr"])
        params = data["params"]
        obj.coef_ = np.array(params["W_e"])
        obj.coef_d_ = np.array(params["W_d"])
        obj.intercept_ = np.array(params["b_e"])
        obj.intercept_d_ = np.array(params["b_d"])
        obj.loss_curve_ = data["loss_curve"]
        return obj

    @property
    def _params(self):
        """Fitted weights keyed by the names the ``encode`` function reads."""
        return {
            "W_e": self.coef_,
            "b_e": self.intercept_,
            "W_d": self.coef_d_,
            "b_d": self.intercept_d_,
        }

    def transform(self, X):
        """Encodes the rows of ``X`` batch by batch.

        Parameters
        ----------
        X: array or sparse array of shape (n_samples, n_features)
            Input vectors; sparse input is densified before encoding.

        Returns
        -------
        CSR sparse array with one row per row of ``X``, holding the
        TopK-activated encoder outputs.
        """
        if spr.issparse(X):
            # toarray() always gives a plain ndarray; todense() would return
            # np.matrix for the legacy sparse matrix classes, whose indexing
            # semantics break the broadcasting done in the activation.
            X = X.toarray()
        n_samples = X.shape[0]
        if n_samples == 0:
            # scipy.sparse.vstack cannot stack an empty list of batches.
            # The output width is the column count of the encoder weights.
            return spr.csr_array((0, self.coef_.shape[1]))
        Z = []
        _encode = jit(partial(encode, params=self._params, k=self.top_k))
        for batch_start in trange(
            0,
            n_samples,
            self.batch_size,
            leave=False,
            desc="Going through all batches",
            disable=not self.show_progress_bar,
        ):
            batch_end = batch_start + self.batch_size
            batch_x = X[batch_start:batch_end]
            batch_z = _encode(x=batch_x)
            # Each batch is stored sparsely before the next one is encoded.
            Z.append(spr.csr_array(batch_z))
        return spr.vstack(Z, format="csr")

    def fit_transform(self, X, y=None):
        """Calls ``fit`` and then ``transform`` on the same data."""
        return self.fit(X, y).transform(X)