Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ profile = "black"

[project]
name = "turftopic"
version = "0.25.2"
version = "0.25.3"
description = "Topic modeling with contextual representations from sentence transformers."
authors = [
{ name = "Márton Kardos <power.up1163@gmail.com>", email = "martonkardos@cas.au.dk" }
Expand Down
20 changes: 15 additions & 5 deletions turftopic/late.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import itertools
import warnings
from functools import partial
from typing import Callable, Iterable, Optional, Union

import numpy as np
import scipy.sparse as spr
import torch
from sentence_transformers import SentenceTransformer
from sklearn.base import TransformerMixin
Expand Down Expand Up @@ -208,29 +210,37 @@ def unflatten_repr(
return repr


def pool_flat(
    flat_repr: np.ndarray | spr.sparray, lengths: Lengths, agg=np.nanmean
):
    """Pools vectors within documents using the agg function.

    Parameters
    ----------
    flat_repr: ndarray or sparse array of shape (n_total_tokens, n_dims)
        Flattened document representations.
    lengths: Lengths
        Number of tokens in each document.
    agg: callable, default np.nanmean
        Reduction called as ``agg(rows, axis=0)`` on the rows of one document.

    Returns
    -------
    ndarray or sparse array of shape (n_documents, n_dims)
        Pooled representation for each document.
        Sparse input produces a CSR result, dense input an ndarray.
        When ``lengths`` is empty, an empty result with zero rows is returned.
    """
    is_sparse = spr.issparse(flat_repr)
    if is_sparse:
        # Each pooled document becomes a (1, n_dims) sparse row,
        # and the rows are stacked vertically into one CSR matrix.
        stack = partial(spr.vstack, format="csr")
        as_row = spr.csr_matrix
    else:
        stack = np.stack
        as_row = np.asarray
    pooled = []
    start_index = 0
    for length in lengths:
        end_index = start_index + length
        pooled.append(as_row(agg(flat_repr[start_index:end_index], axis=0)))
        start_index = end_index
    if not pooled:
        # Neither np.stack nor scipy's vstack accept an empty list,
        # so the zero-document result is built explicitly.
        if is_sparse:
            return spr.csr_matrix(
                (0, flat_repr.shape[1]), dtype=flat_repr.dtype
            )
        dense = np.asarray(flat_repr)
        return np.empty((0, *dense.shape[1:]), dtype=dense.dtype)
    return stack(pooled)


def get_document_chunks(
Expand Down
1 change: 1 addition & 0 deletions turftopic/retrieval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .bm25 import BM25Transformer
32 changes: 32 additions & 0 deletions turftopic/retrieval/bm25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import numpy as np
import scipy.sparse as spr
from sklearn.base import BaseEstimator, TransformerMixin


class BM25Transformer(BaseEstimator, TransformerMixin):
    """Re-weights a document-term count matrix with BM25.

    Every entry ``tf`` of document ``d`` and term ``t`` is mapped to
    ``idf[t] * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(d) / avgdl))``,
    where ``len(d)`` is the row sum of the document and ``avgdl`` is the
    mean row sum seen during fitting.

    Parameters
    ----------
    b: float, default 0.7
        Weight of the document-length term in the denominator.
    k1: float, default 8
        Scales the length-normalised term in the denominator.
    """

    def __init__(self, b: float = 0.7, k1: float = 8):
        self.b = b
        self.k1 = k1

    def fit(self, X, y=None):
        """Learns corpus statistics from a document-term matrix.

        Parameters
        ----------
        X: array or sparse array of shape (n_documents, n_terms)
            Term counts per document.
        y: None
            Ignored, present for API compatibility.

        Returns
        -------
        self
        """
        self.N_ = X.shape[0]
        self.avgdl_ = X.sum(axis=1).mean()
        # Number of documents in which each term has a positive count.
        self.term_freq_ = np.ravel(np.asarray((X > 0).sum(axis=0)))
        self.idf_ = np.log(
            (self.N_ - self.term_freq_ + 0.5) / (self.term_freq_ + 0.5)
        )
        return self

    def transform(self, X):
        """Applies BM25 weighting to a document-term matrix.

        Parameters
        ----------
        X: array or sparse array of shape (n_documents, n_terms)
            Term counts per document.

        Returns
        -------
        array or sparse array of shape (n_documents, n_terms)
            BM25 weights. Sparse input gives a CSR sparse array,
            dense input gives a dense result.
        """
        if spr.issparse(X):
            return self._transform_sparse(X)
        d_len = np.ravel(np.asarray(X.sum(axis=1)))
        K_D = 1 - self.b + self.b * d_len / self.avgdl_
        return (
            self.idf_[None, :]
            * (X * (self.k1 + 1))
            / (X + self.k1 * K_D[:, None])
        )

    def _transform_sparse(self, X):
        """Weights only the stored entries, so no dense matrix is built."""
        # Work on a private copy so merging duplicates never touches the input.
        X = spr.csr_array(X, copy=True)
        X.sum_duplicates()
        d_len = np.ravel(np.asarray(X.sum(axis=1)))
        K_D = 1 - self.b + self.b * d_len / self.avgdl_
        # Row index of every stored entry, recovered from the CSR row pointer.
        entry_rows = np.repeat(np.arange(X.shape[0]), np.diff(X.indptr))
        tf = X.data
        weights = (
            self.idf_[X.indices]
            * (tf * (self.k1 + 1))
            / (tf + self.k1 * K_D[entry_rows])
        )
        return spr.csr_array((weights, X.indices, X.indptr), shape=X.shape)

    def fit_transform(self, X, y=None):
        """Fits on ``X`` and returns its BM25-weighted version."""
        return self.fit(X, y).transform(X)
6 changes: 5 additions & 1 deletion turftopic/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,11 @@ def validate_package_versions(remote_versions: dict[str, str]):

def create_readme(model, model_path: str) -> str:
model_structure = str(model)
topics_table = model.export_topics(format="markdown", top_k=10)
try:
topics_table = model.export_topics(format="markdown", top_k=10)
except Exception:
print("Couldn't produce topic table for readme, moving on...")
topics_table = None
local_versions = get_package_versions()
lines = ["| Package | Version |", "| - | - |"]
for package in IMPORTANT_PACKAGES:
Expand Down
3 changes: 3 additions & 0 deletions turftopic/vectorizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from turftopic.vectorizers.latent_terms.latent_terms import (
LatentTermsVectorizer,
)
112 changes: 112 additions & 0 deletions turftopic/vectorizers/latent_terms/latent_terms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import json
import tempfile
from pathlib import Path
from typing import Union

import joblib
import numpy as np
from huggingface_hub import HfApi
from sklearn.base import BaseEstimator, TransformerMixin

from turftopic.late import (
LateSentenceTransformer,
flatten_repr,
pool_flat,
)
from turftopic.serialization import create_readme, get_package_versions
from turftopic.vectorizers.latent_terms.top_k_autoencoder import (
TopKAutoEncoder,
)


class LatentTermsVectorizer(BaseEstimator, TransformerMixin):
    """Vectorizer that turns documents into pooled autoencoder activations.

    Token embeddings from the encoder are passed through the autoencoder,
    summed within each document and square-rooted.

    Parameters
    ----------
    encoder: str or LateSentenceTransformer
        Encoder instance, or a string that is passed to
        ``LateSentenceTransformer`` to construct one.
    autoencoder: TopKAutoEncoder
        Autoencoder whose ``transform`` is applied to the token embeddings.
    concept_labels: ndarray
        Labels returned by ``get_feature_names_out``.
    show_progress_bar: bool, default True
        Forwarded to the encoder and set on the autoencoder.
    """

    def __init__(
        self,
        encoder: str | LateSentenceTransformer,
        autoencoder: TopKAutoEncoder,
        concept_labels: np.ndarray,
        show_progress_bar: bool = True,
    ):
        self.encoder = encoder
        if isinstance(self.encoder, str):
            self._encoder = LateSentenceTransformer(self.encoder)
        else:
            self._encoder = self.encoder
        self.concept_labels = np.array(concept_labels)
        self.autoencoder = autoencoder
        self.show_progress_bar = show_progress_bar
        # Keep the autoencoder's progress reporting in line with this object.
        self.autoencoder.show_progress_bar = show_progress_bar

    def fit(self, raw_documents, y=None):
        """Does nothing and returns self, present for API compatibility."""
        return self

    def transform(self, raw_documents):
        """Computes the pooled activations of each document.

        Parameters
        ----------
        raw_documents: iterable
            Documents to vectorize, materialised into a list before encoding.

        Returns
        -------
        Square root of the per-document sum of autoencoder activations,
        one row per document.
        """
        # The second return value of encode_tokens is not needed here.
        token_embeddings, _offsets = self._encoder.encode_tokens(
            list(raw_documents), show_progress_bar=self.show_progress_bar
        )
        flat_token_embeddings, lengths = flatten_repr(token_embeddings)
        flat_z = self.autoencoder.transform(flat_token_embeddings)
        # Pooling procedure from section 3.2
        pooled_z = pool_flat(flat_z, lengths=lengths, agg=np.sum)
        return np.sqrt(pooled_z)

    def fit_transform(self, raw_documents, y=None):
        """Calls ``fit`` and then ``transform`` on the same documents."""
        return self.fit(raw_documents, y).transform(raw_documents)

    def get_feature_names_out(self):
        """Returns the concept labels passed at construction."""
        return self.concept_labels

    @classmethod
    def from_dict(cls, data):
        """Builds a vectorizer from the mapping produced by ``to_dict``."""
        autoencoder = TopKAutoEncoder.from_dict(data["autoencoder"])
        return cls(
            encoder=data["encoder"],
            autoencoder=autoencoder,
            show_progress_bar=data["show_progress_bar"],
            concept_labels=data["concept_labels"],
        )

    def to_dict(self):
        """Returns the constructor arguments, with the autoencoder as a dict."""
        return dict(
            encoder=self.encoder,
            autoencoder=self.autoencoder.to_dict(),
            show_progress_bar=self.show_progress_bar,
            concept_labels=self.concept_labels,
        )

    def to_disk(self, out_dir: Union[Path, str]):
        """Persists model to directory on your machine.

        Parameters
        ----------
        out_dir: Path | str
            Directory to save the model to.
            Missing parent directories are created as well.
        """
        out_dir = Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        package_versions = get_package_versions()
        # Explicit encoding keeps the file independent of the platform default.
        versions_path = out_dir.joinpath("package_versions.json")
        with versions_path.open("w", encoding="utf-8") as ver_file:
            ver_file.write(json.dumps(package_versions))
        joblib.dump(self, out_dir.joinpath("model.joblib"))

    def push_to_hub(self, repo_id: str):
        """Uploads model to HuggingFace Hub

        Parameters
        ----------
        repo_id: str
            Repository to upload the model to.
        """
        api = HfApi()
        api.create_repo(repo_id, exist_ok=True)
        with tempfile.TemporaryDirectory() as tmp_dir:
            readme_path = Path(tmp_dir).joinpath("README.md")
            # The README may hold non-ASCII text, so the encoding is pinned.
            with readme_path.open("w", encoding="utf-8") as readme_file:
                readme_file.write(create_readme(self, repo_id))
            self.to_disk(tmp_dir)
            api.upload_folder(
                folder_path=tmp_dir,
                repo_id=repo_id,
                repo_type="model",
            )
121 changes: 121 additions & 0 deletions turftopic/vectorizers/latent_terms/top_k_autoencoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""This is an encode-only implementation of the TopK autoencoder.
The training code lives in the x-tabdeveloping/latent_terms GitHub repo"""

import warnings
from functools import partial
from typing import Optional

import numpy as np
import scipy.sparse as spr
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import trange

try:
    import jax.numpy as jnp
    from jax import jit
    from jax.lax import top_k
except ModuleNotFoundError:
    warnings.warn("JAX not found, continuing with NumPy implementation.")
    jnp = np

    def jit(f):
        """Stand-in for ``jax.jit`` that returns ``f`` unchanged."""
        return f

    def top_k(a, k, *, axis=-1):
        """NumPy stand-in for ``jax.lax.top_k``.

        Parameters
        ----------
        a: array-like
            Input values.
        k: int
            Number of largest entries to select, between 1 and the axis size.
        axis: int or None, default -1
            Axis to select along. ``None`` selects from the flattened input.

        Returns
        -------
        (values, indices)
            The ``k`` largest values along ``axis`` in descending order,
            and their positions along that axis.

        Raises
        ------
        ValueError
            If ``k`` is not between 1 and the size of the axis.
        """
        a = np.asarray(a)
        axis_size = a.size if axis is None else a.shape[axis]
        if not 0 < k <= axis_size:
            raise ValueError(
                f"k must be between 1 and {axis_size}, got {k} instead."
            )
        # After partitioning, the last k positions hold the k largest entries,
        # but in no particular order.
        partitioned = np.argpartition(a, axis_size - k, axis=axis)
        topk_indices = np.take(
            partitioned, np.arange(axis_size - k, axis_size), axis=axis
        )
        topk_values = np.take_along_axis(a, topk_indices, axis=axis)
        # Sort the selection in descending order so both backends agree.
        sort_axis = -1 if axis is None else axis
        order = np.flip(
            np.argsort(topk_values, axis=sort_axis), axis=sort_axis
        )
        topk_values = np.take_along_axis(topk_values, order, axis=sort_axis)
        topk_indices = np.take_along_axis(topk_indices, order, axis=sort_axis)
        return topk_values, topk_indices


def top_k_activation(z, k: int):
    """Zeroes out everything below the k-th largest value along the last axis.

    Parameters
    ----------
    z: array of shape (..., n_features)
        Pre-activation values.
    k: int
        Number of largest entries to keep per vector.

    Returns
    -------
    Array of the same shape as ``z``, where entries smaller than
    the k-th largest value of their vector are replaced with 0.
    Entries tied with the k-th largest value are all kept.
    """
    # Both top_k backends select along the last axis by default,
    # so no axis argument is passed.
    values, _ = top_k(z, k)
    # keepdims lets the threshold broadcast against z for any number of
    # leading dimensions.
    threshold = jnp.min(values, axis=-1, keepdims=True)
    return jnp.where(z >= threshold, z, 0)


def encode(params, x, k: int):
    """Applies the encoder layer followed by the TopK activation.

    Parameters
    ----------
    params: mapping
        Must hold the encoder weights under "W_e" and the bias under "b_e".
    x: array
        Inputs, multiplied with the encoder weights from the left.
    k: int
        Number of activations to keep, passed on to ``top_k_activation``.

    Returns
    -------
    The result of ``top_k_activation`` on the affine projection of ``x``.
    """
    weights = params["W_e"]
    bias = params["b_e"]
    pre_activation = x @ weights + bias
    return top_k_activation(pre_activation, k)


class TopKAutoEncoder(BaseEstimator, TransformerMixin):
    """Encode-only TopK autoencoder.

    ``fit`` is a no-op in this class. The weights are expected to be loaded
    with ``from_dict``, which sets ``coef_``, ``intercept_``, ``coef_d_``,
    ``intercept_d_`` and ``loss_curve_``.

    Parameters
    ----------
    n_latent: int, default 32768
        Stored as a hyperparameter, not read by the code in this class.
    top_k: int, default 16
        Number of activations kept per input row in ``transform``.
    lr: float, default 1e-3
        Stored as a hyperparameter, not read by the code in this class.
    batch_size: int, default 4096
        Number of rows encoded at a time in ``transform``.
    n_epochs: int, default 10
        Stored as a hyperparameter, not read by the code in this class.
    alpha: float, default 0.03
        Stored as a hyperparameter, not read by the code in this class.
    show_progress_bar: bool, default True
        Whether ``transform`` displays a progress bar over batches.
    random_state: int, optional
        Stored as a hyperparameter, not read by the code in this class.
    """

    def __init__(
        self,
        n_latent: int = 32768,
        top_k: int = 16,
        lr: float = 1e-3,
        batch_size: int = 4096,
        n_epochs: int = 10,
        alpha: float = 0.03,
        show_progress_bar: bool = True,
        random_state: Optional[int] = None,
    ):
        self.random_state = random_state
        self.n_latent = n_latent
        self.lr = lr
        self.alpha = alpha
        self.top_k = top_k
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.show_progress_bar = show_progress_bar

    def fit(self, X, y=None):
        """Does nothing and returns self."""
        # Training is implemented here: https://github.com/x-tabdeveloping/latent_terms
        return self

    def to_dict(self) -> dict:
        """Returns hyperparameters, weights and loss curve as a dict."""
        return dict(
            attr=self.get_params(),
            params=self._params,
            loss_curve=self.loss_curve_,
        )

    @classmethod
    def from_dict(cls, data):
        """Builds an autoencoder from the mapping produced by ``to_dict``."""
        obj = cls(**data["attr"])
        params = data["params"]
        obj.coef_ = np.array(params["W_e"])
        obj.coef_d_ = np.array(params["W_d"])
        obj.intercept_ = np.array(params["b_e"])
        obj.intercept_d_ = np.array(params["b_d"])
        obj.loss_curve_ = data["loss_curve"]
        return obj

    @property
    def _params(self):
        """Weights and biases under the keys used by ``encode``."""
        return {
            "W_e": self.coef_,
            "b_e": self.intercept_,
            "W_d": self.coef_d_,
            "b_d": self.intercept_d_,
        }

    def transform(self, X):
        """Encodes the rows of X batch by batch.

        Parameters
        ----------
        X: array or sparse array of shape (n_samples, n_features)
            Inputs to encode. Sparse input is converted to a dense array.

        Returns
        -------
        CSR sparse array with one row per row of ``X``.
        An input with no rows gives a result with no rows.
        """
        if spr.issparse(X):
            # toarray always gives a plain ndarray,
            # while todense returns np.matrix for sparse matrices.
            X = X.toarray()
        params = self._params
        n_samples = X.shape[0]
        if n_samples == 0:
            # vstack cannot stack an empty list of batches.
            encoder_weights = params["W_e"]
            return spr.csr_array(
                (0, encoder_weights.shape[1]), dtype=encoder_weights.dtype
            )
        _encode = jit(partial(encode, params=params, k=self.top_k))
        Z = []
        for batch_start in trange(
            0,
            n_samples,
            self.batch_size,
            leave=False,
            desc="Going through all batches",
            disable=not self.show_progress_bar,
        ):
            batch_end = batch_start + self.batch_size
            batch_z = _encode(x=X[batch_start:batch_end])
            # Convert explicitly, as the batch may not be a NumPy array.
            Z.append(spr.csr_array(np.asarray(batch_z)))
        return spr.vstack(Z, format="csr")

    def fit_transform(self, X, y=None):
        """Calls ``fit`` and then ``transform`` on the same data."""
        return self.fit(X, y).transform(X)
Loading