From fd1eec1c725b3a26244dbe5ed1d66e4a4cf3c089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos=C3=98?= Date: Tue, 30 Jun 2026 15:02:39 +0200 Subject: [PATCH 1/3] Fixed top documents update in SensTopic --- turftopic/models/senstopic.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/turftopic/models/senstopic.py b/turftopic/models/senstopic.py index e7382f1..b97320f 100644 --- a/turftopic/models/senstopic.py +++ b/turftopic/models/senstopic.py @@ -279,13 +279,11 @@ def partial_fit( *self.topic_names[-n_new_components:], ] console.log("Updated term importances") - self.top_documents.extend( - self.get_top_documents( - raw_documents, - document_topic_matrix=doc_topic[:, -n_new_components:], + for new_dt in doc_topic[:, -n_new_components:].T: + top = np.argsort(-new_dt) + self.top_documents.append( + [raw_documents[i_top] for i_top in top] ) - ) - self.document_topic_matrix = doc_topic console.log("Model update done.") return self From c34d845fbda1c9b7b6b500b7cbbf0125411879a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos=C3=98?= Date: Tue, 30 Jun 2026 16:15:36 +0200 Subject: [PATCH 2/3] Added dynamic online topic modelling to SensTopic, fixed errors --- turftopic/dynamic.py | 4 +- turftopic/models/_snmf.py | 5 +- turftopic/models/senstopic.py | 112 ++++++++++++++++++++++++++-------- 3 files changed, 92 insertions(+), 29 deletions(-) diff --git a/turftopic/dynamic.py b/turftopic/dynamic.py index bd15be6..8d2da8f 100644 --- a/turftopic/dynamic.py +++ b/turftopic/dynamic.py @@ -29,8 +29,8 @@ def bin_timestamps( # Have to substract one, else it starts from one return np.digitize(unix_timestamps, unix_bins) - 1, bins else: - # Adding one day, so that the maximum value is still included. - max_timestamp = max(timestamps) + timedelta(days=1) + # Adding one microsecond, so that the maximum value is still included. + max_timestamp = max(timestamps) + timedelta(microseconds=1) unix_bins = np.histogram_bin_edges(unix_timestamps, bins=bins) unix_bins[-1] = max_timestamp.timestamp() bins = [datetime.fromtimestamp(ts) for ts in unix_bins] diff --git a/turftopic/models/_snmf.py b/turftopic/models/_snmf.py index 96bf84e..22b327e 100644 --- a/turftopic/models/_snmf.py +++ b/turftopic/models/_snmf.py @@ -199,13 +199,14 @@ def fit_timeslice(self, X_t: np.ndarray, G_t: np.ndarray): F = update_F(X_t.T, G_t, F=None) return F.T - def transform(self, X: np.ndarray): + def transform(self, X: np.ndarray, F=None): G = init_G( X.T, n_components=self.n_components, random_state=self.random_state, ) - F = self.components_.T + if F is None: + F = self.components_.T update = jit(lambda G: update_G(X.T, G, F, sparsity=self.sparsity)) error_at_init = rec_err(X.T, F, G) prev_error = error_at_init diff --git a/turftopic/models/senstopic.py b/turftopic/models/senstopic.py index b97320f..3a99246 100644 --- a/turftopic/models/senstopic.py +++ b/turftopic/models/senstopic.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta from functools import partial from typing import Literal, Optional, Union @@ -217,7 +217,7 @@ def update_vocabulary(self, raw_documents): set(new_vectorizer.get_feature_names_out()) - set(old_vocab) ) if len(new_vocab) == 0: - return + return [] new_vocab_embeddings = self.encode_documents(new_vocab) self.vocab_embeddings = np.concatenate( [self.vocab_embeddings, new_vocab_embeddings], axis=0 @@ -225,12 +225,38 @@ def update_vocabulary(self, raw_documents): self.vectorizer.get_feature_names_out = lambda: np.array( list(old_vocab) + new_vocab ) + return new_vocab def partial_fit( - self, raw_documents, y=None, embeddings=None, n_new_components="auto" + self, + raw_documents, + y=None, + embeddings=None, + timestamps=None, + n_new_components="auto", ): + if timestamps is not None: + if (getattr(self, "components_", None) is None) or ( + getattr(self, "time_bin_edges", None) is None + ): + return self.fit_transform_dynamic( + raw_documents, + embeddings=embeddings, + timestamps=timestamps, + bins=1, + ) if getattr(self, "components_", None) is None: - return self.fit(raw_documents, embeddings=embeddings) + if timestamps is None: + return self.fit(raw_documents, embeddings=embeddings) + if timestamps is not None: + last_edge = self.time_bin_edges[-1] + is_before = [(ts <= last_edge) for ts in timestamps] + n_before = np.sum(is_before) + if n_before: + raise ValueError( + "When using partial fitting on a dynamic model, all new documents have to be in a new time slice. " + f"Currently there are {n_before} documents from before {last_edge}. Remove these before fitting." + ) console = Console() with console.status("Updating model with new data") as status: if embeddings is None: @@ -253,10 +279,11 @@ def partial_fit( ) self.n_components_ = self.decomposition.n_components doc_topic = self.decomposition.transform(embeddings) - console.log("Updated model") + console.log(f"Updated model with {n_new_components} topics.") status.update("Updating vocabulary") - self.update_vocabulary(raw_documents) - console.log("Updated vocabulary") + new_vocab = self.update_vocabulary(raw_documents) + n_new_vocab = len(new_vocab) + console.log(f"Updated vocabulary with {n_new_vocab} items.") status.update("Estimating term importances") vocab_topic = self.decomposition.transform(self.vocab_embeddings) self.axial_components_ = vocab_topic.T @@ -284,6 +311,36 @@ def partial_fit( self.top_documents.append( [raw_documents[i_top] for i_top in top] ) + if timestamps is not None: + status.update("Updating temporal components.") + self.time_bin_edges.append( + max(timestamps) + timedelta(microseconds=1) + ) + t_components = [] + t_importance = [] + for t_component, t_imp in zip( + self.axial_temporal_components_, self.temporal_importance_ + ): + t_component = np.pad( + t_component, + [(0, n_new_components), (0, n_new_vocab)], + mode="constant", + constant_values=0, + ) + t_imp = np.pad( + t_imp, + (0, n_new_components), + mode="constant", + constant_values=0, + ) + t_components.append(t_component) + t_importance.append(t_imp) + new_imp, new_comp = self._fit_timebin(embeddings, doc_topic) + t_components.append(new_comp) + t_importance.append(new_imp) + self.axial_temporal_components_ = np.stack(t_components) + self.temporal_importance_ = np.stack(t_importance) + self.estimate_components(self.feature_importance) console.log("Model update done.") return self @@ -371,6 +428,13 @@ def fit_transform_multimodal( console.log("Images transformed") return doc_topic + def _fit_timebin(self, t_X, t_dt): + t_imp = t_dt.mean(axis=0) + t_F = self.decomposition.fit_timeslice(t_X, t_dt).T + t_G = self.decomposition.transform(self.vocab_embeddings, F=t_F) + t_components_ = t_G.T + return t_imp, t_components_ + def fit_transform_dynamic( self, raw_documents, @@ -378,9 +442,14 @@ def fit_transform_dynamic( embeddings: Optional[np.ndarray] = None, bins: Union[int, list[datetime]] = 10, ) -> np.ndarray: - document_topic_matrix = self.fit_transform( - raw_documents, embeddings=embeddings - ) + if getattr(self, "components_", None) is None: + document_topic_matrix = self.fit_transform( + raw_documents, embeddings=embeddings + ) + else: + document_topic_matrix = self.transform( + raw_documents, embeddings=embeddings + ) time_labels, self.time_bin_edges = self.bin_timestamps( timestamps, bins ) @@ -392,22 +461,15 @@ def fit_transform_dynamic( dtype=self.components_.dtype, ) self.temporal_importance_ = np.zeros((n_bins, n_comp)) - # doc_topic = np.dot(X, self.components_.T) for i_timebin in np.unique(time_labels): - topic_importances = document_topic_matrix[ - time_labels == i_timebin - ].mean(axis=0) - self.temporal_importance_[i_timebin, :] = topic_importances - t_doc_topic = document_topic_matrix[time_labels == i_timebin] - t_embeddings = self.embeddings[time_labels == i_timebin] - t_components = self.decomposition.fit_timeslice( - t_embeddings, t_doc_topic - ) - ax_t = np.maximum( - self.vocab_embeddings @ np.linalg.pinv(t_components), 0 - ) - self.axial_temporal_components_[i_timebin, :, :] = ax_t.T - self.estimate_components(self.feature_importance) + t_dt = document_topic_matrix[time_labels == i_timebin] + t_X = self.embeddings[time_labels == i_timebin] + t_imp, t_comp = self._fit_timebin(t_X, t_dt) + self.temporal_importance_[i_timebin, :] = t_imp + self.axial_temporal_components_[i_timebin, :, :] = t_comp + self.estimate_components( + self.feature_importance, + ) return document_topic_matrix @property From 0d18d2beb361b2ee82c15b6e6e990c775af5afcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos=C3=98?= Date: Tue, 30 Jun 2026 16:16:29 +0200 Subject: [PATCH 3/3] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1269899..eded9c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ profile = "black" [project] name = "turftopic" -version = "0.26.0" +version = "0.26.1" description = "Topic modeling with contextual representations from sentence transformers." authors = [ { name = "Márton Kardos ", email = "martonkardos@cas.au.dk" }