diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index a60b7acc60b..b35a22a8b66 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -37,6 +37,7 @@ class VideoStats:
     max_height: int
     max_channels: int
     num_instances: int
+    num_total_instances: int
 
     @property
     def output_shape(self):
@@ -132,8 +133,20 @@ def get_stats(self, source_path: str):
             max_height = max(max_height, height)
             max_num_channels = max(max_num_channels, num_channels)
             num_instances += 1
+        num_total_instances = num_instances
+        num_instances = (
+            min(num_instances, self.chunk_size)
+            if self.chunk_size is not None
+            else num_instances
+        )
         return VideoStats(
-            fps, max_length, max_width, max_height, max_num_channels, num_instances
+            fps,
+            max_length,
+            max_width,
+            max_height,
+            max_num_channels,
+            num_instances,
+            num_total_instances,
         )
 
     def estimate_peak_memory_bytes(self) -> dict:
diff --git a/src/main/python/systemds/scuro/drsearch/modality_shared_memory.py b/src/main/python/systemds/scuro/drsearch/modality_shared_memory.py
index a98592b6204..d4092b90cfc 100644
--- a/src/main/python/systemds/scuro/drsearch/modality_shared_memory.py
+++ b/src/main/python/systemds/scuro/drsearch/modality_shared_memory.py
@@ -401,3 +401,26 @@ def add_shared_memory_candidate(data: Any, resident_bytes: int = 0) -> bool:
             return data, shm.name, data_nbytes, resident_bytes
 
     return None, None, 0, resident_bytes
+
+
+_SHARED_MEMORY_WRAPPER_TYPES = (
+    SharedStringList,
+    SharedGroupedArrayList,
+    SharedArrayList,
+    SharedNDArray,
+)
+
+
+def collect_shm_names_from_payload(data: Any) -> List[str]:
+    if data is None:
+        return []
+    if isinstance(data, _SHARED_MEMORY_WRAPPER_TYPES):
+        return [data.shm_name]
+    if hasattr(data, "data"):
+        return collect_shm_names_from_payload(data.data)
+    if isinstance(data, (list, tuple)):
+        names: List[str] = []
+        for item in data:
+            names.extend(collect_shm_names_from_payload(item))
+        return names
+    return []
diff --git a/src/main/python/systemds/scuro/drsearch/node_executor.py b/src/main/python/systemds/scuro/drsearch/node_executor.py
index 418b11fe700..4b9b2acc658 100644
--- a/src/main/python/systemds/scuro/drsearch/node_executor.py
+++ b/src/main/python/systemds/scuro/drsearch/node_executor.py
@@ -23,13 +23,17 @@
 import os
 from multiprocessing import shared_memory
 from systemds.scuro import Modality
-from systemds.scuro.drsearch.modality_shared_memory import add_shared_memory_candidate
+from systemds.scuro.drsearch.modality_shared_memory import (
+    add_shared_memory_candidate,
+    collect_shm_names_from_payload,
+)
 from systemds.scuro.drsearch.node_scheduler import MemoryAwareNodeScheduler
 from systemds.scuro.drsearch.representation_dag import (
     RepresentationDag,
     RepresentationNode,
 )
 
+import threading
 import numpy as np
 from typing import Any, Dict, List, Optional
 import multiprocessing as mp
@@ -85,6 +89,7 @@ def __init__(self):
         self.ref_count = {}
         self.memory_usage_per_node = {}
         self.shared_memory_names = {}
+        self._shm_retain_count: Dict[str, int] = {}
 
     def get(self, node_id: str) -> Any:
         return self.cache[node_id]
@@ -125,21 +130,44 @@ def inc_ref(self, node_id: str):
         self.ref_count[node_id] += 1
 
     def dec_ref(self, node_id: str):
+        if node_id not in self.ref_count:
+            return
         self.ref_count[node_id] -= 1
-        if self.ref_count[node_id] == 0:
-            del self.cache[node_id]
-            del self.ref_count[node_id]
-            del self.memory_usage_per_node[node_id]
-            self._cleanup_shared_memory(node_id)
+        if self.ref_count[node_id] <= 0:
+            self.ref_count[node_id] = 0
+            self._try_cleanup_node(node_id)
 
     def clear(self, node_id: str):
-        if node_id in self.cache:
-            del self.cache[node_id]
         if node_id in self.ref_count:
-            del self.ref_count[node_id]
-        if node_id in self.memory_usage_per_node:
-            del self.memory_usage_per_node[node_id]
-        self._cleanup_shared_memory(node_id)
+            self.ref_count[node_id] = 0
+        self._try_cleanup_node(node_id)
+
+    def retain_shm_names(self, shm_names: List[str]) -> List[str]:
+        retained: List[str] = []
+        for shm_name in shm_names:
+            if not shm_name:
+                continue
+            self._shm_retain_count[shm_name] = (
+                self._shm_retain_count.get(shm_name, 0) + 1
+            )
+            retained.append(shm_name)
+        return retained
+
+    def release_shm_names(self, shm_names: List[str]) -> None:
+        nodes_to_recheck: List[str] = []
+        for shm_name in shm_names:
+            if not shm_name:
+                continue
+            count = self._shm_retain_count.get(shm_name, 0) - 1
+            if count <= 0:
+                self._shm_retain_count.pop(shm_name, None)
+            else:
+                self._shm_retain_count[shm_name] = count
+            for node_id, node_names in self.shared_memory_names.items():
+                if shm_name in node_names and node_id not in nodes_to_recheck:
+                    nodes_to_recheck.append(node_id)
+        for node_id in nodes_to_recheck:
+            self._try_cleanup_node(node_id)
 
     def __len__(self):
         return len(self.cache)
@@ -147,6 +175,20 @@ def __len__(self):
     def get_memory_total_memory_usage(self):
         return sum(self.memory_usage_per_node.values())
 
+    def _shm_names_in_use(self, shm_names: List[str]) -> bool:
+        return any(self._shm_retain_count.get(name, 0) > 0 for name in shm_names)
+
+    def _try_cleanup_node(self, node_id: str) -> None:
+        if self.ref_count.get(node_id, 0) > 0:
+            return
+        shm_names = self.shared_memory_names.get(node_id, [])
+        if shm_names and self._shm_names_in_use(shm_names):
+            return
+        self.cache.pop(node_id, None)
+        self.ref_count.pop(node_id, None)
+        self.memory_usage_per_node.pop(node_id, None)
+        self._cleanup_shared_memory(node_id)
+
     def _cleanup_shared_memory(self, node_id: str):
         names = self.shared_memory_names.pop(node_id, [])
         for shm_name in names:
@@ -160,10 +202,37 @@ def _cleanup_shared_memory(self, node_id: str):
                 pass
 
     def cleanup_all(self):
+        self._shm_retain_count.clear()
         for node_id in list(self.shared_memory_names.keys()):
+            self.ref_count.pop(node_id, None)
+            self.cache.pop(node_id, None)
+            self.memory_usage_per_node.pop(node_id, None)
             self._cleanup_shared_memory(node_id)
 
 
+def _execute_multiple_reps_for_leaf_dependencies(
+    nodes: List[RepresentationNode],
+    modalities: List[Modality],
+    gpu_id: Optional[int],
+):
+    representations = []
+    node_id_by_representation = {}
+    for node in nodes:
+        operation = node.operation(params=node.parameters)
+        if hasattr(operation, "gpu_id"):
+            operation.gpu_id = gpu_id
+        representations.append(operation)
+        node_id_by_representation[operation.name] = node.node_id
+
+    modality_results = modalities[0].apply_representations(
+        representations, parallel=True
+    )
+    return {
+        "results": modality_results,
+        "node_id_by_representation": node_id_by_representation,
+    }
+
+
 def _execute_node_worker(node, input_mods, task, rep_cache, gpu_id):
     if gpu_id is not None:
         device = torch.device(f"cuda:{gpu_id}")
@@ -205,14 +274,19 @@ def _run_node_op():
                 )
             return input_mods[0].combine(input_mods[1:], fusion_op)
 
-    result, peak_delta_bytes, peak_abs_rss = measure_peak_rss_during(
-        _run_node_op,
-        sample_s=0.01,
-    )
-
-    gpu_peak_bytes = (
-        torch.cuda.max_memory_allocated(device) if gpu_id is not None else 0
-    )
+    gpu_peak_bytes = -1
+    peak_delta_bytes = -1
+    peak_abs_rss = -1
+    if DEBUG:
+        result, peak_delta_bytes, peak_abs_rss = measure_peak_rss_during(
+            _run_node_op,
+            sample_s=0.01,
+        )
+        gpu_peak_bytes = (
+            torch.cuda.max_memory_allocated(device) if gpu_id is not None else 0
+        )
+    else:
+        result = _run_node_op()
 
     return {
         "result": result,
@@ -224,7 +298,10 @@ def _run_node_op():
 
 
 def _execute_task_worker(
-    task_node_id: str, task: Any, data: Any, gpu_id: Optional[int]
+    task_node_id: str,
+    task: Any,
+    data: Any,
+    gpu_id: Optional[int],
 ) -> Dict[str, Any]:
 
     if DEBUG:
@@ -243,17 +320,23 @@ def _run_task():
         end = time.perf_counter()
         return scores, end - start
 
-    gpu_peak_bytes = (
-        torch.cuda.max_memory_allocated(device) if gpu_id is not None else 0
-    )
-    result, peak_delta_bytes, peak_abs_rss = measure_peak_rss_during(
-        _run_task,
-        sample_s=0.01,
-    )
+    gpu_peak_bytes = -1
+    peak_delta_bytes = -1
     if DEBUG:
+        gpu_peak_bytes = (
+            torch.cuda.max_memory_allocated(device) if gpu_id is not None else 0
+        )
+        result, peak_delta_bytes, peak_abs_rss = measure_peak_rss_during(
+            _run_task,
+            sample_s=0.01,
+        )
+
         print(
             f"Task {task_node_id} has a CPU peak memory usage of {peak_delta_bytes/1024**3:.2f} GB, and a GPU peak memory usage of {gpu_peak_bytes/1024**3:.2f} GB"
         )
+    else:
+        result = _run_task()
+
     return {
         "scores": result[0],
         "task_time": result[1],
@@ -303,6 +386,31 @@ def __init__(
             resume=False,
         )
 
+    def _shm_names_for_submit(
+        self, parent_node_ids: List[str], payload_data: Any
+    ) -> List[str]:
+        names: List[str] = []
+        for parent_id in parent_node_ids or []:
+            names.extend(self.result_cache.shared_memory_names.get(parent_id, []))
+        if parent_node_ids:
+            names.extend(collect_shm_names_from_payload(payload_data))
+        else:
+            names.extend(getattr(self, "_leaf_shm_names", []))
+            names.extend(collect_shm_names_from_payload(payload_data))
+        # preserve order, drop duplicates
+        return list(dict.fromkeys(names))
+
+    def _retain_for_submit(
+        self, parent_node_ids: List[str], payload_data: Any
+    ) -> List[str]:
+        return self.result_cache.retain_shm_names(
+            self._shm_names_for_submit(parent_node_ids, payload_data)
+        )
+
+    def _release_for_future(self, retained_shm_names: List[str]) -> None:
+        if retained_shm_names:
+            self.result_cache.release_shm_names(retained_shm_names)
+
     def run(self) -> None:
         task_results = {}
         memory_usage_data = {}
@@ -314,6 +422,22 @@ def run(self) -> None:
             max_workers=self.max_num_workers, mp_context=ctx
         ) as executor:
             future_to_node_id = {}
+            future_to_retained_shm: Dict[Any, List[str]] = {}
+
+            def submit_nodes_with_leaf_dependencies(node_ids: List[str]):
+                nodes = [self.scheduler.mapping[node_id] for node_id in node_ids]
+                gpu_id = nodes[0].gpu_id
+                self.scheduler.move_to_running(node_ids)
+
+                retained = self._retain_for_submit([], self.modalities[0].data)
+                future = executor.submit(
+                    _execute_multiple_reps_for_leaf_dependencies,
+                    nodes,
+                    self.modalities,
+                    gpu_id,
+                )
+                future_to_node_id[future] = node_ids
+                future_to_retained_shm[future] = retained
 
             def submit_node(node_id: str):
                 node = self.scheduler.mapping[node_id]
@@ -325,6 +449,7 @@ def submit_node(node_id: str):
                         self.result_cache.get(parent_node_id)
                         for parent_node_id in parent_node_ids
                     ]
+
                 if self._is_task_node(node):
                     task_result = ResultEntry(
                         dag=self._get_dag_from_node_ids(node_id),
@@ -332,31 +457,42 @@ def submit_node(node_id: str):
                     )
                     task_results[node_id] = task_result
                     task_idx = int(node.parameters.get("_task_idx", 0))
+                    payload_data = (
+                        self.modalities[0].data
+                        if parent_results is None
+                        else parent_results[0].data
+                    )
+                    retained = self._retain_for_submit(parent_node_ids, payload_data)
                     future = executor.submit(
                         _execute_task_worker,
                         node_id,
                         self.tasks[task_idx],
-                        (
-                            self.modalities[0].data
-                            if parent_results is None
-                            else parent_results[0].data
-                        ),
+                        payload_data,
                         gpu_id,
                     )
                 else:
+                    payload_data = (
+                        self.modalities if parent_results is None else parent_results
+                    )
+                    retained = self._retain_for_submit(parent_node_ids, payload_data)
                     future = executor.submit(
                         _execute_node_worker,
                         node,
-                        self.modalities if parent_results is None else parent_results,
+                        payload_data,
                         None,
                         None,
                         gpu_id,
                     )
                 self.scheduler.move_to_running(node_id)
                 future_to_node_id[future] = node_id
+                future_to_retained_shm[future] = retained
 
             def submit_new_ready_nodes():
-                for node_id in self.scheduler.get_runnable().copy():
+                ready_nodes = self.scheduler.get_runnable().copy()
+                for node_id in ready_nodes:
+                    if isinstance(node_id, list):
+                        submit_nodes_with_leaf_dependencies(node_id)
+                        continue
                     submit_node(node_id)
 
             submit_new_ready_nodes()
@@ -372,94 +508,127 @@ def submit_new_ready_nodes():
 
                 for future in done:
                     node_id = future_to_node_id.pop(future)
+                    retained_shm = future_to_retained_shm.pop(future, [])
                     try:
                         result = future.result()
-                    except Exception as e:
-                        err_cls = type(e)
-                        err_mod = err_cls.__module__
-                        if err_mod.startswith("torch"):
-                            torch.cuda.empty_cache()
-                        print(f"Error executing node {node_id}: {e}")
-                        self.scheduler.add_failed_node(node_id)
-                        continue
-
-                    peak_bytes = result["peak_bytes"]
-                    gpu_peak_bytes = result["gpu_peak_bytes"]
-
-                    node = self.scheduler.mapping[node_id]
-                    if self._is_task_node(node):
-                        task_results[node_id].task_time = result["task_time"]
-                        task_results[node_id].train_score = result["scores"][
-                            0
-                        ].average_scores
-                        task_results[node_id].val_score = result["scores"][
-                            1
-                        ].average_scores
-                        task_results[node_id].test_score = result["scores"][
-                            2
-                        ].average_scores
-                        if self.enable_checkpointing:
-                            self.checkpoint_manager.increment(node_id)
-                            self.checkpoint_manager.checkpoint_if_due(task_results)
-                            self._checkpoint_memory_usage(
-                                node_id,
-                                peak_bytes,
-                                gpu_peak_bytes,
-                                "task",
-                                memory_usage_data,
-                                None,
-                            )
 
-                        parent_node_ids = self.scheduler.get_valid_parents(node_id)
-                        if len(parent_node_ids) > 0:
+                        if isinstance(node_id, list):
+                            results = result["results"]
+                            node_id_by_representation = result[
+                                "node_id_by_representation"
+                            ]
+                            for (
+                                representation,
+                                transformed_modality,
+                            ) in results.items():
+                                batch_node_id = node_id_by_representation[
+                                    representation
+                                ]
+                                self._handle_modality_result(
+                                    transformed_modality,
+                                    batch_node_id,
+                                    None,
+                                    None,
+                                    memory_usage_data,
+                                    representation,
+                                )
+                            submit_new_ready_nodes()
+                            continue
+
+                        peak_bytes = result["peak_bytes"]
+                        gpu_peak_bytes = result["gpu_peak_bytes"]
+                        node = self.scheduler.mapping[node_id]
+                        if self._is_task_node(node):
+                            task_results[node_id].task_time = result["task_time"]
+                            task_results[node_id].train_score = result["scores"][
+                                0
+                            ].average_scores
+                            task_results[node_id].val_score = result["scores"][
+                                1
+                            ].average_scores
+                            task_results[node_id].test_score = result["scores"][
+                                2
+                            ].average_scores
+                            if self.enable_checkpointing:
+                                self.checkpoint_manager.increment(node_id)
+                                self.checkpoint_manager.checkpoint_if_due(task_results)
+                                self._checkpoint_memory_usage(
+                                    node_id,
+                                    peak_bytes,
+                                    gpu_peak_bytes,
+                                    "task",
+                                    memory_usage_data,
+                                    None,
+                                )
+
+                            parent_node_ids = self.scheduler.get_valid_parents(node_id)
                             for parent_node_id in parent_node_ids:
                                 self.result_cache.dec_ref(parent_node_id)
-                                if (
-                                    parent_node_id in self.result_cache.ref_count
-                                    and self.result_cache.ref_count[parent_node_id] == 0
-                                ):
-                                    self.result_cache.clear(parent_node_id)
-                        self.scheduler.complete_node(node_id)
-
-                    else:
-                        transformed_modality = result["result"]
-                        actual_stats = self._infer_actual_output_stats(
-                            transformed_modality
-                        )
-                        estimated_stats = self.scheduler.node_stats.get(node_id)
-
-                        if actual_stats is not None and (
-                            estimated_stats is None
-                            or not getattr(
-                                estimated_stats, "output_shape_is_known", True
-                            )
-                        ):
-                            self.scheduler.update_node_stats_and_reestimate_descendants(
-                                node_id, actual_stats
-                            )
-                        if self.enable_checkpointing:
-                            self._checkpoint_memory_usage(
+                            self.scheduler.complete_node(node_id)
+                        else:
+                            transformed_modality = result["result"]
+                            self._handle_modality_result(
+                                transformed_modality,
                                 node_id,
                                 peak_bytes,
                                 gpu_peak_bytes,
-                                result["operation_name"],
                                 memory_usage_data,
-                                transformed_modality.data,
+                                result["operation_name"],
                             )
-                        before_bytes = self.result_cache.get_memory_total_memory_usage()
-                        self._manage_result_cache(node_id, transformed_modality)
-                        after_bytes = self.result_cache.get_memory_total_memory_usage()
-                        self.scheduler.update_cpu_memory_in_use(
-                            after_bytes - before_bytes
-                        )
-                        self.scheduler.complete_node(node_id)
 
-                    submit_new_ready_nodes()
-        assert len(self.result_cache.ref_count.keys()) == 0
+                        submit_new_ready_nodes()
+                    except Exception:
+                        parent_node_ids = []
+                        if not isinstance(node_id, list):
+                            parent_node_ids = self.scheduler.get_valid_parents(node_id)
+                        for parent_node_id in parent_node_ids:
+                            self.result_cache.dec_ref(parent_node_id)
+                        if not isinstance(node_id, list):
+                            self.scheduler.add_failed_node(node_id)
+                        raise
+                    finally:
+                        self._release_for_future(retained_shm)
+
+        assert not self.result_cache.ref_count
+        assert not self.result_cache._shm_retain_count
 
         self.result_cache.cleanup_all()
         self._cleanup_leaf_shared_memory()
-        return list(task_results.values())
+        return {"task_results": list(task_results.values())}
+
+    def _handle_modality_result(
+        self,
+        transformed_modality: Any,
+        node_id: str,
+        peak_bytes: int,
+        gpu_peak_bytes: int,
+        memory_usage_data,
+        operation_name: str,
+    ):
+        actual_stats = self._infer_actual_output_stats(transformed_modality)
+        estimated_stats = self.scheduler.node_stats.get(node_id)
+
+        if actual_stats is not None and (
+            estimated_stats is None
+            or not getattr(estimated_stats, "output_shape_is_known", True)
+        ):
+            self.scheduler.update_node_stats_and_reestimate_descendants(
+                node_id, actual_stats
+            )
+        if self.enable_checkpointing:
+            self._checkpoint_memory_usage(
+                node_id,
+                peak_bytes,
+                gpu_peak_bytes,
+                operation_name,
+                memory_usage_data,
+                transformed_modality.data,
+            )
+        before_bytes = self.result_cache.get_memory_total_memory_usage()
+        self._manage_result_cache(node_id, transformed_modality)
+        after_bytes = self.result_cache.get_memory_total_memory_usage()
+        self.scheduler.update_cpu_memory_in_use(after_bytes - before_bytes)
+        self.scheduler.complete_node(node_id)
 
     def _materialize_leaf_modalities_in_shared_memory(self):
         self._leaf_shm_names = []
@@ -587,22 +756,14 @@ def _infer_actual_output_stats(
 
     def _manage_result_cache(self, node_id: str, result: Any):
         parent_node_ids = self.scheduler.get_valid_parents(node_id)
-        if len(parent_node_ids) > 0:
-            for parent_node_id in parent_node_ids:
-                self.result_cache.dec_ref(parent_node_id)
+        for parent_node_id in parent_node_ids:
+            self.result_cache.dec_ref(parent_node_id)
 
         if self.scheduler.get_children(node_id):
             for _ in self.scheduler.get_children(node_id):
                 self.result_cache.inc_ref(node_id)
             self.result_cache.add_result(node_id, result)
 
-        for parent_node_id in parent_node_ids:
-            if (
-                parent_node_id in self.result_cache.ref_count
-                and self.result_cache.ref_count[parent_node_id] == 0
-            ):
-                self.result_cache.clear(parent_node_id)
-
     def _get_nodes_by_ids(self, nodes_ids: List[str]) -> List[RepresentationNode]:
         return [self.scheduler.mapping[node_id] for node_id in nodes_ids]
 
diff --git a/src/main/python/systemds/scuro/drsearch/node_scheduler.py b/src/main/python/systemds/scuro/drsearch/node_scheduler.py
index ef3ccc844e5..209f4503860 100644
--- a/src/main/python/systemds/scuro/drsearch/node_scheduler.py
+++ b/src/main/python/systemds/scuro/drsearch/node_scheduler.py
@@ -19,7 +19,7 @@
 #
 # -------------------------------------------------------------
 from __future__ import annotations
-
+import re
 from typing import List, Dict, Optional, Any
 from collections import defaultdict, deque
 from collections import deque
@@ -93,8 +93,25 @@ def get_runnable(self) -> List[RepresentationNode]:
             ok, gpu_id = self._check_memory_constraints(node)
             if ok:
                 self.mapping[node].gpu_id = gpu_id
-                self.ready_nodes.append(node)
                 self._reserve_memory(node, gpu_id)
+                self.ready_nodes.append(node)
+        contains_leaf = []
+        for node in self.ready_nodes:
+            if any(re.fullmatch(r"leaf_\d+", i) for i in self.mapping[node].inputs):
+                for mod in self.modalities:
+                    if (
+                        mod.modality_id
+                        == self.mapping[self.mapping[node].inputs[0]].modality_id
+                    ):
+                        if mod.data_loader.chunk_size is not None:
+                            contains_leaf.append(node)
+                            break
+
+        for node in contains_leaf:
+            self.ready_nodes.remove(node)
+
+        if len(contains_leaf) > 0:
+            self.ready_nodes.append(contains_leaf)
         return self.ready_nodes
 
     def _get_runnable_nodes(self) -> List[str]:
@@ -128,9 +145,12 @@ def add_failed_node(self, node_id: str):
 
         self._release_memory(node_id, self.mapping[node_id].gpu_id)
 
-    def move_to_running(self, node_id: str):
+    def move_to_running(self, node_id: str | list):
         self.ready_nodes.remove(node_id)
-        self.running_nodes.append(node_id)
+        if isinstance(node_id, list):
+            self.running_nodes.extend(node_id)
+        else:
+            self.running_nodes.append(node_id)
 
     def complete_node(self, node_id: str):
         self.running_nodes.remove(node_id)
diff --git a/src/main/python/systemds/scuro/drsearch/representation_dag.py b/src/main/python/systemds/scuro/drsearch/representation_dag.py
index 099732e46df..b1d5835ad3f 100644
--- a/src/main/python/systemds/scuro/drsearch/representation_dag.py
+++ b/src/main/python/systemds/scuro/drsearch/representation_dag.py
@@ -37,6 +37,17 @@
 from collections import OrderedDict, defaultdict, deque
 
 
+def pushdown_aggregation_for_node(
+    node_parameters: Optional[Dict[str, Any]],
+) -> Optional[AggregatedRepresentation]:
+    if not node_parameters:
+        return None
+    pushdown_config = node_parameters.get("_pushdown_aggregation")
+    if pushdown_config is None:
+        return None
+    return AggregatedRepresentation(params=pushdown_config)
+
+
 class LRUCache:
     def __init__(self, max_size: int = 256):
         self.max_size = max_size
@@ -231,6 +242,23 @@ def _compute_node_signature(self, node, input_sig_tuple) -> Hashable:
         params_items = tuple(sorted((node.parameters or {}).items()))
         return ("op", op_cls, params_items, input_sig_tuple)
 
+    def get_represntation_names(self) -> str:
+        representation_names = []
+        visited = set()
+
+        def visit_node(node_id):
+            if node_id in visited:
+                return
+            node = self.get_node_by_id(node_id)
+            for input_id in node.inputs:
+                visit_node(input_id)
+            visited.add(node_id)
+            if node.operation is not None:
+                representation_names.append(node.operation().name)
+
+        visit_node(self.root_node_id)
+        return " -> ".join(representation_names)
+
     def execute(
         self,
         modalities: List[Modality],
@@ -282,9 +310,9 @@ def execute_node(node_id: str, task) -> TransformedModality:
                             if rep_cache is not None:
                                 result = rep_cache[node_operation.name]
                             else:
-                                # Compute the representation
+                                agg = pushdown_aggregation_for_node(node.parameters)
                                 result = input_mods[0].apply_representation(
-                                    node_operation
+                                    node_operation, aggregation=agg
                                 )
                     else:
                         # It's a fusion operation
@@ -314,8 +342,10 @@ def execute_node(node_id: str, task) -> TransformedModality:
                         if rep_cache is not None:
                             result = rep_cache[node_operation.name]
                         else:
-                            # Compute the representation
-                            result = input_mods[0].apply_representation(node_operation)
+                            agg = pushdown_aggregation_for_node(node.parameters)
+                            result = input_mods[0].apply_representation(
+                                node_operation, aggregation=agg
+                            )
                 else:
                     # It's a fusion operation
                     fusion_op = node_operation
@@ -387,13 +417,17 @@ def get_modality_by_id_and_instance_id(
     modalities: List[Modality], modality_id: int, instance_id: int
 ):
     counter = 0
+    modality_per_id = {}
     for modality in modalities:
-        if modality.modality_id == modality_id:
-            if counter == instance_id or instance_id == -1:
-                return modality
-            else:
-                counter += 1
-    return None
+        if modality.modality_id not in modality_per_id:
+            modality_per_id[modality.modality_id] = []
+        modality_per_id[modality.modality_id].append(modality)
+    if modality_id not in modality_per_id:
+        return None
+    if instance_id == -1 or len(modality_per_id[modality_id]) == 1:
+        return modality_per_id[modality_id][0]
+    else:
+        return modality_per_id[modality_id][instance_id]
 
 
 class RepresentationDAGBuilder:
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 215f27929f3..8dc2e1a082f 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -313,6 +313,37 @@ def _process_modality(self, modality, skip_remaining: int = 0, scheduler=None):
             metric_name=self.metric_name,
         )
 
+        dags, dags_with_pushdown, expanded_dags_with_task_roots = (
+            self._build_execution_dags_for_modality(modality, skip_remaining)
+        )
+
+        node_executor = NodeExecutor(
+            expanded_dags_with_task_roots,
+            [modality],
+            self.tasks,
+            self._checkpoint_manager,
+            self.max_num_workers,
+            self.result_path,
+            enable_checkpointing=self.enable_checkpointing,
+        )
+
+        exec_out = node_executor.run()
+        task_results = exec_out["task_results"]
+
+        for task_result in task_results:
+            local_results.add_task_result(task_result, dags)
+
+        if self.save_all_results:
+            timestr = time.strftime("%Y%m%d-%H%M%S")
+            file_name = f"{modality.modality_id}_unimodal_results_{timestr}.pkl"
+            with open(file_name, "wb") as f:
+                pickle.dump(local_results.results, f)
+
+        return local_results
+
+    def _build_execution_dags_for_modality(
+        self, modality: Modality, skip_remaining: int = 0
+    ) -> tuple:
         modality_specific_operators = self._get_modality_operators(
             modality.modality_type
         )
@@ -341,28 +372,7 @@ def _process_modality(self, modality, skip_remaining: int = 0, scheduler=None):
         expanded_dags_with_task_roots = self._expand_dags_with_task_roots(
             dags_with_pushdown
         )
-
-        node_executor = NodeExecutor(
-            expanded_dags_with_task_roots,
-            [modality],
-            self.tasks,
-            self._checkpoint_manager,
-            self.max_num_workers,
-            self.result_path,
-            enable_checkpointing=self.enable_checkpointing,
-        )
-        task_results = node_executor.run()
-
-        for task_result in task_results:
-            local_results.add_task_result(task_result, dags)
-
-        if self.save_all_results:
-            timestr = time.strftime("%Y%m%d-%H%M%S")
-            file_name = f"{modality.modality_id}_unimodal_results_{timestr}.pkl"
-            with open(file_name, "wb") as f:
-                pickle.dump(local_results.results, f)
-
-        return local_results
+        return dags, dags_with_pushdown, expanded_dags_with_task_roots
 
     def _merge_results(self, local_results):
         for modality_id in local_results.results:
@@ -498,15 +508,22 @@ def _build_modality_dag(
         return dags
 
     def _aggregation_needed(self, dag: RepresentationDag) -> bool:
+        input_stats = {}
+        # TODO: adapt this to the fusion of multiple modalities, list of input stats needed
         for modality in self.modalities:
             if modality.modality_id == dag.nodes[0].modality_id:
-                last_stats = modality.stats
+                input_stats[dag.nodes[0].node_id] = modality.stats
                 break
         for node in dag.nodes[1:]:
-            last_stats = node.operation(params=node.parameters).get_output_stats(
-                last_stats
+            previous_stats = [
+                input_stats.get(input_node_id, None) for input_node_id in node.inputs
+            ]
+            current_stats = node.operation(params=node.parameters).get_output_stats(
+                previous_stats if len(previous_stats) > 1 else previous_stats[0]
             )
-        return len(last_stats.output_shape) > 1
+            input_stats[node.node_id] = current_stats
+
+        return len(input_stats.get(dag.root_node_id, None).output_shape) > 1
 
     def add_aggregation_operator(self, builder, dags):
         new_dags = []
@@ -675,7 +692,12 @@ def print_results(self):
                     print(f"{modality}_{task_name}: {entry}")
 
     def get_k_best_results(
-        self, modality, task, performance_metric_name, prune_cache=False
+        self,
+        modality,
+        task,
+        performance_metric_name,
+        prune_cache=False,
+        cache_needed=True,
     ):
         """
         Get the k best results for the given modality
@@ -693,36 +715,38 @@ def get_k_best_results(
 
         results = results[: self.k]
         sorted_indices = sorted_indices[: self.k]
-        task_cache = self.cache.get(modality.modality_id, {}).get(task.model.name, None)
-        if not task_cache:
-            cache = []
-            for result in results:
-                if result.dag.nodes[-1].parameters.get("_node_kind", False) == "task":
-                    dag = copy.deepcopy(result.dag)
-                    dag.nodes = dag.nodes[:-1]
-                    dag.root_node_id = dag.nodes[-1].node_id
-                    cache.append(dag.execute([modality]))
-
-        elif isinstance(task_cache, list):
-            cache = task_cache
-        else:
-            cache_items = list(task_cache.items()) if task_cache else []
-            cache = [cache_items[i][1] for i in sorted_indices if i < len(cache_items)]
-
-        if prune_cache:
-            # Note: in case the unimodal results are loaded from a file, we need to initialize the cache for the modality and task
-            if modality.modality_id not in self.operator_performance.cache:
-                self.operator_performance.cache[modality.modality_id] = {}
-            if (
-                task.model.name
-                not in self.operator_performance.cache[modality.modality_id]
-            ):
+        cache = []
+        if cache_needed:
+            task_cache = self.cache.get(modality.modality_id, {}).get(
+                task.model.name, None
+            )
+            if not task_cache:
+                cache = []
+                for result in results:
+                    cache.append(result.dag.execute([modality]))
+
+            elif isinstance(task_cache, list):
+                cache = task_cache
+            else:
+                cache_items = list(task_cache.items()) if task_cache else []
+                cache = [
+                    cache_items[i][1] for i in sorted_indices if i < len(cache_items)
+                ]
+
+            if prune_cache:
+                # Note: in case the unimodal results are loaded from a file, we need to initialize the cache for the modality and task
+                if modality.modality_id not in self.operator_performance.cache:
+                    self.operator_performance.cache[modality.modality_id] = {}
+                if (
+                    task.model.name
+                    not in self.operator_performance.cache[modality.modality_id]
+                ):
+                    self.operator_performance.cache[modality.modality_id][
+                        task.model.name
+                    ] = {}
                 self.operator_performance.cache[modality.modality_id][
                     task.model.name
-                ] = {}
-            self.operator_performance.cache[modality.modality_id][
-                task.model.name
-            ] = cache
+                ] = cache
 
         return results, cache
 
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 84204ac570d..0535c64bcee 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -18,6 +18,7 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from concurrent.futures import ThreadPoolExecutor, as_completed
 import gc
 import time
 import numpy as np
@@ -135,7 +136,7 @@ def aggregate(self, aggregation_function):
         if self.data is None:
             raise Exception("Data is None")
 
-    def apply_representations(self, representations, aggregation=None):
+    def apply_representations(self, representations, aggregation=None, parallel=False):
         """
         Applies a list of representations to the modality. Specifically, it applies the representations to the modality in a chunked manner.
         :param representations: List of representations to apply
@@ -158,20 +159,29 @@ def apply_representations(self, representations, aggregation=None):
             time.time()
         )  # TODO: should be repalced in unimodal_representation.transform
         if self.data_loader.chunk_size:
-            for _ in self.iter_raw_data_chunks(reset=True):
-                for representation in representations:
-                    transformed_chunk = representation.transform(self)
-                    transformed_modalities_per_representation[
-                        representation.name
-                    ].data.extend(transformed_chunk.data)
-                    transformed_modalities_per_representation[
-                        representation.name
-                    ].metadata.extend(transformed_chunk.metadata)
-                    for d in transformed_chunk.data:
-                        original_lengths_per_representation[representation.name].append(
-                            d.shape[0]
-                        )
-
+            with ThreadPoolExecutor(
+                max_workers=len(representations) if parallel else 1
+            ) as executor:
+                time_s = time.time()
+                for _ in self.iter_raw_data_chunks(reset=True):
+                    representations_futures = {}
+                    for representation in representations:
+                        future = executor.submit(representation.transform, self)
+                        representations_futures[future] = representation.name
+                    for future in as_completed(representations_futures.keys()):
+                        representation_name = representations_futures.get(future)
+                        transformed_chunk = future.result()
+                        transformed_modalities_per_representation[
+                            representation_name
+                        ].data.extend(transformed_chunk.data)
+                        transformed_modalities_per_representation[
+                            representation_name
+                        ].metadata.extend(transformed_chunk.metadata)
+                        for d in transformed_chunk.data:
+                            original_lengths_per_representation[
+                                representation_name
+                            ].append(d.shape[0])
+                print(f"Time for transforming data chunks: {time.time() - time_s}")
         else:
             if not self.has_data():
                 self.extract_raw_data()
diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py
index 518cc1eb5dc..2c880686b11 100644
--- a/src/main/python/systemds/scuro/representations/clip.py
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -21,6 +21,7 @@
 import numpy as np
 from torchvision import transforms
 
+from systemds.scuro.dataloader.video_loader import VideoStats
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.representation import RepresentationStats
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -48,14 +49,20 @@
 
 @register_representation([ModalityType.VIDEO, ModalityType.IMAGE])
 class CLIPVisual(UnimodalRepresentation):
-    def __init__(self, output_file=None, batch_size=32, params=None):
-        parameters = {}
+    def __init__(self, output_file=None, batch_size=32, layer_name="", params=None):
+        parameters = self._get_parameters()
         super().__init__("CLIPVisual", ModalityType.EMBEDDING, parameters)
         self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        if params is not None:
+            self.batch_size = int(params.get("batch_size", batch_size))
+            self.layer_name = params.get("layer_name", layer_name)
+        else:
+            self.batch_size = batch_size
+            self.layer_name = layer_name
         self.output_file = output_file
+
         self.data_type = torch.float32
-        self.batch_size = batch_size
         self.gpu_id = None
         self.device = get_device()
 
@@ -68,11 +75,42 @@ def gpu_id(self, gpu_id):
         self._gpu_id = gpu_id
         self.device = get_device(gpu_id)
 
+    def _get_parameters(self):
+        parameters = {
+            "batch_size": [1, 2, 4, 8, 16, 32, 64, 128],
+            "layer_name": [
+                "",
+                "encoder.layers.0.layer_norm2",
+                "encoder.layers.1.layer_norm2",
+                "encoder.layers.2.layer_norm2",
+                "encoder.layers.3.layer_norm2",
+                "encoder.layers.4.layer_norm2",
+                "encoder.layers.5.layer_norm2",
+                "encoder.layers.6.layer_norm2",
+                "encoder.layers.7.layer_norm2",
+                "encoder.layers.8.layer_norm2",
+                "encoder.layers.9.layer_norm2",
+                "encoder.layers.10.layer_norm2",
+                "encoder.layers.11.layer_norm2",
+                "post_layernorm",
+            ],
+        }
+
+        return parameters
+
     def estimate_output_memory_bytes(self, input_stats) -> int:
         return input_stats.num_instances * 512 * self.data_type.itemsize
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
-        if not isinstance(input_stats, RepresentationStats):
+        if isinstance(input_stats, VideoStats):
+            return RepresentationStats(
+                input_stats.num_instances,
+                (
+                    input_stats.max_length,
+                    512,
+                ),
+            )
+        elif not isinstance(input_stats, RepresentationStats):
             return RepresentationStats(input_stats.num_instances, (512,))
         else:
             return RepresentationStats(
@@ -177,6 +215,21 @@ def transform(self, modality, aggregation=None):
             self.model = self.model.to(self.data_type)
 
         self.model = self.model.to(self.device)
+        self.clip_output = None
+
+        def get_activation(name):
+            def hook(model, input, output):
+                self.clip_output = (
+                    output[0].detach() if isinstance(output, tuple) else output.detach()
+                )
+
+            return hook
+
+        if self.layer_name != "":
+            for name, layer in self.model.vision_model.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_activation(name))
+                    break
 
         embeddings = self.create_visual_embeddings(modality)
 
@@ -212,9 +265,14 @@ def create_visual_embeddings(self, modality):
                 inputs.to(self.device)
 
                 with torch.no_grad():
-                    output = self.model.get_image_features(**inputs)
-                if len(output.shape) > 2:
-                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
+                    if self.layer_name != "":
+                        _ = self.model.vision_model(**inputs)
+                        output = self.clip_output
+                    else:
+                        output = self.model.get_image_features(**inputs)
+
+                output = self._pool_visual_output(output)
+
                 embeddings.extend(
                     torch.flatten(output, 1)
                     .detach()
@@ -241,13 +299,13 @@ def create_visual_embeddings(self, modality):
                 )
                 inputs.to(self.device)
                 with torch.no_grad():
-                    output = self.model.get_image_features(**inputs)
+                    if self.layer_name != "":
+                        _ = self.model.vision_model(**inputs)
+                        output = self.clip_output
+                    else:
+                        output = self.model.get_image_features(**inputs)
 
-                if hasattr(output, "pooler_output"):
-                    output = output.pooler_output
-
-                if len(output.shape) > 2:
-                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
+                output = self._pool_visual_output(output)
 
                 embeddings[id].extend(
                     torch.flatten(output, 1)
@@ -261,13 +319,28 @@ def create_visual_embeddings(self, modality):
             embeddings[id] = np.array(embeddings[id])
         return list(embeddings.values())
 
+    def _pool_visual_output(self, output: torch.Tensor) -> torch.Tensor:
+        if output.ndim == 4:
+            output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))
+            return torch.flatten(output, 1)
+        if output.ndim == 3:
+            return output.mean(dim=1)
+        if output.ndim == 2:
+            return output
+        raise ValueError(f"Unexpected CLIP visual output shape: {tuple(output.shape)}")
+
 
 @register_representation(ModalityType.TEXT)
 class CLIPText(UnimodalRepresentation):
-    def __init__(self, output_file=None, batch_size=32, params=None):
-        self.batch_size = batch_size
+    def __init__(self, output_file=None, batch_size=32, layer_name="", params=None):
+        if params is not None:
+            self.batch_size = int(params.get("batch_size", batch_size))
+            self.layer_name = params.get("layer_name", layer_name)
+        else:
+            self.batch_size = batch_size
+            self.layer_name = layer_name
         self.max_seq_length = 77
-        parameters = {"batch_size": [1, 2, 4, 8, 16, 32, 64, 128]}
+        parameters = self._get_parameters()
 
         super().__init__("CLIPText", ModalityType.EMBEDDING, parameters)
         self.model = None
@@ -295,6 +368,29 @@ def estimate_output_memory_bytes(self, input_stats) -> int:
             input_stats.num_instances * np.prod(output_stats) * self.data_type.itemsize
         )
 
+    def _get_parameters(self):
+        parameters = {
+            "batch_size": [1, 2, 4, 8, 16, 32, 64, 128],
+            "layer_name": [
+                "",
+                "encoder.layers.0.layer_norm2",
+                "encoder.layers.1.layer_norm2",
+                "encoder.layers.2.layer_norm2",
+                "encoder.layers.3.layer_norm2",
+                "encoder.layers.4.layer_norm2",
+                "encoder.layers.5.layer_norm2",
+                "encoder.layers.6.layer_norm2",
+                "encoder.layers.7.layer_norm2",
+                "encoder.layers.8.layer_norm2",
+                "encoder.layers.9.layer_norm2",
+                "encoder.layers.10.layer_norm2",
+                "encoder.layers.11.layer_norm2",
+                "final_layer_norm",
+            ],
+        }
+
+        return parameters
+
     def get_output_stats(self, input_stats) -> RepresentationStats:
         if not isinstance(input_stats, RepresentationStats):
             self.stats = RepresentationStats(
@@ -379,6 +475,21 @@ def transform(self, modality, aggregation=None):
         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
         self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         self.model = self.model.to(self.device)
+        self.clip_output = None
+
+        def get_activation(name):
+            def hook(model, input, output):
+                self.clip_output = (
+                    output[0].detach() if isinstance(output, tuple) else output.detach()
+                )
+
+            return hook
+
+        if self.layer_name != "":
+            for name, layer in self.model.text_model.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_activation(name))
+                    break
 
         if ModalityType.TEXT.has_field(modality.metadata, "text_spans"):
             dataset = TextSpanDataset(modality.data, modality.metadata)
@@ -415,9 +526,15 @@ def create_text_embeddings(self, data, model, aggregation=None):
             )
             inputs.to(self.device)
             with torch.no_grad():
-                text_features = model.get_text_features(**inputs)
+                if self.layer_name != "":
+                    _ = model.text_model(**inputs)
+
+                    batch_np = self.clip_output.cpu().float().numpy()
+                    if batch_np.ndim == 3:
+                        batch_np = batch_np.mean(axis=1)
+                else:
+                    batch_np = model.get_text_features(**inputs).cpu().float().numpy()
 
-                batch_np = text_features.detach().cpu().float().numpy()
                 if aggregation is not None:
                     batch_np = aggregation.execute(batch_np)
 
diff --git a/src/main/python/systemds/scuro/representations/covarep_audio_features.py b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
index 01098ef4ee1..973b7a99c2f 100644
--- a/src/main/python/systemds/scuro/representations/covarep_audio_features.py
+++ b/src/main/python/systemds/scuro/representations/covarep_audio_features.py
@@ -30,6 +30,11 @@
     register_representation,
     register_context_representation_operator,
 )
+from systemds.scuro.utils.static_variables import (
+    NP_ARRAY_HEADER_BYTES,
+    PY_LIST_HEADER_BYTES,
+    PY_LIST_SLOT_BYTES,
+)
 
 
 @register_representation(ModalityType.AUDIO)
@@ -97,11 +102,32 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
         return RepresentationStats(num_instances, (num_frames, 4))
 
     def estimate_peak_memory_bytes(self, input_stats) -> dict:
-        # TODO
-        return {
-            "cpu_peak_bytes": 0,
-            "gpu_peak_bytes": 0,
-        }
+        num_frames = 1 + max((input_stats.max_length - 1) // int(self.hop_length), 0)
+        num_frames = max(int(num_frames), 1)
+
+        out_elem = np.dtype(np.float32).itemsize
+        output_payload_per_instance = num_frames * 4 * out_elem
+        retained_output_bytes = PY_LIST_HEADER_BYTES + input_stats.num_instances * (
+            output_payload_per_instance + NP_ARRAY_HEADER_BYTES + PY_LIST_SLOT_BYTES
+        )
+
+        num_freq_bins = 1 + 2048 // 2
+        stft_bytes = num_frames * num_freq_bins * np.dtype(np.complex64).itemsize
+        magnitude_bytes = num_frames * num_freq_bins * np.dtype(np.float32).itemsize
+        per_feature_bytes = num_frames * out_elem
+        stacked_bytes = 4 * num_frames * out_elem
+        fft_workspace_bytes = max(2 * 1024 * 1024, stft_bytes // 2)
+        transient_one_instance = (
+            4 * per_feature_bytes
+            + stacked_bytes
+            + stft_bytes
+            + magnitude_bytes
+            + fft_workspace_bytes
+        )
+        cpu_peak = int(
+            (retained_output_bytes + transient_one_instance) * 1.15 + 16 * 1024 * 1024
+        )
+        return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
 
 
 @register_representation(ModalityType.AUDIO)
@@ -145,14 +171,26 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
             num_frames = 1 + max(int((signal_length - 1) // self.hop_length), 0)
             num_frames = max(int(num_frames), 1)
 
-        return RepresentationStats(num_instances, (num_frames, 1))
+        return RepresentationStats(num_instances, (1, num_frames))
 
     def estimate_peak_memory_bytes(self, input_stats) -> dict:
-        # TODO
-        return {
-            "cpu_peak_bytes": 0,
-            "gpu_peak_bytes": 0,
-        }
+        num_frames = 1 + max((input_stats.max_length - 1) // int(self.hop_length), 0)
+
+        out_elem = np.dtype(np.float32).itemsize
+        output_payload_per_instance = num_frames * out_elem
+        retained_output_bytes = PY_LIST_HEADER_BYTES + input_stats.num_instances * (
+            output_payload_per_instance + NP_ARRAY_HEADER_BYTES + PY_LIST_SLOT_BYTES
+        )
+        framed_bytes = 2048 * num_frames * np.dtype(np.float32).itemsize
+        crossings_mask_bytes = 2048 * num_frames
+        output_instance_bytes = output_payload_per_instance
+        transient_one_instance = (
+            framed_bytes + crossings_mask_bytes + output_instance_bytes
+        )
+        cpu_peak = int(
+            (retained_output_bytes + transient_one_instance) * 1.15 + 8 * 1024 * 1024
+        )
+        return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
 
 
 @register_representation(ModalityType.AUDIO)
@@ -197,14 +235,28 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
             num_frames = 1 + max(int((signal_length - 1) // self.hop_length), 0)
             num_frames = max(int(num_frames), 1)
 
-        return RepresentationStats(num_instances, (num_frames, 1))
+        return RepresentationStats(num_instances, (1, num_frames))
 
     def estimate_peak_memory_bytes(self, input_stats) -> dict:
-        # TODO
-        return {
-            "cpu_peak_bytes": 0,
-            "gpu_peak_bytes": 0,
-        }
+        num_frames = 1 + max((input_stats.max_length - 1) // int(self.hop_length), 0)
+        num_frames = max(int(num_frames), 1)
+        out_elem = np.dtype(np.float32).itemsize
+        output_payload_per_instance = num_frames * out_elem
+        retained_output_bytes = PY_LIST_HEADER_BYTES + input_stats.num_instances * (
+            output_payload_per_instance + NP_ARRAY_HEADER_BYTES + PY_LIST_SLOT_BYTES
+        )
+        frame_len = int(self.frame_length)
+        framed_bytes = frame_len * num_frames * np.dtype(np.float32).itemsize
+        squared_bytes = framed_bytes
+        mean_bytes = num_frames * np.dtype(np.float32).itemsize
+        output_instance_bytes = output_payload_per_instance
+        transient_one_instance = (
+            framed_bytes + squared_bytes + mean_bytes + output_instance_bytes
+        )
+        cpu_peak = int(
+            (retained_output_bytes + transient_one_instance) * 1.15 + 8 * 1024 * 1024
+        )
+        return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
 
 
 @register_representation(ModalityType.AUDIO)
@@ -251,11 +303,33 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
             num_frames = 1 + max(int((signal_length - 1) // self.hop_length), 0)
             num_frames = max(int(num_frames), 1)
 
-        return RepresentationStats(num_instances, (num_frames, 1))
+        return RepresentationStats(num_instances, (1, num_frames))
 
     def estimate_peak_memory_bytes(self, input_stats) -> dict:
-        # TODO
-        return {
-            "cpu_peak_bytes": 0,
-            "gpu_peak_bytes": 0,
-        }
+        num_frames = 1 + max((input_stats.max_length - 1) // int(self.hop_length), 0)
+        num_frames = max(int(num_frames), 1)
+        out_elem = np.dtype(np.float32).itemsize
+        output_payload_per_instance = num_frames * out_elem
+        retained_output_bytes = PY_LIST_HEADER_BYTES + input_stats.num_instances * (
+            output_payload_per_instance + NP_ARRAY_HEADER_BYTES + PY_LIST_SLOT_BYTES
+        )
+        n_fft = 2048
+        num_freq_bins = 1 + n_fft // 2
+        stft_bytes = num_frames * num_freq_bins * np.dtype(np.complex64).itemsize
+        pitches_bytes = num_frames * num_freq_bins * np.dtype(np.float32).itemsize
+        magnitudes_bytes = pitches_bytes
+        argmax_idx_bytes = num_frames * np.dtype(np.int64).itemsize
+        gathered_pitch_bytes = output_payload_per_instance
+        fft_workspace_bytes = max(2 * 1024 * 1024, stft_bytes // 2)
+        transient_one_instance = (
+            stft_bytes
+            + pitches_bytes
+            + magnitudes_bytes
+            + argmax_idx_bytes
+            + gathered_pitch_bytes
+            + fft_workspace_bytes
+        )
+        cpu_peak = int(
+            (retained_output_bytes + transient_one_instance) * 1.15 + 16 * 1024 * 1024
+        )
+        return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": 0}
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 46e5045b2eb..6d378806475 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -30,6 +30,11 @@
     register_representation,
     register_context_representation_operator,
 )
+from systemds.scuro.utils.static_variables import (
+    NP_ARRAY_HEADER_BYTES,
+    PY_LIST_HEADER_BYTES,
+    PY_LIST_SLOT_BYTES,
+)
 
 
 @register_representation(ModalityType.AUDIO)
@@ -76,7 +81,10 @@ def compute_feature(self, instance, sr=None):
             hop_length=self.hop_length,
             n_fft=self.n_fft,
         )
-        return S.T
+        if instance.ndim == 1:
+            return S.T
+
+        return S.transpose(0, 2, 1)
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
         num_instances = getattr(input_stats, "num_instances", 0)
@@ -94,7 +102,52 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
             if signal_length < self.n_fft:
                 num_frames = 1
             else:
-                num_frames = 1 + (signal_length - self.n_fft) // self.hop_length
+                num_frames = 1 + signal_length // self.hop_length
             num_frames = max(int(num_frames), 1)
 
         return RepresentationStats(num_instances, (num_frames, self.n_mels))
+
+    def estimate_peak_memory_bytes(self, input_stats) -> dict:
+        n = int(getattr(input_stats, "num_instances", 0))
+        if hasattr(input_stats, "max_length"):
+            signal_length = int(getattr(input_stats, "max_length", 0))
+        elif hasattr(input_stats, "output_shape") and input_stats.output_shape:
+            signal_length = int(input_stats.output_shape[0])
+        else:
+            signal_length = 0
+
+        if signal_length <= 0:
+            num_frames = 1
+        elif signal_length < self.n_fft:
+            num_frames = 1
+        else:
+            num_frames = 1 + (signal_length) // self.hop_length
+            num_frames = max(int(num_frames), 1)
+
+        out_elem = np.dtype(np.float32).itemsize
+        num_freq_bins = 1 + self.n_fft // 2
+        output_payload_per_instance = num_frames * self.n_mels * out_elem
+        retained_output_bytes = PY_LIST_HEADER_BYTES + n * (
+            output_payload_per_instance + NP_ARRAY_HEADER_BYTES + PY_LIST_SLOT_BYTES
+        )
+
+        input_copy_bytes = max(signal_length, 1) * out_elem
+        stft_bytes = num_frames * num_freq_bins * np.dtype(np.complex64).itemsize
+        power_spec_bytes = num_frames * num_freq_bins * out_elem
+        mel_output_bytes = output_payload_per_instance
+        fft_workspace_bytes = max(2 * 1024 * 1024, stft_bytes // 2)
+
+        transient_one_instance = (
+            input_copy_bytes
+            + stft_bytes
+            + power_spec_bytes
+            + mel_output_bytes
+            + fft_workspace_bytes
+        )
+        cpu_peak = int(
+            (retained_output_bytes + transient_one_instance) * 2 + 12 * 1024 * 1024
+        )
+        return {
+            "cpu_peak_bytes": cpu_peak,
+            "gpu_peak_bytes": 0,
+        }
diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py
index 737a3dffe95..406fc6616c1 100644
--- a/src/main/python/systemds/scuro/representations/mfcc.py
+++ b/src/main/python/systemds/scuro/representations/mfcc.py
@@ -30,6 +30,11 @@
     register_representation,
     register_context_representation_operator,
 )
+from systemds.scuro.utils.static_variables import (
+    NP_ARRAY_HEADER_BYTES,
+    PY_LIST_HEADER_BYTES,
+    PY_LIST_SLOT_BYTES,
+)
 
 
 @register_representation(ModalityType.AUDIO)
@@ -81,8 +86,18 @@ def compute_feature(self, instance, sr=None):
             hop_length=self.hop_length,
             n_mels=self.n_mels,
         )
-        mfcc = (mfcc - np.mean(mfcc)) / np.std(mfcc)
-        return mfcc.T
+        if mfcc.ndim == 2:
+            mean = np.mean(mfcc, keepdims=True)
+            std = np.std(mfcc, keepdims=True)
+        else:
+            mean = np.mean(mfcc, axis=(1, 2), keepdims=True)
+            std = np.std(mfcc, axis=(1, 2), keepdims=True)
+        mfcc = (mfcc - mean) / np.maximum(std, 1e-8)
+
+        if instance.ndim == 1:
+            return mfcc.T
+
+        return mfcc.transpose(0, 2, 1)
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
         num_instances = getattr(input_stats, "num_instances", 0)
@@ -101,3 +116,51 @@ def get_output_stats(self, input_stats) -> RepresentationStats:
             num_frames = max(int(num_frames), 1)
 
         return RepresentationStats(num_instances, (num_frames, self.n_mfcc))
+
+    def estimate_peak_memory_bytes(self, input_stats) -> dict:
+        n = int(getattr(input_stats, "num_instances", 0))
+        if hasattr(input_stats, "max_length"):
+            signal_length = int(getattr(input_stats, "max_length", 0))
+        elif hasattr(input_stats, "output_shape") and input_stats.output_shape:
+            signal_length = int(input_stats.output_shape[0])
+        else:
+            signal_length = 0
+
+        if signal_length <= 0:
+            num_frames = 1
+        else:
+            num_frames = 1 + max(int((signal_length - 1) // self.hop_length), 0)
+            num_frames = max(int(num_frames), 1)
+
+        out_elem = np.dtype(np.float32).itemsize
+        n_fft = 2048
+        num_freq_bins = 1 + n_fft // 2
+        output_payload_per_instance = num_frames * self.n_mfcc * out_elem
+        retained_output_bytes = PY_LIST_HEADER_BYTES + n * (
+            output_payload_per_instance + NP_ARRAY_HEADER_BYTES + PY_LIST_SLOT_BYTES
+        )
+
+        input_copy_bytes = max(signal_length, 1) * out_elem
+        stft_bytes = num_frames * num_freq_bins * np.dtype(np.complex64).itemsize
+        magnitude_bytes = num_frames * num_freq_bins * out_elem
+        mel_projection_bytes = num_frames * self.n_mels * out_elem
+        dct_output_bytes = output_payload_per_instance
+        norm_workspace_bytes = max(dct_output_bytes, 256 * 1024)
+        fft_workspace_bytes = max(2 * 1024 * 1024, stft_bytes // 2)
+
+        transient_one_instance = (
+            input_copy_bytes
+            + stft_bytes
+            + magnitude_bytes
+            + mel_projection_bytes
+            + dct_output_bytes
+            + norm_workspace_bytes
+            + fft_workspace_bytes
+        )
+        cpu_peak = int(
+            (retained_output_bytes + transient_one_instance) * 1.15 + 16 * 1024 * 1024
+        )
+        return {
+            "cpu_peak_bytes": cpu_peak,
+            "gpu_peak_bytes": 0,
+        }
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 299ccad3683..1202748aa63 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -19,6 +19,7 @@
 #
 # -------------------------------------------------------------
 from systemds.scuro.dataloader.image_loader import ImageStats
+from systemds.scuro.dataloader.video_loader import VideoStats
 from systemds.scuro.representations.representation import RepresentationStats
 from systemds.scuro.utils.torch_dataset import CustomDataset
 from systemds.scuro.modality.transformed import TransformedModality
@@ -33,12 +34,17 @@
 from systemds.scuro.utils.static_variables import get_device
 
 
+class Identity(torch.nn.Module):
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return input_
+
+
 @register_representation([ModalityType.IMAGE, ModalityType.VIDEO])
 class ResNet(UnimodalRepresentation):
     def __init__(
         self,
         model_name="ResNet18",
-        layer="avgpool",
+        layer_name="avgpool",
         output_file=None,
         batch_size=32,
         params=None,
@@ -47,21 +53,21 @@ def __init__(
         self.model = None
         self.gpu_id = None
         self.device = get_device()
+        if params is not None:
+            self.batch_size = int(params.get("batch_size", batch_size))
+            self.layer_name = params.get("layer_name", layer_name)
+        else:
+            self.batch_size = batch_size
+            self.layer_name = layer_name
         self.model_name = model_name
-        self.batch_size = batch_size
         parameters = self._get_parameters()
         super().__init__("ResNet", ModalityType.EMBEDDING, parameters)
 
         self.output_file = output_file
-        self.layer_name = layer
         self.model.eval()
         for param in self.model.parameters():
             param.requires_grad = False
 
-        class Identity(torch.nn.Module):
-            def forward(self, input_: torch.Tensor) -> torch.Tensor:
-                return input_
-
         self.model.fc = Identity()
 
     @property
@@ -109,9 +115,24 @@ def model_name(self, model_name):
             raise NotImplementedError
 
     def estimate_output_memory_bytes(self, input_stats: ImageStats) -> int:
+        if isinstance(input_stats, VideoStats):
+            return (
+                input_stats.num_instances
+                * input_stats.max_length
+                * 512
+                * self.data_type.itemsize
+            )
         return input_stats.num_instances * 512 * self.data_type.itemsize
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
+        if isinstance(input_stats, VideoStats):
+            return RepresentationStats(
+                input_stats.num_instances,
+                (
+                    input_stats.max_length,
+                    512,
+                ),
+            )
         return RepresentationStats(input_stats.num_instances, (512,))
 
     def estimate_peak_memory_bytes(self, input_stats: ImageStats) -> dict:
@@ -122,6 +143,9 @@ def estimate_peak_memory_bytes(self, input_stats: ImageStats) -> dict:
             * input_stats.max_channels
             * self.data_type.itemsize
         )
+        if isinstance(input_stats, VideoStats):
+            input_bytes = input_bytes * input_stats.max_length
+
         output_bytes = self.estimate_output_memory_bytes(input_stats)
         output_bytes_batch = output_bytes / input_stats.num_instances * self.batch_size
 
diff --git a/src/main/python/systemds/scuro/representations/swin_video_transformer.py b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
index c46d12bcb56..39191f2f252 100644
--- a/src/main/python/systemds/scuro/representations/swin_video_transformer.py
+++ b/src/main/python/systemds/scuro/representations/swin_video_transformer.py
@@ -30,6 +30,7 @@
 import numpy as np
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.drsearch.operator_registry import register_representation
+from systemds.scuro.dataloader.video_loader import VideoStats
 
 from systemds.scuro.utils.torch_dataset import CustomDataset
 from systemds.scuro.utils.static_variables import (
@@ -41,6 +42,8 @@
 
 @register_representation([ModalityType.VIDEO])
 class SwinVideoTransformer(UnimodalRepresentation):
+    _EMBED_DIM = 768
+
     def __init__(self, layer_name="avgpool", params=None):
         parameters = {
             "layer_name": [
@@ -54,7 +57,7 @@ def __init__(self, layer_name="avgpool", params=None):
                 "avgpool",
             ],
         }
-        self.data_type = torch.float
+        self.data_type = torch.float32
         super().__init__("SwinVideoTransformer", ModalityType.EMBEDDING, parameters)
         self.layer_name = layer_name
         self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1)
@@ -65,7 +68,50 @@ def __init__(self, layer_name="avgpool", params=None):
             param.requires_grad = False
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
-        return RepresentationStats(input_stats.num_instances, (768,))
+        num_instances = getattr(input_stats, "num_instances", 0)
+        return RepresentationStats(num_instances, (self._EMBED_DIM,))
+
+    def estimate_output_memory_bytes(self, input_stats: VideoStats) -> int:
+        dt = int(torch.tensor([], dtype=self.data_type).element_size())
+        return input_stats.num_instances * self._EMBED_DIM * dt
+
+    def estimate_peak_memory_bytes(self, input_stats: VideoStats) -> dict:
+        dt = int(torch.tensor([], dtype=self.data_type).element_size())
+        temporal = max(input_stats.max_length, 1)
+        input_bytes = (
+            dt
+            * input_stats.max_channels
+            * temporal
+            * input_stats.max_height
+            * input_stats.max_width
+        )
+        output_bytes = self.estimate_output_memory_bytes(input_stats)
+        n = max(input_stats.num_instances, 1)
+        output_bytes_batch = output_bytes / n
+
+        batch_peak_bytes = (input_bytes + self._EMBED_DIM * dt) * 2
+
+        safety_margin_bytes = 100 * 1024 * 1024
+
+        param_size = 0
+        for param in self.model.parameters():
+            param_size += param.nelement() * param.element_size()
+
+        buffer_size = 0
+        for buffer in self.model.buffers():
+            buffer_size += buffer.nelement() * buffer.element_size()
+
+        size_all_bytes = param_size + buffer_size
+
+        cpu_peak = (
+            size_all_bytes * 2 * dt
+            + output_bytes_batch
+            + output_bytes
+            + input_bytes
+            + safety_margin_bytes
+        )
+        gpu_peak = (size_all_bytes * dt + batch_peak_bytes) * 6
+        return {"cpu_peak_bytes": int(cpu_peak), "gpu_peak_bytes": int(gpu_peak)}
 
     def transform(self, modality, aggregation=None):
         embeddings = {}
diff --git a/src/main/python/systemds/scuro/representations/vgg.py b/src/main/python/systemds/scuro/representations/vgg.py
index fa8f121438b..35bc07d8a29 100644
--- a/src/main/python/systemds/scuro/representations/vgg.py
+++ b/src/main/python/systemds/scuro/representations/vgg.py
@@ -21,6 +21,7 @@
 from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype
 from systemds.scuro.utils.torch_dataset import CustomDataset
 from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.dataloader.video_loader import VideoStats
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from typing import Tuple, Any
 from systemds.scuro.drsearch.operator_registry import register_representation
@@ -37,6 +38,11 @@
 from systemds.scuro.representations.representation import RepresentationStats
 
 
+class Identity(torch.nn.Module):
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return input_
+
+
 @register_representation([ModalityType.IMAGE, ModalityType.VIDEO])
 class VGG19(UnimodalRepresentation):
     def __init__(
@@ -58,10 +64,6 @@ def __init__(
         for param in self.model.parameters():
             param.requires_grad = False
 
-        class Identity(torch.nn.Module):
-            def forward(self, input_: torch.Tensor) -> torch.Tensor:
-                return input_
-
         self.model.fc = Identity()
 
     @property
@@ -88,9 +90,24 @@ def _get_parameters(self):
         return parameters
 
     def estimate_output_memory_bytes(self, input_stats: ImageStats) -> int:
+        if isinstance(input_stats, VideoStats):
+            return (
+                input_stats.num_instances
+                * input_stats.max_length
+                * 4096
+                * np.dtype(np.float32).itemsize
+            )
         return input_stats.num_instances * 4096 * np.dtype(np.float32).itemsize
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
+        if isinstance(input_stats, VideoStats):
+            return RepresentationStats(
+                input_stats.num_instances,
+                (
+                    input_stats.max_length,
+                    4096,
+                ),
+            )
         return RepresentationStats(input_stats.num_instances, (4096,))
 
     def estimate_peak_memory_bytes(self, input_stats: ImageStats) -> dict:
diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py
index 38dcb848436..ece034e099b 100644
--- a/src/main/python/systemds/scuro/representations/wav2vec.py
+++ b/src/main/python/systemds/scuro/representations/wav2vec.py
@@ -65,12 +65,42 @@ def transform(self, modality, aggregation=None):
                 outputs = self.model(**input)
                 features = outputs.extract_features
                 # TODO: check how to get intermediate representations
-            result.append(torch.flatten(features.mean(dim=1), 1).detach().cpu().numpy())
+            result.append(torch.flatten(features.mean(dim=1)).detach().cpu().numpy())
 
-        transformed_modality.data = result
+        transformed_modality.data = np.array(result)
         return transformed_modality
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
         num_instances = getattr(input_stats, "num_instances", 0)
-        embedding_dim = 768
+        embedding_dim = 512
         return RepresentationStats(num_instances, (embedding_dim,))
+
+    def estimate_peak_memory_bytes(self, input_stats) -> dict:
+        n = int(getattr(input_stats, "num_instances", 1))
+
+        if hasattr(input_stats, "max_length"):
+            signal_len = int(getattr(input_stats, "max_length", 16000))
+        elif hasattr(input_stats, "output_shape") and input_stats.output_shape:
+            signal_len = int(input_stats.output_shape[0])
+        else:
+            signal_len = 16000
+        signal_len = max(signal_len, 1)
+
+        hidden = 768
+        stride = 320  # conv frontend effective stride
+        frames = max(1, int(np.ceil(signal_len / stride)))
+
+        model_resident = 420 * 1024 * 1024  # ~420 MB
+        activation_bytes = int(frames * hidden * 4 * 24)
+        io_temp = int(signal_len * 4 * 4) + 16 * 1024 * 1024
+
+        output_bytes = n * 512 * 4
+
+        cpu_peak = int(
+            (model_resident + activation_bytes + io_temp + output_bytes) * 1.25
+        )
+
+        gpu_peak = 0
+        cpu_peak = max(cpu_peak, 600 * 1024 * 1024)
+
+        return {"cpu_peak_bytes": cpu_peak, "gpu_peak_bytes": gpu_peak}
diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py
index f7a70921532..ace4cf4b8ca 100644
--- a/src/main/python/systemds/scuro/representations/x3d.py
+++ b/src/main/python/systemds/scuro/representations/x3d.py
@@ -27,7 +27,7 @@
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.representation import RepresentationStats
-from typing import Tuple, Any
+from typing import Tuple, Any, Union
 import torch.utils.data
 import torch
 from torchvision.models.video import r3d_18, s3d
@@ -35,6 +35,13 @@
 import numpy as np
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.drsearch.operator_registry import register_representation
+from systemds.scuro.dataloader.video_loader import VideoStats
+import math
+
+
+class Identity(torch.nn.Module):
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return input_
 
 
 @register_representation([ModalityType.VIDEO])
@@ -53,14 +60,52 @@ def __init__(
         for param in self.model.parameters():
             param.requires_grad = False
 
-        class Identity(torch.nn.Module):
-            def forward(self, input_: torch.Tensor) -> torch.Tensor:
-                return input_
-
         self.model.fc = Identity()
 
     def get_output_stats(self, input_stats) -> RepresentationStats:
-        return RepresentationStats(input_stats.num_instances, (512,))
+        embedding_dim = 400 * math.floor((max(input_stats.max_length, 14) - 5) / 8)
+        return RepresentationStats(input_stats.num_instances, (embedding_dim,))
+
+    def estimate_output_memory_bytes(self, input_stats: VideoStats) -> int:
+        embedding_dim = 400 * math.floor((max(input_stats.max_length, 14) - 5) / 8)
+        return input_stats.num_instances * embedding_dim * self.data_type.itemsize
+
+    def estimate_peak_memory_bytes(self, input_stats: VideoStats) -> dict:
+        temporal = max(input_stats.max_length, 14)
+        input_bytes = (
+            self.data_type.itemsize
+            * input_stats.max_channels
+            * temporal
+            * input_stats.max_height
+            * input_stats.max_width
+        )
+        output_bytes = self.estimate_output_memory_bytes(input_stats)
+        n = max(input_stats.num_instances, 1)
+        output_bytes_batch = output_bytes / n
+
+        batch_peak_bytes = (input_bytes + 512 * self.data_type.itemsize) * 2
+
+        safety_margin_bytes = 100 * 1024 * 1024
+
+        param_size = 0
+        for param in self.model.parameters():
+            param_size += param.nelement() * param.element_size()
+
+        buffer_size = 0
+        for buffer in self.model.buffers():
+            buffer_size += buffer.nelement() * buffer.element_size()
+
+        size_all_bytes = param_size + buffer_size
+
+        cpu_peak = (
+            size_all_bytes * 2 * self.data_type.itemsize
+            + output_bytes_batch
+            + output_bytes
+            + input_bytes
+            + safety_margin_bytes
+        )
+        gpu_peak = (size_all_bytes * self.data_type.itemsize + batch_peak_bytes) * 6
+        return {"cpu_peak_bytes": int(cpu_peak), "gpu_peak_bytes": int(gpu_peak)}
 
     @property
     def model_name(self):
@@ -85,6 +130,7 @@ def _get_parameters(self, high_level=True):
         for m in ["c3d", "s3d"]:
             parameters["model_name"].append(m)
 
+        # TODO: add embedding dimensions for each layer
         if high_level:
             parameters["layer_name"] = [
                 "features.1",
@@ -155,12 +201,10 @@ def hook(
             values = activation
             pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
 
-            embeddings[video_id].extend(
+            embeddings[video_id] = (
                 torch.flatten(pooled, 1).detach().cpu().numpy().flatten()
             )
 
-            embeddings[video_id] = np.array(embeddings[video_id])
-
         transformed_modality = TransformedModality(
             modality, self, self.output_modality_type
         )
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 14a373d98cf..a51ea510ea3 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -78,6 +78,7 @@ def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata
                 max(d.shape[1] for d in data),
                 max(d.shape[2] for d in data),
                 max(d.shape[3] for d in data),
+                chunk_size if chunk_size is not None else len(data),
                 len(data),
             )
         elif modality_type == ModalityType.TIMESERIES:
diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py
index 5fd22dc8d98..14ce9376be1 100644
--- a/src/main/python/tests/scuro/test_multimodal_join.py
+++ b/src/main/python/tests/scuro/test_multimodal_join.py
@@ -118,7 +118,9 @@ def _join(self, left_modality, right_modality, window_size):
             left_modality.join(
                 right_modality, JoinCondition("timestamp", "timestamp", "<")
             )
-            .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet18"))
+            .apply_representation(
+                ResNet(layer_name="layer1.0.conv2", model_name="ResNet18")
+            )
             .window_aggregation(window_size, "mean")
             .combine("concat")
         )
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index 3c5ce2a67f7..ad824b0335f 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -27,7 +27,8 @@
 from systemds.scuro.representations.color_histogram import ColorHistogram
 from systemds.scuro.drsearch.operator_registry import Registry
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
-
+from systemds.scuro.representations.mfcc import MFCC
+from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.representations.bert import Bert
@@ -48,7 +49,6 @@
 from systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )
-from systemds.scuro.representations.bert import Bert
 from systemds.scuro.modality.type import ModalityType
 
 from unittest.mock import patch
@@ -67,7 +67,6 @@ def setUpClass(cls):
 
         cls.tasks = [
             TestTask("UnimodalRepresentationTask1", "Test1", cls.num_instances),
-            TestTask("UnimodalRepresentationTask2", "Test2", cls.num_instances),
         ]
 
     def test_unimodal_optimizer_for_text_modality(self):
@@ -111,6 +110,18 @@ def test_unimodal_optimizer_for_multiple_modalities(self):
         )
         self.optimize_unimodal_representation_for_modality([text, image])
 
+    def test_unimodal_optimizer_for_audio_modality(self):
+        audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+            self.num_instances, 3000
+        )
+        audio = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+            )
+        )
+
+        self.optimize_unimodal_representation_for_modality([audio])
+
     def test_unimodal_optimizer_for_video_modality(self):
         video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
             self.num_instances, 10, 10
@@ -194,7 +205,14 @@ def optimize_unimodal_representation_for_modality(self, modalities):
                     Bert,
                     CLIPText,
                 ],
-                ModalityType.VIDEO: [ResNet],
+                ModalityType.AUDIO: [
+                    MFCC,
+                    MelSpectrogram,
+                ],
+                ModalityType.VIDEO: [
+                    ResNet,
+                    CLIPVisual,
+                ],
                 ModalityType.IMAGE: [ColorHistogram, CLIPVisual],
                 ModalityType.EMBEDDING: [],
             },
@@ -216,7 +234,7 @@ def optimize_unimodal_representation_for_modality(self, modalities):
                     in unimodal_optimizer.operator_performance.modality_ids
                 )
 
-            assert len(unimodal_optimizer.operator_performance.task_names) == 2
+            assert len(unimodal_optimizer.operator_performance.task_names) == 1
             result, cached = unimodal_optimizer.operator_performance.get_k_best_results(
                 modalities[0], self.tasks[0], "accuracy"
             )
diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py
index bdc7af50b4c..a4e18743090 100644
--- a/src/main/python/tests/scuro/test_unimodal_representations.py
+++ b/src/main/python/tests/scuro/test_unimodal_representations.py
@@ -19,6 +19,7 @@
 #
 # -------------------------------------------------------------
 
+import time
 import unittest
 import copy
 import numpy as np
@@ -40,6 +41,7 @@
 from systemds.scuro.representations.glove import GloVe
 from systemds.scuro.representations.wav2vec import Wav2Vec
 from systemds.scuro.representations.spectrogram import Spectrogram
+from systemds.scuro.representations.window_aggregation import WindowAggregation
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.tfidf import TfIdf
 from systemds.scuro.representations.x3d import X3D
@@ -84,9 +86,54 @@ class TestUnimodalRepresentations(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.num_instances = 2
+        cls.num_instances = 100
         cls.indices = np.array(range(cls.num_instances))
 
+    def _create_audio_modality(self, signal_length=1000):
+        audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
+            self.num_instances, signal_length
+        )
+
+        audio = UnimodalModality(
+            TestDataLoader(
+                self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md
+            )
+        )
+        audio.extract_raw_data()
+        return audio
+
+    def test_audio_representation_transform_output_shapes(self):
+        audio = self._create_audio_modality()
+        audio_representations = [
+            (MFCC(), (2, 12)),
+            (MelSpectrogram(), (2, 128)),
+            (Spectrogram(), (2, 1025)),
+            (Wav2Vec(), (1, None)),
+            (Spectral(), (2, 4)),
+            (ZeroCrossing(), (2, None)),
+            (RMSE(), (2, None)),
+            (Pitch(), (2, None)),
+        ]
+
+        for representation, expected_shape_signature in audio_representations:
+            with self.subTest(representation=representation.name):
+                transformed_modality = representation.transform(audio)
+                print(representation.name)
+                self.assertIsNotNone(transformed_modality.data)
+                self.assertEqual(len(transformed_modality.data), self.num_instances)
+
+                for transformed_instance in transformed_modality.data:
+                    self.assertEqual(
+                        transformed_instance.ndim,
+                        expected_shape_signature[0],
+                    )
+                    if expected_shape_signature[1] is not None:
+                        self.assertEqual(
+                            transformed_instance.shape[1],
+                            expected_shape_signature[1],
+                        )
+                    self.assertGreater(transformed_instance.shape[0], 0)
+
     def test_audio_representations(self):
         audio_representations = [
             MFCC(),
@@ -117,7 +164,6 @@ def test_audio_representations(self):
             assert len(r.data) == self.num_instances
             for i in range(self.num_instances):
                 assert (audio.data[i] == original_data[i]).all()
-            assert r.data[0].ndim == 2
 
     def test_timeseries_representations(self):
         ts_representations = [
@@ -173,27 +219,27 @@ def test_image_representations(self):
             assert r.data is not None
             assert len(r.data) == self.num_instances
 
-    def test_video_representations(self):
-        video_representations = [
-            CLIPVisual(),
-            I3D(),
-            X3D(),
-            VGG19(),
-            ResNet(),
-            SwinVideoTransformer(),
-        ]
-        video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
-            self.num_instances, 25
-        )
-        video = UnimodalModality(
-            TestDataLoader(
-                self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
-            )
-        )
-        for representation in video_representations:
-            r = video.apply_representation(representation)
-            assert r.data is not None
-            assert len(r.data) == self.num_instances
+    # def test_video_representations(self):
+    #     video_representations = [
+    #         CLIPVisual(layer_name="post_layernorm"),
+    #         I3D(),
+    #         X3D(),
+    #         VGG19(),
+    #         ResNet(),
+    #         SwinVideoTransformer(),
+    #     ]
+    #     video_data, video_md = ModalityRandomDataGenerator().create_visual_modality(
+    #         self.num_instances, 25
+    #     )
+    #     video = UnimodalModality(
+    #         TestDataLoader(
+    #             self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md
+    #         )
+    #     )
+    #     for representation in video_representations:
+    #         r = video.apply_representation(representation)
+    #         assert r.data is not None
+    #         assert len(r.data) == self.num_instances
 
     def test_text_representations(self):
         test_representations = [
diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py
index 1f954cfeb04..2eaf5985db1 100644
--- a/src/main/python/tests/scuro/test_window_operations.py
+++ b/src/main/python/tests/scuro/test_window_operations.py
@@ -93,7 +93,7 @@ def test_window_operations_on_text_representations(self):
         self.run_window_aggregation_for_modality(ModalityType.TEXT, window_size)
 
     def run_window_aggregation_for_modality(self, modality_type, window_size):
-        r = self.data_generator.create1DModality(40, 100, modality_type)
+        r = self.data_generator.create1DModality(40, 5000, modality_type)
         for aggregation in self.aggregations:
             windowed_modality = r.window_aggregation(window_size, aggregation)