From 6e0c9b700c46a9a6801f4c4e99644c9000fbd6ec Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 11:04:09 +0200 Subject: [PATCH 01/17] fix(STT): whisper call was blocking entire huri's loop --- src/modules/speech_to_text/speech_to_text.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 1300dd3..fdc68f0 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -41,7 +41,7 @@ def __init__( ): super().__init__() - self.model_faster = WhisperModel(model) + self.model_faster = WhisperModel(model, cpu_threads=2) self.language = language self.sample_rate = sample_rate @@ -83,13 +83,15 @@ async def process(self, voice: Voice) -> Optional[Transcript]: self.pending_silence = False processing_audio = np.concatenate(processing_chunks, axis=0) - segments, _ = self.model_faster.transcribe( - processing_audio, - language=self.language, - beam_size=1, # faster for realtime - ) + def transcribe_text(): + segments, _ = self.model_faster.transcribe( + processing_audio, + language=self.language, + beam_size=1, + ) + return " ".join(seg.text for seg in segments).strip() - current_text = " ".join([seg.text for seg in segments]).strip() + current_text = await asyncio.to_thread(transcribe_text) processed_size = self.window_size - self.step_size async with self.lock: From d72b1c369b3ff8729d701435edfd789e77f36255 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 17:19:19 +0200 Subject: [PATCH 02/17] feat(client): added ClientHook abstract class --- src/core/client.py | 119 ++++++++++++++++++++++++++++++++----- src/core/client_senders.py | 102 ------------------------------- 2 files changed, 105 insertions(+), 116 deletions(-) delete mode 100644 src/core/client_senders.py diff --git a/src/core/client.py b/src/core/client.py index 085a0b8..4170e70 100644 --- a/src/core/client.py +++ b/src/core/client.py @@ -1,14 +1,79 @@ import asyncio +import importlib import json import os +import struct +from collections import defaultdict from dataclasses import asdict -from typing import Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type import websockets from src.core.dataclasses.config import ClientConfig +from src.core.events import EventData -from .client_senders import ClientSender, get_senders +from .interface import Interface + + +class ClientSender: + """This class abstract sending data to HuRI. + + output_type: is the event data structure that the ClientSender will send. + It can be EventData or bytes, and must match event topic it send. + + Class derived from ClientSender must implement input_loop, + and use ClientSender.send to send data to HuRI. + """ + + output_type: Type[EventData] | bytes + + def __init__(self, topic: str, **_): + self.topic = topic + if issubclass(self.output_type, EventData): + self.send_function = self._send_event_data + elif issubclass(self.output_type, bytes): + self.send_function = self._send_bytes + else: + raise RuntimeError(f"{self.output_type} should be inherited from \ +EventData or bytes") + + async def input_loop(self, ws: websockets.ClientConnection): + raise NotImplementedError + + async def _send_bytes(self, ws: websockets.ClientConnection, data: bytes): + topic_bytes = self.topic.encode() + packet = struct.pack("!H", len(topic_bytes)) + topic_bytes + data + + await ws.send(packet) + + async def _send_event_data(self, ws: websockets.ClientConnection, data: EventData): + packet = json.dumps({"topic": self.topic, "data": asdict(data)}) + + await ws.send(packet) + + async def send(self, ws: websockets.ClientConnection, data: EventData | bytes): + await self.send_function(ws, data) + + +class ClientHook: + """This class abstract processing data from HuRI. + + input_type: is the event data structure that the ClientHook will process. + It can be EventData or bytes, and must match event topic it react to. + + Class derived from ClientHook must implement hook. + + `singletton` allow hooks to modifies shared ressources, + and comes from the used interface. + """ + + input_type: Type[EventData] | bytes + + def __init__(self, **_): + pass + + async def hook(self, singletton: Any, data: EventData | bytes): + raise NotImplementedError class Client: @@ -18,11 +83,29 @@ def __init__( self, config: ClientConfig, user_id_file: str = os.path.expanduser("~/.huri_user_id"), - senders_dict: Dict[str, Type[ClientSender]] = get_senders(), ): self.config = config + + module_path, object_name = self.config.interface_path.split(":", 1) + + module = importlib.import_module(module_path) + interface: Interface = getattr(module, object_name) + + self.singletton = interface.singletton + + available_senders = interface.get_senders() + self.senders: List[ClientSender] = [ + available_senders[sender.name](topic=sender.topic, **sender.args) + for sender in self.config.senders.values() + ] + + available_hooks = interface.get_hooks() + self.hooks: Dict[str, List[ClientHook]] = defaultdict(list) + for hook in self.config.hooks.values(): + for topic in hook.topics: + self.hooks[topic].append(available_hooks[hook.name](**hook.args)) + self.user_id_file = user_id_file - self.senders_dict = senders_dict def _load_user_id(self) -> Optional[str]: if os.path.exists(self.user_id_file): @@ -37,9 +120,22 @@ def _save_user_id(self, _user_id: str): async def _receive_loop(self, ws: websockets.ClientConnection): try: while True: - text = await ws.recv() - print("<<", text) - await asyncio.sleep(0.1) + msg = await ws.recv() + + if isinstance(msg, bytes): + topic_len = struct.unpack("!H", msg[:2])[0] + + topic = msg[2 : 2 + topic_len].decode() + data = msg[2 + topic_len :] + else: + event = json.loads(msg) + topic = event["topic"] + data = event["data"] + + for hook in self.hooks[topic]: + if not issubclass(hook.input_type, bytes): + data = hook.input_type(**data) + asyncio.create_task(hook.hook(self.singletton, data)) except (asyncio.CancelledError, websockets.ConnectionClosedOK): pass @@ -50,11 +146,6 @@ async def run(self): self.config.user_id = self._load_user_id() - senders: List[ClientSender] = [ - self.senders_dict[config.name](ws=ws, **config.args) - for config in self.config.senders.values() - ] - await ws.send(json.dumps(asdict(self.config))) init_msg = json.loads(await ws.recv()) @@ -63,9 +154,9 @@ async def run(self): self._save_user_id(user_id) print(f"Session started with _user_id: {user_id}") - receive_task = asyncio.create_task(self._receive_loop(ws)) + receive_task = asyncio.create_task(self._receive_loop(ws=ws)) await asyncio.gather( - *(sender.input_loop() for sender in senders), + *(sender.input_loop(ws=ws) for sender in self.senders), ) receive_task.cancel() diff --git a/src/core/client_senders.py b/src/core/client_senders.py deleted file mode 100644 index 03301a6..0000000 --- a/src/core/client_senders.py +++ /dev/null @@ -1,102 +0,0 @@ -import asyncio -import json -import struct -from dataclasses import asdict -from typing import Dict, Type - -import numpy as np -import sounddevice as sd -import websockets -from prompt_toolkit import PromptSession -from prompt_toolkit.patch_stdout import patch_stdout - -from src.core.events import EventData -from src.modules.speech_to_text.events import Sentence - - -class ClientSender: - """This class abstract sending data to HuRI. - - output_type: is the topic that the ClientSender will send. - Data structure must match event topic. - - Class derived from ClientSender must implement input_loop, - and use ClientSender.send to send data to HuRI. It can be EventData or bytes - """ - - output_type: str - - def __init__(self, ws: websockets.ClientConnection): - self.ws = ws - - async def input_loop(self): - raise NotImplementedError - - async def send(self, topic: str, data: EventData | bytes): - packet: str | bytes - if isinstance(data, EventData): - packet = json.dumps({"topic": topic, "data": asdict(data)}) - else: - topic_bytes = topic.encode() - - packet = struct.pack("!H", len(topic_bytes)) + topic_bytes + data - - await self.ws.send(packet) - - -class AudioSender(ClientSender): - output_type = "audio" - - def __init__( - self, sample_rate: int = 16000, frame_duration: float = 0.030, **kwargs - ): - super().__init__(**kwargs) - - self.sample_rate = sample_rate - self.frame_size = int(sample_rate * frame_duration) - - async def input_loop(self): - loop = asyncio.get_running_loop() - - queue: asyncio.Queue[np.ndarray] = asyncio.Queue() - - def callback(indata: np.ndarray, frames, time, status): - loop.call_soon_threadsafe(queue.put_nowait, indata.copy()) - - with sd.InputStream( - samplerate=self.sample_rate, - channels=1, - dtype="int16", - callback=callback, - blocksize=self.frame_size, - ): - while True: - chunk = await queue.get() - await self.send(self.output_type, chunk.tobytes()) - - -class TextSender(ClientSender): - output_type = "question" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - async def input_loop(self): - print("'\\exit' or CTRL+D/C to exit.") - session: PromptSession = PromptSession() - try: - while True: - with patch_stdout(): - text = await session.prompt_async(">> ") - if text == "\\exit": - return - await self.send(self.output_type, Sentence(text)) - - except (EOFError, KeyboardInterrupt): - pass - finally: - print("TextSender Exited...") - - -def get_senders() -> Dict[str, Type[ClientSender]]: - return {"audio": AudioSender, "text": TextSender} From eafd64d51503f6100bbfb8f5368c1aa354d12c15 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:07:57 +0200 Subject: [PATCH 03/17] evol(client): better typing for event --- src/core/client.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/core/client.py b/src/core/client.py index 4170e70..cae9dfa 100644 --- a/src/core/client.py +++ b/src/core/client.py @@ -5,17 +5,17 @@ import struct from collections import defaultdict from dataclasses import asdict -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, Generic, List, Optional, Type, TypeVar import websockets from src.core.dataclasses.config import ClientConfig from src.core.events import EventData -from .interface import Interface +T = TypeVar("T", bound=EventData | bytes) -class ClientSender: +class ClientSender(Generic[T]): """This class abstract sending data to HuRI. output_type: is the event data structure that the ClientSender will send. @@ -25,17 +25,10 @@ class ClientSender: and use ClientSender.send to send data to HuRI. """ - output_type: Type[EventData] | bytes + output_type: Type[T] def __init__(self, topic: str, **_): self.topic = topic - if issubclass(self.output_type, EventData): - self.send_function = self._send_event_data - elif issubclass(self.output_type, bytes): - self.send_function = self._send_bytes - else: - raise RuntimeError(f"{self.output_type} should be inherited from \ -EventData or bytes") async def input_loop(self, ws: websockets.ClientConnection): raise NotImplementedError @@ -51,11 +44,14 @@ async def _send_event_data(self, ws: websockets.ClientConnection, data: EventDat await ws.send(packet) - async def send(self, ws: websockets.ClientConnection, data: EventData | bytes): - await self.send_function(ws, data) + async def send(self, ws: websockets.ClientConnection, data: T): + if isinstance(data, bytes): + await self._send_bytes(ws, data) + else: + await self._send_event_data(ws, data) -class ClientHook: +class ClientHook(Generic[T]): """This class abstract processing data from HuRI. input_type: is the event data structure that the ClientHook will process. @@ -67,12 +63,12 @@ class ClientHook: and comes from the used interface. """ - input_type: Type[EventData] | bytes + input_type: Type[T] def __init__(self, **_): pass - async def hook(self, singletton: Any, data: EventData | bytes): + async def hook(self, singletton: Any, data: T): raise NotImplementedError @@ -89,7 +85,7 @@ def __init__( module_path, object_name = self.config.interface_path.split(":", 1) module = importlib.import_module(module_path) - interface: Interface = getattr(module, object_name) + interface = getattr(module, object_name) self.singletton = interface.singletton @@ -133,7 +129,7 @@ async def _receive_loop(self, ws: websockets.ClientConnection): data = event["data"] for hook in self.hooks[topic]: - if not issubclass(hook.input_type, bytes): + if not isinstance(data, bytes): data = hook.input_type(**data) asyncio.create_task(hook.hook(self.singletton, data)) From 32f990faeb4b328e41b4d29462d6164de605aaff Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:09:18 +0200 Subject: [PATCH 04/17] evol(Sender): send topic and data --- src/modules/utils/sender.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/modules/utils/sender.py b/src/modules/utils/sender.py index f09b0ba..a9fc2fa 100644 --- a/src/modules/utils/sender.py +++ b/src/modules/utils/sender.py @@ -1,3 +1,4 @@ +import struct from dataclasses import asdict from fastapi import WebSocket @@ -23,8 +24,8 @@ def __init__(self, ws: WebSocket, type: str): async def process(self, data: EventData | bytes): if isinstance(data, bytes): - await self.ws.send_bytes(data) - elif isinstance(data, EventData): - await self.ws.send_json(asdict(data)) + topic_bytes = self.input_type.encode() + packet = struct.pack("!H", len(topic_bytes)) + topic_bytes + data + await self.ws.send_bytes(packet) else: - await self.ws.send_text(data) + await self.ws.send_json({"topic": self.input_type, "data": asdict(data)}) From d078a628459d747acd0a84dd9adbc8b9364de500 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:10:09 +0200 Subject: [PATCH 05/17] feat(Interface): abstract class to define specific Client sender and hooks --- src/core/interface.py | 22 ++++++++++++++++++++++ src/interfaces/__init__.py | 0 2 files changed, 22 insertions(+) create mode 100644 src/core/interface.py create mode 100644 src/interfaces/__init__.py diff --git a/src/core/interface.py b/src/core/interface.py new file mode 100644 index 0000000..fb2b7ac --- /dev/null +++ b/src/core/interface.py @@ -0,0 +1,22 @@ +from typing import Any, Dict, Type + +from .client import ClientHook, ClientSender + + +class Interface: + """This class abstract defining specific Client senders and hooks. + + `self.singletton`: allow hooks to modifies shared ressources, + and comes from the used interface. + + Class derived from Interface must implement get_senders and get_hooks. + """ + + def __init__(self, singletton: Any): + self.singletton = singletton + + def get_senders(self) -> Dict[str, Type[ClientSender]]: + raise NotImplementedError + + def get_hooks(self) -> Dict[str, Type[ClientHook]]: + raise NotImplementedError diff --git a/src/interfaces/__init__.py b/src/interfaces/__init__.py new file mode 100644 index 0000000..e69de29 From 349372d32ff18cd59e8b544b0ddf1cbf7b5495a7 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:11:17 +0200 Subject: [PATCH 06/17] feat(config): ClientHookConfig + Interface path + modified topic_list --- src/core/dataclasses/config.py | 31 +++++++++++++++++++++++++++---- src/core/huri.py | 11 +++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/core/dataclasses/config.py b/src/core/dataclasses/config.py index aea111f..f515026 100644 --- a/src/core/dataclasses/config.py +++ b/src/core/dataclasses/config.py @@ -15,15 +15,32 @@ def from_dict(self, raw: dict) -> "ModuleConfig": ) +@dataclass +class ClientHookConfig: + name: str + topics: List[str] + args: Mapping[str, Any] + + @classmethod + def from_dict(self, raw: dict) -> "ClientHookConfig": + return self( + name=raw["name"], + topics=raw["topics"], + args=raw.get("args", {}), + ) + + @dataclass class ClientSenderConfig: name: str + topic: str args: Mapping[str, Any] @classmethod def from_dict(self, raw: dict) -> "ClientSenderConfig": return self( name=raw["name"], + topic=raw["topic"], args=raw.get("args", {}), ) @@ -32,15 +49,20 @@ def from_dict(self, raw: dict) -> "ClientSenderConfig": class ClientConfig: user_id: Optional[str] huri_url: str - topic_list: List[str] + interface_path: str + hooks: Dict[str, ClientHookConfig] senders: Dict[str, ClientSenderConfig] modules: Dict[str, ModuleConfig] @classmethod def from_dict(cls, raw: Dict) -> "ClientConfig": + hooks = { + hook_id: ClientHookConfig.from_dict(hok_raw) + for hook_id, hok_raw in raw.get("hooks", {}).items() + } senders = { - sender_id: ClientSenderConfig.from_dict(mod_raw) - for sender_id, mod_raw in raw.get("senders", {}).items() + sender_id: ClientSenderConfig.from_dict(snd_raw) + for sender_id, snd_raw in raw.get("senders", {}).items() } modules = { module_id: ModuleConfig.from_dict(mod_raw) @@ -49,7 +71,8 @@ def from_dict(cls, raw: Dict) -> "ClientConfig": return cls( user_id=None, huri_url=raw["huri_url"], - topic_list=raw["topic_list"], + interface_path=raw["interface_path"], + hooks=hooks, senders=senders, modules=modules, ) diff --git a/src/core/huri.py b/src/core/huri.py index 5fa8038..f5e4eeb 100644 --- a/src/core/huri.py +++ b/src/core/huri.py @@ -32,7 +32,7 @@ def __init__( self, modules: Dict[str, Type[Module]], handles: Dict[str, handle.DeploymentHandle], - events: Dict[str, Type[EventData]], + events: Dict[str, Type[EventData | bytes]], ) -> None: self.module_factory = ModuleFactory(handles) self.event_factory = EventDataFactory() @@ -80,9 +80,12 @@ async def run_session(self, ws: WebSocket): user_id = client_config_raw.get("user_id") or str(uuid.uuid4()) - senders: List[Module] = [ - Sender(ws, topic) for topic in client_config.topic_list + topic_list = [ + topic + for hook_config in client_config.hooks.values() + for topic in hook_config.topics ] + senders: List[Module] = [Sender(ws, topic) for topic in topic_list] modules: List[Module] = ( self.module_factory.create_from_config(user_id, client_config.modules) + senders @@ -112,7 +115,7 @@ async def receive_loop(session: Session, ws: WebSocket): msg_text = msg["text"] event = json.loads(msg_text) topic = event["topic"] - data = event["data"] + data = event["data"] # TODO client/server one function data = self.event_factory.create(topic, data) From f1d112d9ff4b50a39ce213627ad3b8a64ed11d00 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:11:46 +0200 Subject: [PATCH 07/17] feat(interface): added cli_interface for cli use --- src/interfaces/cli_interface.py | 123 +++++++++++++++++++ src/modules/speech_to_text/speech_to_text.py | 2 +- 2 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 src/interfaces/cli_interface.py diff --git a/src/interfaces/cli_interface.py b/src/interfaces/cli_interface.py new file mode 100644 index 0000000..c07469f --- /dev/null +++ b/src/interfaces/cli_interface.py @@ -0,0 +1,123 @@ +import asyncio +from typing import Dict, Type + +import numpy as np +import sounddevice as sd +from prompt_toolkit import PromptSession +from prompt_toolkit.patch_stdout import patch_stdout +from scipy.signal import resample + +from src.core.client import ClientHook, ClientSender +from src.core.interface import Interface +from src.modules.speech_to_text.events import Sentence + + +class AudioSender(ClientSender[bytes]): + def __init__( + self, sample_rate: int = 16000, frame_duration: float = 0.030, **kwargs + ): + super().__init__(**kwargs) + + self.sample_rate = sample_rate + self.frame_size = int(sample_rate * frame_duration) + + async def input_loop(self, ws): + loop = asyncio.get_running_loop() + + queue: asyncio.Queue[np.ndarray] = asyncio.Queue() + + def callback(indata: np.ndarray, frames, time, status): + loop.call_soon_threadsafe(queue.put_nowait, indata.copy()) + + with sd.InputStream( + samplerate=self.sample_rate, + channels=1, + dtype="int16", + callback=callback, + blocksize=self.frame_size, + ): + while True: + chunk = await queue.get() + await self.send(ws, chunk.tobytes()) + + +class TextSender(ClientSender[Sentence]): + output_type = Sentence + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def input_loop(self, ws): + print("'\\exit' or CTRL+D/C to exit.") + session: PromptSession = PromptSession() + try: + while True: + with patch_stdout(): + text = await session.prompt_async(">> ") + if text == "\\exit": + return + await self.send(ws, Sentence(text)) + + except (EOFError, KeyboardInterrupt): + pass + finally: + print("TextSender Exited...") + + +class AudioHook(ClientHook[bytes]): + input_type = bytes + + def __init__(self, sample_rate=48000, incoming_sample_rate=16000, **kwargs): + super().__init__(**kwargs) + + print("Speaker:", sd.query_devices(kind="output")) + + self.incoming_sample_rate = incoming_sample_rate + self.sample_rate = sample_rate + self.stream = sd.OutputStream( + samplerate=sample_rate, + channels=1, + dtype="int16", + ) + self.stream.start() + + self.resample_function = ( + self._resample if sample_rate != incoming_sample_rate else lambda x: x + ) + + def _resample(self, audio: np.ndarray): + return resample( + audio, + int(len(audio) * self.sample_rate / self.incoming_sample_rate), + ).astype(np.int16) + + async def hook(self, singletton: None, data: bytes): + audio = np.frombuffer(data, dtype=np.int16) + + audio = self.resample_function(audio) + + self.stream.write(audio.reshape(-1, 1)) + + +class TextHook(ClientHook[Sentence]): + input_type = Sentence + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + async def hook(self, singletton: None, data: Sentence): + print("<<", data.text) + + +class CLIInterface(Interface): + def __init__(self): + super().__init__(singletton=None) + + def get_senders(self) -> Dict[str, Type[ClientSender]]: + return {"audio": AudioSender, "text": TextSender} + + def get_hooks(self) -> Dict[str, Type[ClientHook]]: + return {"audio": AudioHook, "text": TextHook} + + +cli_interface = CLIInterface() diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index fdc68f0..63bf060 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -41,7 +41,7 @@ def __init__( ): super().__init__() - self.model_faster = WhisperModel(model, cpu_threads=2) + self.model_faster = WhisperModel(model) self.language = language self.sample_rate = sample_rate From 88119f62ce11512c3ee01f6830aacc765ac42087 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:32:07 +0200 Subject: [PATCH 08/17] evol(interface): cli TextHook is for RAGResult event type --- src/interfaces/cli_interface.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/interfaces/cli_interface.py b/src/interfaces/cli_interface.py index c07469f..03cacbf 100644 --- a/src/interfaces/cli_interface.py +++ b/src/interfaces/cli_interface.py @@ -9,6 +9,7 @@ from src.core.client import ClientHook, ClientSender from src.core.interface import Interface +from src.modules.rag.events import RAGResult from src.modules.speech_to_text.events import Sentence @@ -99,14 +100,14 @@ async def hook(self, singletton: None, data: bytes): self.stream.write(audio.reshape(-1, 1)) -class TextHook(ClientHook[Sentence]): - input_type = Sentence +class TextHook(ClientHook[RAGResult]): + input_type = RAGResult def __init__(self, **kwargs): super().__init__(**kwargs) - async def hook(self, singletton: None, data: Sentence): - print("<<", data.text) + async def hook(self, singletton: None, data: RAGResult): + print("<<", data.answer) class CLIInterface(Interface): From ac292b680b5d897d776b7c80e5975a5f30986f0a Mon Sep 17 00:00:00 2001 From: Popochounet Date: Mon, 1 Jun 2026 18:32:31 +0200 Subject: [PATCH 09/17] evol(config): yaml files with new config --- config/{client_aux2.yaml => client_aux .yaml} | 22 +++++++++++++-- config/client_aux.yaml | 28 ------------------- config/client_auxio.yaml | 25 ----------------- config/client_template.yaml | 21 ++++++++++++-- config/client_text.yaml | 8 +++++- 5 files changed, 44 insertions(+), 60 deletions(-) rename config/{client_aux2.yaml => client_aux .yaml} (55%) delete mode 100644 config/client_aux.yaml delete mode 100644 config/client_auxio.yaml diff --git a/config/client_aux2.yaml b/config/client_aux .yaml similarity index 55% rename from config/client_aux2.yaml rename to config/client_aux .yaml index 7d7b601..d1f5195 100644 --- a/config/client_aux2.yaml +++ b/config/client_aux .yaml @@ -1,13 +1,31 @@ huri_url: ws://localhost:8000/session -topic_list: [transcript, question, rag_response] +interface_path: src.interfaces.cli_interface:cli_interface senders: audio: name: audio + topic: audio args: sample_rate: 16000 frame_duration: 0.030 + text: + name: text + topic: question + args: + sample_rate: 16000 + frame_duration: 0.030 + +hooks: + text: + name: text + topics: [question, answer] + audio: + name: audio + topics: [audio] + args: + incoming_sample_rate: ${senders.audio.args.sample_rate} + sample_rate: 44100 modules: mic: @@ -21,10 +39,8 @@ modules: args: language: en block_duration: ${senders.audio.args.frame_duration} - logging: INFO tag: name: tag - logging: INFO rag: name: rag args: diff --git a/config/client_aux.yaml b/config/client_aux.yaml deleted file mode 100644 index fe3e332..0000000 --- a/config/client_aux.yaml +++ /dev/null @@ -1,28 +0,0 @@ -huri_url: ws://localhost:8000/session - -topic_list: [question] - -senders: - audio: - name: audio - args: - sample_rate: 16000 - frame_duration: 0.030 - -modules: - mic: - name: mic - args: - vad_agressiveness: 3 - silence_duration: 1.5 - block_duration: ${inputs.audio.args.frame_duration} - logging: INFO - stt: - name: stt - args: - language: "en" - block_duration: ${inputs.audio.args.frame_duration} - logging: INFO - tag: - name: tag - logging: INFO diff --git a/config/client_auxio.yaml b/config/client_auxio.yaml deleted file mode 100644 index 8fa2a91..0000000 --- a/config/client_auxio.yaml +++ /dev/null @@ -1,25 +0,0 @@ -huri_url: ws://localhost:8000/session - -topic_list: [question] - -senders: - text: - name: text - -modules: - mic: - name: mic - args: - vad_agressiveness: 3 - silence_duration: 1.5 - block_duration: ${senders.audio.args.frame_duration} - logging: INFO - stt: - name: stt - args: - language: en - block_duration: ${senders.audio.args.frame_duration} - logging: INFO - tag: - name: tag - logging: INFO diff --git a/config/client_template.yaml b/config/client_template.yaml index cf1627d..441f3c5 100644 --- a/config/client_template.yaml +++ b/config/client_template.yaml @@ -1,19 +1,34 @@ # HuRI websocket server url huri_url: ws://localhost:8000/session -# List of event topic the client will receive -topic_list: [topic1, topic2] +# Define interface to be used's import path +interface_path: src.interfaces.cli_interface:cli_interface # Define senders to be used and their custom args senders: # sender tag can be anything example: - # sender name must be in the list of available ClientSender in Client instance (src.client_sender:get_senders) + # sender name must be in the list of available ClientSender in chosen Interface (Interface.get_senders) name: my_sender + # topic the sender will send to HuRI, it must match output_type event data structure + topic: my_event # if my_sender init with "model", "sample_rate" and "refresh_rate" params, they can be customized here args: refresh_rate: infinite +# Define hooks to be used and their custom args +hooks: + # hook tag can be anything + example: + # hook name must be in the list of available ClientHook in chosen Interface (Interface.get_senders) + name: my_hook + # topics the hook will process from HuRI, it must match input_type event data structure + topics: [my_event, llm_response] + # if my_hook init with "model", "sample_rate" and "refresh_rate" params, they can be customized here + args: + sample_rate: 0 + no: beat + # Define module to be used and their custom args modules: # module tag can be anything diff --git a/config/client_text.yaml b/config/client_text.yaml index 8ddcaab..d2fb26f 100644 --- a/config/client_text.yaml +++ b/config/client_text.yaml @@ -1,10 +1,16 @@ huri_url: ws://localhost:8000/session -topic_list: [question, rag_response] +interface_path: src.interfaces.cli_interface:cli_interface senders: text: name: text + topic: question + +hooks: + text: + name: text + topics: [rag_response] modules: rag: From 87210a74979cc0e3d875e6e32386be4b43ce936e Mon Sep 17 00:00:00 2001 From: Popochounet Date: Tue, 2 Jun 2026 06:27:50 +0200 Subject: [PATCH 10/17] evol(client): move singletton to __init__ for senders and hooks --- src/core/client.py | 31 ++++++++++++++++++++----------- src/interfaces/cli_interface.py | 4 ++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/core/client.py b/src/core/client.py index cae9dfa..6927565 100644 --- a/src/core/client.py +++ b/src/core/client.py @@ -23,12 +23,18 @@ class ClientSender(Generic[T]): Class derived from ClientSender must implement input_loop, and use ClientSender.send to send data to HuRI. + + `singletton` is available to access shared ressources. """ output_type: Type[T] - def __init__(self, topic: str, **_): + def __init__(self, topic: str, singletton: Any, **_): + """ + :topic: topic sent to HuRI + :singletton: allow to get shared ressources""" self.topic = topic + self.singletton = singletton async def input_loop(self, ws: websockets.ClientConnection): raise NotImplementedError @@ -59,16 +65,15 @@ class ClientHook(Generic[T]): Class derived from ClientHook must implement hook. - `singletton` allow hooks to modifies shared ressources, - and comes from the used interface. + `singletton` is available to access and modifies shared ressources. """ input_type: Type[T] - def __init__(self, **_): - pass + def __init__(self, singletton: Any, **_): + self.singletton = singletton - async def hook(self, singletton: Any, data: T): + async def hook(self, data: T): raise NotImplementedError @@ -87,11 +92,11 @@ def __init__( module = importlib.import_module(module_path) interface = getattr(module, object_name) - self.singletton = interface.singletton - available_senders = interface.get_senders() self.senders: List[ClientSender] = [ - available_senders[sender.name](topic=sender.topic, **sender.args) + available_senders[sender.name]( + topic=sender.topic, singletton=interface.singletton, **sender.args + ) for sender in self.config.senders.values() ] @@ -99,7 +104,11 @@ def __init__( self.hooks: Dict[str, List[ClientHook]] = defaultdict(list) for hook in self.config.hooks.values(): for topic in hook.topics: - self.hooks[topic].append(available_hooks[hook.name](**hook.args)) + self.hooks[topic].append( + available_hooks[hook.name]( + singletton=interface.singletton, **hook.args + ) + ) self.user_id_file = user_id_file @@ -131,7 +140,7 @@ async def _receive_loop(self, ws: websockets.ClientConnection): for hook in self.hooks[topic]: if not isinstance(data, bytes): data = hook.input_type(**data) - asyncio.create_task(hook.hook(self.singletton, data)) + asyncio.create_task(hook.hook(data)) except (asyncio.CancelledError, websockets.ConnectionClosedOK): pass diff --git a/src/interfaces/cli_interface.py b/src/interfaces/cli_interface.py index 03cacbf..2634d07 100644 --- a/src/interfaces/cli_interface.py +++ b/src/interfaces/cli_interface.py @@ -92,7 +92,7 @@ def _resample(self, audio: np.ndarray): int(len(audio) * self.sample_rate / self.incoming_sample_rate), ).astype(np.int16) - async def hook(self, singletton: None, data: bytes): + async def hook(self, data: bytes): audio = np.frombuffer(data, dtype=np.int16) audio = self.resample_function(audio) @@ -106,7 +106,7 @@ class TextHook(ClientHook[RAGResult]): def __init__(self, **kwargs): super().__init__(**kwargs) - async def hook(self, singletton: None, data: RAGResult): + async def hook(self, data: RAGResult): print("<<", data.answer) From 2f70d427d71a2fdf5eec267962ad6190947740f5 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 09:54:56 +0200 Subject: [PATCH 11/17] feat(emotion): prosody analysis module (EMO) --- src/modules/emotion/events.py | 12 +++ src/modules/emotion/prosody_analysis.py | 104 +++++++++++++++++++ src/modules/events.py | 2 + src/modules/modules.py | 3 +- src/modules/speech_to_text/speech_to_text.py | 4 - 5 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 src/modules/emotion/events.py create mode 100644 src/modules/emotion/prosody_analysis.py diff --git a/src/modules/emotion/events.py b/src/modules/emotion/events.py new file mode 100644 index 0000000..4c29c37 --- /dev/null +++ b/src/modules/emotion/events.py @@ -0,0 +1,12 @@ +from dataclasses import dataclass +from typing import Dict + +from src.core.events import EventData + + +@dataclass +class Emotion(EventData): + label: str + confidence: float + scores: Dict[str, float] + end: bool diff --git a/src/modules/emotion/prosody_analysis.py b/src/modules/emotion/prosody_analysis.py new file mode 100644 index 0000000..8510330 --- /dev/null +++ b/src/modules/emotion/prosody_analysis.py @@ -0,0 +1,104 @@ +import asyncio +from typing import List, Optional + +import numpy as np +import torch +from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor + +from src.core.module import Module +from src.modules.speech_to_text.events import Voice + +from .events import Emotion + + +class EMO(Module): + """EMO Module + + Prosody Analysis of user voice speech. + + input: voice, + output: emotion + """ + + input_type = "voice" + output_type = "emotion" + + def __init__( + self, + model_name: str = "superb/hubert-large-superb-er", + sample_rate: int = 16000, + block_duration: float = 0.020, # s + analysis_window: float = 5.0, # s + ): + super().__init__() + + self.model = AutoModelForAudioClassification.from_pretrained(model_name) + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name) + + self.sample_rate = sample_rate + self.window_size = int(analysis_window / block_duration) + + self.buffer: List[np.ndarray] = [] + + self.silence: bool = True + + self.running = False + self.lock: asyncio.Lock = asyncio.Lock() + + def _predict_emotion(self, audio_np: np.ndarray): + inputs = self.feature_extractor( + audio_np, sampling_rate=self.sample_rate, return_tensors="pt", padding=True + ) + + with torch.no_grad(): + logits = self.model(**inputs).logits + probs = torch.softmax(logits, dim=-1)[0] + + predicted_id = torch.argmax(probs).item() + + labels = self.model.config.id2label + + return { + "label": labels[predicted_id], + "confidence": float(probs[predicted_id]), + "scores": {labels[i]: float(probs[i]) for i in range(len(labels))}, + } + + async def process(self, voice: Voice) -> Optional[Emotion]: + if voice.data is None: + self.silence = True + else: + self.silence = False + async with self.lock: + self.buffer.append(voice.data) + + async with self.lock: + if self.running: + return None + self.running = True + + async with self.lock: + buffer_size = len(self.buffer) + if buffer_size == 0 or ( + self.silence is False and buffer_size < self.window_size + ): + self.running = False + return None + processing_chunks = self.buffer[: self.window_size] + + processing_audio = np.concatenate(processing_chunks, axis=0) + + emotion_result = await asyncio.to_thread( + self._predict_emotion, audio_np=processing_audio + ) + + async with self.lock: + self.buffer = self.buffer[self.window_size :] + self.running = False + + return Emotion( + emotion_result["label"], + emotion_result["confidence"], + emotion_result["scores"], + self.silence, + ) diff --git a/src/modules/events.py b/src/modules/events.py index 43f6c71..5ff1dfd 100644 --- a/src/modules/events.py +++ b/src/modules/events.py @@ -1,6 +1,7 @@ from typing import Dict, Type from src.core.events import EventData +from src.modules.emotion.events import Emotion from src.modules.rag.events import RAGResult from src.modules.speech_to_text.events import Sentence, Transcript, Voice @@ -10,6 +11,7 @@ def get_events() -> Dict[str, Type[EventData | bytes]]: "audio": bytes, "voice": Voice, "transcript": Transcript, + "emotion": Emotion, "question": Sentence, "rag_response": RAGResult, } diff --git a/src/modules/modules.py b/src/modules/modules.py index 8fbc53c..e72dda6 100644 --- a/src/modules/modules.py +++ b/src/modules/modules.py @@ -1,5 +1,6 @@ from typing import Dict, Type +from src.modules.emotion.prosody_analysis import EMO from src.modules.rag.rag import RAG from src.modules.speech_to_text.microphone_vad import MIC from src.modules.speech_to_text.speech_to_text import STT @@ -9,4 +10,4 @@ def get_modules() -> Dict[str, Type[Module]]: - return {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG} + return {"mic": MIC, "stt": STT, "tag": TAG, "emo": EMO, "rag": RAG} diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 63bf060..3d71403 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -52,9 +52,6 @@ def __init__( self.silence: bool = True - self.prev_text: str = "" - self.stable_text: str = "" - self.running = False self.lock: asyncio.Lock = asyncio.Lock() @@ -80,7 +77,6 @@ async def process(self, voice: Voice) -> Optional[Transcript]: return None processing_chunks = self.buffer[: self.window_size] - self.pending_silence = False processing_audio = np.concatenate(processing_chunks, axis=0) def transcribe_text(): From 93189957c3a173aba2ac71a91d002a8c05d5f270 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 11:16:10 +0200 Subject: [PATCH 12/17] evol(rag): partial question instead of sentence --- src/interfaces/cli_interface.py | 8 ++++---- src/modules/__init__.py | 0 src/modules/emotion/__init__.py | 0 src/modules/events.py | 7 ++++--- src/modules/rag/__init__.py | 0 src/modules/rag/events.py | 19 +++++++++++++++++++ src/modules/rag/rag.py | 9 +++++---- src/modules/speech_to_text/events.py | 5 ----- src/modules/speech_to_text/text_aggregator.py | 13 +++++++------ 9 files changed, 39 insertions(+), 22 deletions(-) create mode 100644 src/modules/__init__.py create mode 100644 src/modules/emotion/__init__.py create mode 100644 src/modules/rag/__init__.py diff --git a/src/interfaces/cli_interface.py b/src/interfaces/cli_interface.py index 2634d07..212f4ff 100644 --- a/src/interfaces/cli_interface.py +++ b/src/interfaces/cli_interface.py @@ -10,7 +10,7 @@ from src.core.client import ClientHook, ClientSender from src.core.interface import Interface from src.modules.rag.events import RAGResult -from src.modules.speech_to_text.events import Sentence +from src.modules.speech_to_text.events import Transcript class AudioSender(ClientSender[bytes]): @@ -42,8 +42,8 @@ def callback(indata: np.ndarray, frames, time, status): await self.send(ws, chunk.tobytes()) -class TextSender(ClientSender[Sentence]): - output_type = Sentence +class TextSender(ClientSender[Transcript]): + output_type = Transcript def __init__(self, **kwargs): super().__init__(**kwargs) @@ -57,7 +57,7 @@ async def input_loop(self, ws): text = await session.prompt_async(">> ") if text == "\\exit": return - await self.send(ws, Sentence(text)) + await self.send(ws, Transcript(text, end=True)) except (EOFError, KeyboardInterrupt): pass diff --git a/src/modules/__init__.py b/src/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/emotion/__init__.py b/src/modules/emotion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/events.py b/src/modules/events.py index 5ff1dfd..b8d42d2 100644 --- a/src/modules/events.py +++ b/src/modules/events.py @@ -2,8 +2,8 @@ from src.core.events import EventData from src.modules.emotion.events import Emotion -from src.modules.rag.events import RAGResult -from src.modules.speech_to_text.events import Sentence, Transcript, Voice +from src.modules.rag.events import RAGResult, RAGQuestion, PartialQuestion +from src.modules.speech_to_text.events import Transcript, Voice def get_events() -> Dict[str, Type[EventData | bytes]]: @@ -12,6 +12,7 @@ def get_events() -> Dict[str, Type[EventData | bytes]]: "voice": Voice, "transcript": Transcript, "emotion": Emotion, - "question": Sentence, + "partial_question": PartialQuestion, + "question": RAGQuestion, "rag_response": RAGResult, } diff --git a/src/modules/rag/__init__.py b/src/modules/rag/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/rag/events.py b/src/modules/rag/events.py index 5d237d2..24f883f 100644 --- a/src/modules/rag/events.py +++ b/src/modules/rag/events.py @@ -1,6 +1,9 @@ from dataclasses import dataclass, field +from typing import Optional from src.core.events import EventData +from src.modules.emotion.events import Emotion +from src.modules.speech_to_text.events import Transcript @dataclass @@ -9,3 +12,19 @@ class RAGResult(EventData): answer: str sources: list[dict] = field(default_factory=list) + + +@dataclass +class PartialQuestion(EventData): + """Partial question used to aggregate a sentence to an emotion.""" + + transcript: Optional[Transcript] + emotion: Optional[Emotion] + + +@dataclass +class RAGQuestion(EventData): + """Fully aggregated question to send to the RAG.""" + + transcript: Transcript + emotion: Optional[Emotion] diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index 6b9744d..a49fdc4 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -9,9 +9,8 @@ from sentence_transformers import SentenceTransformer from src.core.module import ModuleWithHandle, ModuleWithId -from src.modules.speech_to_text.events import Sentence -from .events import RAGResult +from .events import RAGResult, RAGQuestion @dataclass @@ -300,12 +299,14 @@ def __init__( "extra_instructions": extra_instructions, } - async def process(self, data: Sentence) -> Optional[RAGResult]: + async def process(self, data: RAGQuestion) -> Optional[RAGResult]: """ Called when a "question" event arrives through the event bus. Packages _user_id + question, sends to the stateless RAGHandle. """ - question_text = data.text + question_text = data.transcript.text + + print(data) query = RAGQuery( _user_id=self._user_id if self._user_id else "anonymous", diff --git a/src/modules/speech_to_text/events.py b/src/modules/speech_to_text/events.py index e80f522..fc04674 100644 --- a/src/modules/speech_to_text/events.py +++ b/src/modules/speech_to_text/events.py @@ -15,8 +15,3 @@ class Transcript(EventData): @dataclass class Voice(EventData): data: Optional[np.ndarray] - - -@dataclass -class Sentence(EventData): - text: str diff --git a/src/modules/speech_to_text/text_aggregator.py b/src/modules/speech_to_text/text_aggregator.py index 72760af..b9ef1ce 100644 --- a/src/modules/speech_to_text/text_aggregator.py +++ b/src/modules/speech_to_text/text_aggregator.py @@ -2,8 +2,9 @@ from typing import Optional from src.core.module import Module +from src.modules.rag.events import PartialQuestion -from .events import Sentence, Transcript +from .events import Transcript class TAG(Module): @@ -12,11 +13,11 @@ class TAG(Module): Aggregate all transcriptions and send when transcript end. input: transcript, - output: question + output: partial_question """ input_type = "transcript" - output_type = "question" + output_type = "partial_question" def __init__( self, @@ -36,7 +37,7 @@ def _merge(self, current: str, new: str) -> str: self.prev_index = len(current) return current + new - async def process(self, transcript: Transcript) -> Optional[Sentence]: + async def process(self, transcript: Transcript) -> Optional[PartialQuestion]: text = transcript.text if text != "": @@ -46,9 +47,9 @@ async def process(self, transcript: Transcript) -> Optional[Sentence]: self.sentence = self._merge(self.sentence, text) if transcript.end and self.sentence != "": - sentence = Sentence(self.sentence) + transcript = Transcript(self.sentence, True) self.sentence = "" self.prev_index = 0 - return sentence + return PartialQuestion(transcript=transcript, emotion=None) else: return None From 6dcd1d3bed02e7d8d5394def7304d30c56df1154 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 11:16:44 +0200 Subject: [PATCH 13/17] feat(emotion): emotion aggregator module (EAG) --- src/modules/emotion/emotion_aggregator.py | 68 +++++++++++++++++++++++ src/modules/modules.py | 12 +++- 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 src/modules/emotion/emotion_aggregator.py diff --git a/src/modules/emotion/emotion_aggregator.py b/src/modules/emotion/emotion_aggregator.py new file mode 100644 index 0000000..1eabd10 --- /dev/null +++ b/src/modules/emotion/emotion_aggregator.py @@ -0,0 +1,68 @@ +from collections import defaultdict +from typing import Optional + +from src.core.module import Module +from src.modules.rag.events import PartialQuestion + +from .events import Emotion + + +class EAG(Module): + """EAG Module + + Aggregate all emotions and send when voice end. + + input: emotion, + output: partial_question + """ + + input_type = "emotion" + output_type = "partial_question" + + def __init__(self, ema_alpha: Optional[float] = None): + super().__init__() + + self.scores = defaultdict(float) + self.count = 0 + + self.ema_alpha = ema_alpha + + def _finalize(self) -> Emotion: + avg_scores = ( + {label: score / self.count for label, score in self.scores.items()} + if self.ema_alpha is None + else self.scores + ) + + best_label = max(avg_scores, key=avg_scores.get) + + result = Emotion( + label=best_label, + confidence=avg_scores[best_label], + scores=avg_scores, + end=True, + ) + + self.scores.clear() + self.count = 0 + + return result + + async def process(self, emotion: Emotion) -> Optional[PartialQuestion]: + if self.ema_alpha is not None: + for label, score in emotion.scores.items(): + self.scores[label] = ( + self.ema_alpha * score + (1 - self.ema_alpha) * self.scores[label] + ) + else: + for label, score in emotion.scores.items(): + self.scores[label] += score + + self.count += 1 + + if emotion.end: + emotion = self._finalize() + + return PartialQuestion(transcript=None, emotion=emotion) + + return None diff --git a/src/modules/modules.py b/src/modules/modules.py index e72dda6..ce60461 100644 --- a/src/modules/modules.py +++ b/src/modules/modules.py @@ -1,6 +1,8 @@ from typing import Dict, Type +from src.modules.emotion.emotion_aggregator import EAG from src.modules.emotion.prosody_analysis import EMO +from src.modules.rag.question_aggregator import QAG from src.modules.rag.rag import RAG from src.modules.speech_to_text.microphone_vad import MIC from src.modules.speech_to_text.speech_to_text import STT @@ -10,4 +12,12 @@ def get_modules() -> Dict[str, Type[Module]]: - return {"mic": MIC, "stt": STT, "tag": TAG, "emo": EMO, "rag": RAG} + return { + "mic": MIC, + "stt": STT, + "tag": TAG, + "emo": EMO, + "eag": EAG, + "qag": QAG, + "rag": RAG, + } From 2eeb0ff6800d6a8adf4ef3523977fff93fbf90ea Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 11:17:28 +0200 Subject: [PATCH 14/17] feat(rag): question aggregator module (QAG); aggregate emotion and transcript into RAGQuestion --- config/{client_aux .yaml => client_aux.yaml} | 11 +++-- src/modules/rag/question_aggregator.py | 44 ++++++++++++++++++++ 2 files changed, 52 insertions(+), 3 deletions(-) rename config/{client_aux .yaml => client_aux.yaml} (86%) create mode 100644 src/modules/rag/question_aggregator.py diff --git a/config/client_aux .yaml b/config/client_aux.yaml similarity index 86% rename from config/client_aux .yaml rename to config/client_aux.yaml index d1f5195..a82834b 100644 --- a/config/client_aux .yaml +++ b/config/client_aux.yaml @@ -17,9 +17,6 @@ senders: frame_duration: 0.030 hooks: - text: - name: text - topics: [question, answer] audio: name: audio topics: [audio] @@ -41,6 +38,14 @@ modules: block_duration: ${senders.audio.args.frame_duration} tag: name: tag + emo: + name: emo + args: + block_duration: ${senders.audio.args.frame_duration} + eag: + name: eag + qag: + name: qag rag: name: rag args: diff --git a/src/modules/rag/question_aggregator.py b/src/modules/rag/question_aggregator.py new file mode 100644 index 0000000..0424ea5 --- /dev/null +++ b/src/modules/rag/question_aggregator.py @@ -0,0 +1,44 @@ +from typing import Optional + +from src.core.module import Module +from src.modules.emotion.events import Emotion +from src.modules.speech_to_text.events import Transcript + +from .events import PartialQuestion, RAGQuestion + + +class QAG(Module): + """QAG Module + + Aggregate sentence and emotion into a RAGQuestion. + + input: partial_question, + output: question + """ + + input_type = "partial_question" + output_type = "question" + + def __init__(self, use_emotion: bool = True): + super().__init__() + + self.current_transcript: Optional[Transcript] = None + self.current_emotion: Optional[Emotion] = None + + self.use_emotion = use_emotion + + async def process(self, partial_question: PartialQuestion) -> Optional[RAGQuestion]: + if partial_question.emotion is not None: + self.current_emotion = partial_question.emotion + + if partial_question.transcript is not None: + self.current_transcript = partial_question.transcript + + if self.current_transcript is not None: + if self.use_emotion: + if self.current_emotion is not None: + return RAGQuestion(self.current_transcript, self.current_emotion) + else: + return RAGQuestion(self.current_transcript, None) + + return None From d8ec74248d760d62ea85866e5a8bc5611f72efb1 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 11:35:44 +0200 Subject: [PATCH 15/17] evol(module): STT + EMO documentation --- src/modules/emotion/prosody_analysis.py | 7 ++++++- src/modules/speech_to_text/speech_to_text.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/modules/emotion/prosody_analysis.py b/src/modules/emotion/prosody_analysis.py index 8510330..d6319cd 100644 --- a/src/modules/emotion/prosody_analysis.py +++ b/src/modules/emotion/prosody_analysis.py @@ -18,6 +18,11 @@ class EMO(Module): input: voice, output: emotion + + :model_name: name of the Emotion Analysis model. + :sample_rate: size of received voice audio. Usually 8000, 16000 or 48000. + :block_duration: size of received voice audio (in s). + :analysis_window: duration of audio per analysis (in s). """ input_type = "voice" @@ -28,7 +33,7 @@ def __init__( model_name: str = "superb/hubert-large-superb-er", sample_rate: int = 16000, block_duration: float = 0.020, # s - analysis_window: float = 5.0, # s + analysis_window: float = 4.0, # s ): super().__init__() diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 3d71403..6531e87 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -25,6 +25,8 @@ class STT(Module): as "en" or "fr". :sample_rate: size of received voice audio. Usually 8000, 16000 or 48000. :block_duration: size of received voice audio (in s). + :transcribe_window: duration of audio per transcription (in s). + :transcribe_step: overlap between consecutive transcription windows (in s). """ input_type = "voice" From e9a53e98d2cea592b9ffc0f53a0d678c2ab395f3 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 11:45:55 +0200 Subject: [PATCH 16/17] fix(linter): make lint --- src/modules/emotion/emotion_aggregator.py | 13 +++++++++---- src/modules/emotion/prosody_analysis.py | 2 +- src/modules/events.py | 2 +- src/modules/rag/rag.py | 4 +--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/modules/emotion/emotion_aggregator.py b/src/modules/emotion/emotion_aggregator.py index 1eabd10..f3144fa 100644 --- a/src/modules/emotion/emotion_aggregator.py +++ b/src/modules/emotion/emotion_aggregator.py @@ -1,5 +1,5 @@ from collections import defaultdict -from typing import Optional +from typing import Dict, Optional from src.core.module import Module from src.modules.rag.events import PartialQuestion @@ -14,6 +14,11 @@ class EAG(Module): input: emotion, output: partial_question + + :ema_alpha: if not None, the aggragation will use ema computation instead + of average. Recents emotion will have stronger impact on the final score. + Lower alpha will make impact lower, and higher alpha will make it higher. \ + Default alpha would be ~0.3. """ input_type = "emotion" @@ -22,8 +27,8 @@ class EAG(Module): def __init__(self, ema_alpha: Optional[float] = None): super().__init__() - self.scores = defaultdict(float) - self.count = 0 + self.scores: Dict[str, float] = defaultdict(float) + self.count: int = 0 self.ema_alpha = ema_alpha @@ -34,7 +39,7 @@ def _finalize(self) -> Emotion: else self.scores ) - best_label = max(avg_scores, key=avg_scores.get) + best_label = max(avg_scores, key=lambda label: avg_scores[label]) result = Emotion( label=best_label, diff --git a/src/modules/emotion/prosody_analysis.py b/src/modules/emotion/prosody_analysis.py index d6319cd..cd50b3d 100644 --- a/src/modules/emotion/prosody_analysis.py +++ b/src/modules/emotion/prosody_analysis.py @@ -59,7 +59,7 @@ def _predict_emotion(self, audio_np: np.ndarray): logits = self.model(**inputs).logits probs = torch.softmax(logits, dim=-1)[0] - predicted_id = torch.argmax(probs).item() + predicted_id = int(torch.argmax(probs).item()) labels = self.model.config.id2label diff --git a/src/modules/events.py b/src/modules/events.py index b8d42d2..387bdab 100644 --- a/src/modules/events.py +++ b/src/modules/events.py @@ -2,7 +2,7 @@ from src.core.events import EventData from src.modules.emotion.events import Emotion -from src.modules.rag.events import RAGResult, RAGQuestion, PartialQuestion +from src.modules.rag.events import PartialQuestion, RAGQuestion, RAGResult from src.modules.speech_to_text.events import Transcript, Voice diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index a49fdc4..e14e7c8 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -10,7 +10,7 @@ from src.core.module import ModuleWithHandle, ModuleWithId -from .events import RAGResult, RAGQuestion +from .events import RAGQuestion, RAGResult @dataclass @@ -306,8 +306,6 @@ async def process(self, data: RAGQuestion) -> Optional[RAGResult]: """ question_text = data.transcript.text - print(data) - query = RAGQuery( _user_id=self._user_id if self._user_id else "anonymous", question=question_text, From 4fc5982585cb50d29294433fae1cd261224026d5 Mon Sep 17 00:00:00 2001 From: Popochounet Date: Sun, 14 Jun 2026 11:54:07 +0200 Subject: [PATCH 17/17] evol(module): QAG documentation --- src/modules/rag/question_aggregator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/modules/rag/question_aggregator.py b/src/modules/rag/question_aggregator.py index 0424ea5..8fd864b 100644 --- a/src/modules/rag/question_aggregator.py +++ b/src/modules/rag/question_aggregator.py @@ -14,6 +14,9 @@ class QAG(Module): input: partial_question, output: question + + :use_emotion: default True. + Set to False if you do not analyze emotion or do not need it. """ input_type = "partial_question"