From 229ceb4142e2de3ea7ed36f566a7ae8068a25daa Mon Sep 17 00:00:00 2001 From: argenis de la rosa Date: Wed, 4 Mar 2026 05:51:43 -0500 Subject: [PATCH] feat(matrix): support voice transcription with E2EE media on dev --- src/channels/matrix.rs | 84 +++++++++++++++++++++++++++++++++++++++--- src/channels/mod.rs | 3 +- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/src/channels/matrix.rs b/src/channels/matrix.rs index c3f6a5559..4289d5078 100644 --- a/src/channels/matrix.rs +++ b/src/channels/matrix.rs @@ -3,6 +3,7 @@ use async_trait::async_trait; use matrix_sdk::{ authentication::matrix::MatrixSession, config::SyncSettings, + media::{MediaFormat, MediaRequestParameters}, ruma::{ events::room::message::{ MessageType, OriginalSyncRoomMessageEvent, Relation, RoomMessageEventContent, @@ -33,6 +34,7 @@ pub struct MatrixChannel { resolved_room_id_cache: Arc>>, sdk_client: Arc>, http_client: Client, + transcription: Option, } impl std::fmt::Debug for MatrixChannel { @@ -41,6 +43,7 @@ impl std::fmt::Debug for MatrixChannel { .field("homeserver", &self.homeserver) .field("room_id", &self.room_id) .field("allowed_users", &self.allowed_users) + .field("transcription_enabled", &self.transcription.is_some()) .finish_non_exhaustive() } } @@ -172,9 +175,18 @@ impl MatrixChannel { resolved_room_id_cache: Arc::new(RwLock::new(None)), sdk_client: Arc::new(OnceCell::new()), http_client: Client::new(), + transcription: None, } } + /// Configure voice transcription. + pub fn with_transcription(mut self, config: crate::config::TranscriptionConfig) -> Self { + if config.enabled { + self.transcription = Some(config); + } + self + } + pub fn with_mention_only(mut self, mention_only: bool) -> Self { self.mention_only = mention_only; self @@ -224,7 +236,7 @@ impl MatrixChannel { } fn is_supported_message_type(msgtype: &str) -> bool { - matches!(msgtype, "m.text" | "m.notice") + matches!(msgtype, "m.text" | "m.notice" | "m.audio") } fn has_non_empty_body(body: &str) -> bool { @@ -745,6 +757,7 @@ impl Channel for MatrixChannel { let dedupe_for_handler = Arc::clone(&recent_event_cache); let bot_dedupe_for_handler = Arc::clone(&recent_bot_event_cache); let mention_only_for_handler = self.mention_only; + let transcription_for_handler = self.transcription.clone(); client.add_event_handler(move |event: OriginalSyncRoomMessageEvent, room: Room| { let tx = tx_handler.clone(); @@ -753,6 +766,7 @@ impl Channel for MatrixChannel { let allowed_users = allowed_users_for_handler.clone(); let dedupe = Arc::clone(&dedupe_for_handler); let bot_dedupe = Arc::clone(&bot_dedupe_for_handler); + let transcription = transcription_for_handler.clone(); async move { if room.room_id().as_str() != target_room.as_str() { @@ -776,6 +790,45 @@ impl Channel for MatrixChannel { let body = match &event.content.msgtype { MessageType::Text(content) => content.body.clone(), MessageType::Notice(content) => content.body.clone(), + MessageType::Audio(content) => { + // Check if transcription is enabled + if let Some(ref tc) = transcription { + if tc.enabled { + // 2. Media Acquisition + // We use the room's client to fetch the actual media bytes. + // This handles decryption automatically if the room is E2EE. + let media_request = MediaRequestParameters { + source: content.source.clone(), + format: MediaFormat::File, + }; + match room.client().media().get_media_content(&media_request, true).await { + Ok(audio_data) => { + // 3. Transcription via local service + match crate::channels::transcription::transcribe_audio( + audio_data, + &content.body, + tc, + ).await { + Ok(text) => text, + Err(e) => { + tracing::warn!("Matrix voice transcription failed: {e}"); + format!("[Audio transcription failed: {}]", content.body) + } + } + } + Err(e) => { + let safe_error = MatrixChannel::sanitize_error_for_log(&e); + tracing::warn!("Failed to download Matrix audio: {safe_error}"); + format!("[Audio download failed: {}]", content.body) + } + } + } else { + return; + } + } else { + return; + } + } _ => return, }; @@ -783,18 +836,18 @@ impl Channel for MatrixChannel { return; } - let mut is_direct_room = false; - let mut is_mentioned = false; - let mut is_reply_to_bot = false; - if mention_only_for_handler { - is_direct_room = room.is_direct().await.unwrap_or_else(|error| { + let is_direct_room = room.is_direct().await.unwrap_or_else(|error| { let safe_error = MatrixChannel::sanitize_error_for_log(&error); tracing::warn!( "Matrix is_direct() failed while evaluating mention_only gate: {safe_error}" ); false }); + + let mut is_mentioned = false; + let mut is_reply_to_bot = false; + if !is_direct_room { is_mentioned = MatrixChannel::event_mentions_user(&event, &body, my_user_id.as_str()); @@ -1018,10 +1071,29 @@ mod tests { fn supported_message_type_detection() { assert!(MatrixChannel::is_supported_message_type("m.text")); assert!(MatrixChannel::is_supported_message_type("m.notice")); + assert!(MatrixChannel::is_supported_message_type("m.audio")); assert!(!MatrixChannel::is_supported_message_type("m.image")); assert!(!MatrixChannel::is_supported_message_type("m.file")); } + #[test] + fn with_transcription_configures_enabled_config() { + let mut tc = crate::config::TranscriptionConfig::default(); + tc.enabled = true; + + let ch = make_channel().with_transcription(tc); + assert!(ch.transcription.is_some()); + } + + #[test] + fn with_transcription_ignores_disabled_config() { + let mut tc = crate::config::TranscriptionConfig::default(); + tc.enabled = false; + + let ch = make_channel().with_transcription(tc); + assert!(ch.transcription.is_none()); + } + #[test] fn body_presence_detection() { assert!(MatrixChannel::has_non_empty_body("hello")); diff --git a/src/channels/mod.rs b/src/channels/mod.rs index b115bc177..8581076af 100644 --- a/src/channels/mod.rs +++ b/src/channels/mod.rs @@ -4254,7 +4254,8 @@ fn collect_configured_channels( mx.device_id.clone(), config.config_path.parent().map(|path| path.to_path_buf()), ) - .with_mention_only(mx.mention_only), + .with_mention_only(mx.mention_only) + .with_transcription(config.transcription.clone()), ), }); }