diff --git a/src/channels/telegram.rs b/src/channels/telegram.rs index 314d6fa18..0f67f9bba 100644 --- a/src/channels/telegram.rs +++ b/src/channels/telegram.rs @@ -330,6 +330,7 @@ pub struct TelegramChannel { /// Override for local Bot API servers or testing. api_base: String, transcription: Option, + voice_transcriptions: Mutex>, } impl TelegramChannel { @@ -359,6 +360,7 @@ impl TelegramChannel { bot_username: Mutex::new(None), api_base: "https://api.telegram.org".to_string(), transcription: None, + voice_transcriptions: Mutex::new(std::collections::HashMap::new()), } } @@ -915,7 +917,16 @@ Allowlist Telegram username (without '@') or numeric user ID.", return None; } - let content = if let Some(quote) = Self::extract_reply_context(message) { + // Cache transcription for reply-context lookups + { + let mut cache = self.voice_transcriptions.lock(); + if cache.len() >= 100 { + cache.clear(); + } + cache.insert(format!("{chat_id}:{message_id}"), text.clone()); + } + + let content = if let Some(quote) = self.extract_reply_context(message) { format!("{quote}\n\n[Voice] {text}") } else { format!("[Voice] {text}") @@ -957,7 +968,7 @@ Allowlist Telegram username (without '@') or numeric user ID.", } /// Extract reply context from a Telegram `reply_to_message`, if present. - fn extract_reply_context(message: &serde_json::Value) -> Option { + fn extract_reply_context(&self, message: &serde_json::Value) -> Option { let reply = message.get("reply_to_message")?; let reply_sender = reply @@ -975,7 +986,20 @@ Allowlist Telegram username (without '@') or numeric user ID.", let reply_text = if let Some(text) = reply.get("text").and_then(serde_json::Value::as_str) { text.to_string() } else if reply.get("voice").is_some() || reply.get("audio").is_some() { - "[Voice message]".to_string() + let reply_mid = reply.get("message_id").and_then(serde_json::Value::as_i64); + let chat_id = message + .get("chat") + .and_then(|c| c.get("id")) + .and_then(serde_json::Value::as_i64); + if let (Some(mid), Some(cid)) = (reply_mid, chat_id) { + self.voice_transcriptions + .lock() + .get(&format!("{cid}:{mid}")) + .map(|t| format!("[Voice] {t}")) + .unwrap_or_else(|| "[Voice message]".to_string()) + } else { + "[Voice message]".to_string() + } } else if reply.get("photo").is_some() { "[Photo]".to_string() } else if reply.get("document").is_some() { @@ -1081,7 +1105,7 @@ Allowlist Telegram username (without '@') or numeric user ID.", text.to_string() }; - let content = if let Some(quote) = Self::extract_reply_context(message) { + let content = if let Some(quote) = self.extract_reply_context(message) { format!("{quote}\n\n{content}") } else { content @@ -3405,38 +3429,42 @@ mod tests { #[test] fn extract_reply_context_text_message() { + let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); let msg = serde_json::json!({ "reply_to_message": { "from": { "username": "alice" }, "text": "Hello world" } }); - let ctx = TelegramChannel::extract_reply_context(&msg).unwrap(); + let ctx = ch.extract_reply_context(&msg).unwrap(); assert_eq!(ctx, "> @alice:\n> Hello world"); } #[test] fn extract_reply_context_voice_message() { + let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); let msg = serde_json::json!({ "reply_to_message": { "from": { "username": "bob" }, "voice": { "file_id": "abc", "duration": 5 } } }); - let ctx = TelegramChannel::extract_reply_context(&msg).unwrap(); + let ctx = ch.extract_reply_context(&msg).unwrap(); assert_eq!(ctx, "> @bob:\n> [Voice message]"); } #[test] fn extract_reply_context_no_reply() { + let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); let msg = serde_json::json!({ "text": "just a regular message" }); - assert!(TelegramChannel::extract_reply_context(&msg).is_none()); + assert!(ch.extract_reply_context(&msg).is_none()); } #[test] fn extract_reply_context_truncates_long_text() { + let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); let long_text = "a".repeat(300); let msg = serde_json::json!({ "reply_to_message": { @@ -3444,7 +3472,7 @@ mod tests { "text": long_text } }); - let ctx = TelegramChannel::extract_reply_context(&msg).unwrap(); + let ctx = ch.extract_reply_context(&msg).unwrap(); // Should contain truncation marker assert!(ctx.contains('…')); // The quoted content (after "> ") should be 200 chars + "…" @@ -3459,16 +3487,36 @@ mod tests { #[test] fn extract_reply_context_no_username_uses_first_name() { + let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); let msg = serde_json::json!({ "reply_to_message": { "from": { "id": 999, "first_name": "Charlie" }, "text": "Hi there" } }); - let ctx = TelegramChannel::extract_reply_context(&msg).unwrap(); + let ctx = ch.extract_reply_context(&msg).unwrap(); assert_eq!(ctx, "> @Charlie:\n> Hi there"); } + #[test] + fn extract_reply_context_voice_with_cached_transcription() { + let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); + // Pre-populate transcription cache + ch.voice_transcriptions + .lock() + .insert("100:42".to_string(), "Hello from voice".to_string()); + let msg = serde_json::json!({ + "chat": { "id": 100 }, + "reply_to_message": { + "message_id": 42, + "from": { "username": "bob" }, + "voice": { "file_id": "abc", "duration": 5 } + } + }); + let ctx = ch.extract_reply_context(&msg).unwrap(); + assert_eq!(ctx, "> @bob:\n> [Voice] Hello from voice"); + } + #[test] fn parse_update_message_includes_reply_context() { let ch = TelegramChannel::new("t".into(), vec!["*".into()], false); @@ -3517,4 +3565,92 @@ mod tests { TelegramChannel::new("token".into(), vec!["*".into()], false).with_transcription(tc); assert!(ch.transcription.is_none()); } + + // ───────────────────────────────────────────────────────────────────── + // Live e2e: voice transcription via Groq Whisper + reply cache lookup + // ───────────────────────────────────────────────────────────────────── + + /// Live test: voice transcription via Groq Whisper + reply cache lookup. + /// + /// Loads a pre-recorded MP3 fixture ("hello"), sends it to Groq Whisper + /// API, verifies the transcription contains "hello", then caches it and + /// checks that `extract_reply_context` returns the cached text instead + /// of the `[Voice message]` fallback placeholder. + /// + /// Skipped automatically when `GROQ_API_KEY` is not set. + /// Run: `GROQ_API_KEY= cargo test --lib -- telegram::tests::e2e_live_voice_transcription_and_reply_cache --ignored` + #[tokio::test] + #[ignore] + async fn e2e_live_voice_transcription_and_reply_cache() { + if std::env::var("GROQ_API_KEY").is_err() { + eprintln!("GROQ_API_KEY not set — skipping live voice transcription test"); + return; + } + + // 1. Load pre-recorded fixture (TTS-generated "hello", ~7 KB MP3) + let fixture_path = + std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/hello.mp3"); + let audio_data = std::fs::read(&fixture_path).unwrap_or_else(|e| { + panic!( + "Failed to read fixture {}: {e}", + fixture_path.display() + ) + }); + assert!( + audio_data.len() > 1000, + "fixture too small ({} bytes), likely corrupt", + audio_data.len() + ); + + // 2. Call transcribe_audio() — real Groq Whisper API + let config = crate::config::TranscriptionConfig { + enabled: true, + ..Default::default() + }; + let transcript: String = + crate::channels::transcription::transcribe_audio(audio_data, "hello.mp3", &config) + .await + .expect("transcribe_audio should succeed with valid GROQ_API_KEY"); + + // 3. Verify Whisper actually recognized "hello" + assert!( + transcript.to_lowercase().contains("hello"), + "expected transcription to contain 'hello', got: '{transcript}'" + ); + + // 4. Create TelegramChannel, insert transcription into voice_transcriptions cache + let ch = TelegramChannel::new("test_token".into(), vec!["*".into()], false); + let chat_id: i64 = 12345; + let message_id: i64 = 67; + let cache_key = format!("{chat_id}:{message_id}"); + ch.voice_transcriptions + .lock() + .insert(cache_key, transcript.clone()); + + // 5. Build reply message with voice + message_id + chat.id + let msg = serde_json::json!({ + "chat": { "id": chat_id }, + "reply_to_message": { + "message_id": message_id, + "from": { "username": "zeroclaw_user" }, + "voice": { "file_id": "test_file", "duration": 1 } + } + }); + + // 6. Verify extract_reply_context returns cached transcription + let ctx = ch + .extract_reply_context(&msg) + .expect("extract_reply_context should return Some for voice reply"); + + assert!( + ctx.contains(&format!("[Voice] {transcript}")), + "expected cached transcription in reply context, got: {ctx}" + ); + + // Must NOT contain the fallback placeholder + assert!( + !ctx.contains("[Voice message]"), + "context should use cached transcription, not fallback placeholder, got: {ctx}" + ); + } } diff --git a/tests/fixtures/hello.mp3 b/tests/fixtures/hello.mp3 new file mode 100644 index 000000000..bf0136639 Binary files /dev/null and b/tests/fixtures/hello.mp3 differ