From 61398eb900d5325200b31d2092c8afefaf443724 Mon Sep 17 00:00:00 2001 From: Chummy Date: Mon, 2 Mar 2026 14:49:51 +0800 Subject: [PATCH] fix(channels): robust qq/feishu image delivery and multimodal proxy fetch routing --- docs/config-reference.md | 5 + docs/proxy-agent-playbook.md | 4 +- src/agent/loop_.rs | 8 +- src/channels/lark.rs | 481 ++++++++++++++++++++++++++++++++-- src/channels/mod.rs | 18 ++ src/channels/qq.rs | 275 +++++++++++++++++-- src/config/schema.rs | 1 + src/gateway/mod.rs | 13 +- src/multimodal.rs | 205 +++++++++++++-- src/providers/openai_codex.rs | 14 +- 10 files changed, 952 insertions(+), 72 deletions(-) diff --git a/docs/config-reference.md b/docs/config-reference.md index d78d5fae2..b614c817a 100644 --- a/docs/config-reference.md +++ b/docs/config-reference.md @@ -622,6 +622,11 @@ Notes: - Remote URL only when `allow_remote_fetch = true` - Allowed MIME types: `image/png`, `image/jpeg`, `image/webp`, `image/gif`, `image/bmp`. - When the active provider does not support vision, requests fail with a structured capability error (`capability=vision`) instead of silently dropping images. +- In `proxy.scope = "services"` mode, remote image fetch uses service-key routing. For best compatibility include relevant selectors/keys such as: + - `channel.qq` (QQ media hosts like `multimedia.nt.qq.com.cn`) + - `tool.multimodal` (dedicated multimodal fetch path) + - `tool.http_request` (compatibility fallback path) + - `provider.*` or the active provider key (for example `provider.openai`) ## `[browser]` diff --git a/docs/proxy-agent-playbook.md b/docs/proxy-agent-playbook.md index 5e1cbefff..0d4f2cca5 100644 --- a/docs/proxy-agent-playbook.md +++ b/docs/proxy-agent-playbook.md @@ -113,14 +113,14 @@ Use when only part of the system should use proxy (for example specific provider ### 5.1 Target specific services ```json -{"action":"set","enabled":true,"scope":"services","services":["provider.openai","tool.http_request","channel.telegram"],"all_proxy":"socks5h://127.0.0.1:1080","no_proxy":["localhost","127.0.0.1",".internal"]} +{"action":"set","enabled":true,"scope":"services","services":["provider.openai","tool.multimodal","tool.http_request","channel.telegram"],"all_proxy":"socks5h://127.0.0.1:1080","no_proxy":["localhost","127.0.0.1",".internal"]} {"action":"get"} ``` ### 5.2 Target by selectors ```json -{"action":"set","enabled":true,"scope":"services","services":["provider.*","tool.*"],"http_proxy":"http://127.0.0.1:7890"} +{"action":"set","enabled":true,"scope":"services","services":["provider.*","tool.*","channel.qq"],"http_proxy":"http://127.0.0.1:7890"} {"action":"get"} ``` diff --git a/src/agent/loop_.rs b/src/agent/loop_.rs index b6f7667e3..95fe8fb7e 100644 --- a/src/agent/loop_.rs +++ b/src/agent/loop_.rs @@ -1146,8 +1146,12 @@ pub async fn run_tool_call_loop( .into()); } - let prepared_messages = - multimodal::prepare_messages_for_provider(history, multimodal_config).await?; + let prepared_messages = multimodal::prepare_messages_for_provider_with_provider_hint( + history, + multimodal_config, + Some(provider_name), + ) + .await?; let mut request_messages = prepared_messages.messages.clone(); if let Some(prompt) = missing_tool_call_retry_prompt.take() { request_messages.push(ChatMessage::user(prompt)); diff --git a/src/channels/lark.rs b/src/channels/lark.rs index 5c202f284..d378545e8 100644 --- a/src/channels/lark.rs +++ b/src/channels/lark.rs @@ -5,6 +5,7 @@ use base64::Engine; use futures_util::{SinkExt, StreamExt}; use prost::Message as ProstMessage; use std::collections::HashMap; +use std::path::Path; use std::sync::{Arc, RwLock as StdRwLock}; use std::time::{Duration, Instant}; use tokio::sync::RwLock; @@ -304,6 +305,146 @@ fn parse_image_key_value(content: &serde_json::Value) -> Option { } } +fn is_image_filename(path_like: &str) -> bool { + let normalized = path_like + .split('?') + .next() + .unwrap_or(path_like) + .split('#') + .next() + .unwrap_or(path_like) + .to_ascii_lowercase(); + + normalized.ends_with(".png") + || normalized.ends_with(".jpg") + || normalized.ends_with(".jpeg") + || normalized.ends_with(".gif") + || normalized.ends_with(".webp") + || normalized.ends_with(".bmp") + || normalized.ends_with(".heic") + || normalized.ends_with(".heif") + || normalized.ends_with(".svg") +} + +fn parse_image_marker_line(line: &str) -> Option<&str> { + let trimmed = line.trim(); + let marker = trimmed.strip_prefix("[IMAGE:")?.strip_suffix(']')?.trim(); + if marker.is_empty() { + return None; + } + Some(marker) +} + +fn is_data_image_uri(target: &str) -> bool { + let lower = target.trim().to_ascii_lowercase(); + lower.starts_with("data:image/") && lower.contains(";base64,") +} + +fn extract_local_image_path_line(line: &str) -> Option { + let trimmed = line.trim(); + if trimmed.is_empty() { + return None; + } + + let candidate = trimmed.trim_matches(|c| matches!(c, '`' | '"' | '\'')); + let candidate = candidate.strip_prefix("file://").unwrap_or(candidate); + if candidate.is_empty() || candidate.contains('\0') { + return None; + } + + if !is_image_filename(candidate) { + return None; + } + + let path = Path::new(candidate); + if !path.is_file() { + return None; + } + + Some(candidate.to_string()) +} + +fn parse_outgoing_content(content: &str) -> (String, Vec) { + let mut text_lines = Vec::new(); + let mut image_targets = Vec::new(); + + for line in content.lines() { + if let Some(marker_target) = parse_image_marker_line(line) { + image_targets.push(marker_target.to_string()); + continue; + } + + let trimmed = line.trim(); + if is_data_image_uri(trimmed) { + image_targets.push(trimmed.to_string()); + continue; + } + + if let Some(local_path) = extract_local_image_path_line(line) { + image_targets.push(local_path); + continue; + } + + text_lines.push(line); + } + + (text_lines.join("\n").trim().to_string(), image_targets) +} + +fn decode_data_image_uri(source: &str) -> anyhow::Result<(Vec, String)> { + let trimmed = source.trim(); + let (header, payload) = trimmed + .split_once(',') + .ok_or_else(|| anyhow::anyhow!("invalid data URI: missing comma separator"))?; + + let lower_header = header.to_ascii_lowercase(); + if !lower_header.starts_with("data:image/") { + anyhow::bail!("unsupported data URI mime (expected image/*): {header}"); + } + if !lower_header.contains(";base64") { + anyhow::bail!("unsupported data URI encoding (expected base64): {header}"); + } + + let mime = header + .trim_start_matches("data:") + .split(';') + .next() + .unwrap_or("image/png") + .trim() + .to_ascii_lowercase(); + + let bytes = base64::engine::general_purpose::STANDARD + .decode(payload.trim()) + .map_err(|e| anyhow::anyhow!("invalid data URI base64 payload: {e}"))?; + if bytes.is_empty() { + anyhow::bail!("image payload is empty"); + } + + Ok((bytes, mime)) +} + +fn image_extension_from_mime(mime: &str) -> &'static str { + match mime { + "image/jpeg" => "jpg", + "image/gif" => "gif", + "image/webp" => "webp", + "image/bmp" => "bmp", + "image/svg+xml" => "svg", + "image/heic" => "heic", + "image/heif" => "heif", + _ => "png", + } +} + +fn display_image_target(target: &str) -> String { + let trimmed = target.trim(); + if is_data_image_uri(trimmed) { + "[inline image data]".to_string() + } else { + trimmed.to_string() + } +} + fn extract_lark_token_ttl_seconds(body: &serde_json::Value) -> u64 { let ttl = body .get("expire") @@ -1207,6 +1348,256 @@ impl LarkChannel { } } + fn image_upload_url(&self) -> String { + format!("{}/im/v1/images", self.api_base()) + } + + async fn send_image_once( + &self, + url: &str, + token: &str, + recipient: &str, + image_key: &str, + ) -> anyhow::Result<(reqwest::StatusCode, serde_json::Value)> { + let content = serde_json::json!({ "image_key": image_key }).to_string(); + let body = serde_json::json!({ + "receive_id": recipient, + "msg_type": "image", + "content": content, + }); + + self.send_text_once(url, token, &body).await + } + + async fn upload_image_once( + &self, + url: &str, + token: &str, + bytes: Vec, + file_name: &str, + ) -> anyhow::Result<(reqwest::StatusCode, serde_json::Value)> { + let part = reqwest::multipart::Part::bytes(bytes).file_name(file_name.to_string()); + let form = reqwest::multipart::Form::new() + .text("image_type", "message") + .part("image", part); + + let resp = self + .http_client() + .post(url) + .header("Authorization", format!("Bearer {token}")) + .multipart(form) + .send() + .await?; + let status = resp.status(); + let raw = resp.text().await.unwrap_or_default(); + let parsed = serde_json::from_str::(&raw) + .unwrap_or_else(|_| serde_json::json!({ "raw": raw })); + Ok((status, parsed)) + } + + async fn resolve_outgoing_image_target( + &self, + target: &str, + ) -> anyhow::Result<(Vec, String, String)> { + let trimmed = target.trim(); + + if is_data_image_uri(trimmed) { + let (bytes, mime) = decode_data_image_uri(trimmed)?; + let ext = image_extension_from_mime(&mime); + return Ok((bytes, format!("image.{ext}"), mime)); + } + + if trimmed.starts_with("http://") || trimmed.starts_with("https://") { + let resp = self.http_client().get(trimmed).send().await?; + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + let sanitized = crate::providers::sanitize_api_error(&body); + anyhow::bail!( + "failed to fetch remote image {trimmed}: status={status}, body={sanitized}" + ); + } + + let content_type = resp + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.split(';').next()) + .map(str::trim) + .map(|value| value.to_ascii_lowercase()); + + let path_like = trimmed + .split('?') + .next() + .unwrap_or(trimmed) + .split('#') + .next() + .unwrap_or(trimmed); + let guessed_mime = mime_guess::from_path(path_like) + .first_raw() + .unwrap_or("image/png") + .to_string(); + + let mime = content_type.unwrap_or(guessed_mime); + if !mime.starts_with("image/") { + anyhow::bail!("remote target is not an image: {trimmed}"); + } + + let file_name = path_like + .rsplit('/') + .next() + .filter(|value| !value.trim().is_empty()) + .map(ToOwned::to_owned) + .unwrap_or_else(|| format!("image.{}", image_extension_from_mime(&mime))); + + let bytes = resp.bytes().await?.to_vec(); + if bytes.is_empty() { + anyhow::bail!("remote image payload is empty: {trimmed}"); + } + + return Ok((bytes, file_name, mime)); + } + + let local_path = trimmed.strip_prefix("file://").unwrap_or(trimmed); + let path = Path::new(local_path); + if !path.is_file() { + anyhow::bail!("local image path not found: {local_path}"); + } + + let mime = mime_guess::from_path(path) + .first_raw() + .unwrap_or("image/png") + .to_string(); + if !mime.starts_with("image/") { + anyhow::bail!("local image path is not an image: {local_path}"); + } + + let bytes = tokio::fs::read(path) + .await + .map_err(|e| anyhow::anyhow!("failed to read local image {local_path}: {e}"))?; + if bytes.is_empty() { + anyhow::bail!("local image payload is empty: {local_path}"); + } + + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .filter(|name| !name.trim().is_empty()) + .map(ToOwned::to_owned) + .unwrap_or_else(|| format!("image.{}", image_extension_from_mime(&mime))); + + Ok((bytes, file_name, mime)) + } + + async fn send_text_with_retry( + &self, + url: &str, + body: &serde_json::Value, + ) -> anyhow::Result<()> { + let token = self.get_tenant_access_token().await?; + let (status, response) = self.send_text_once(url, &token, body).await?; + + if should_refresh_lark_tenant_token(status, &response) { + self.invalidate_token().await; + let new_token = self.get_tenant_access_token().await?; + let (retry_status, retry_response) = self.send_text_once(url, &new_token, body).await?; + + if should_refresh_lark_tenant_token(retry_status, &retry_response) { + let sanitized = sanitize_lark_body(&retry_response); + anyhow::bail!( + "Lark send failed after token refresh: status={retry_status}, body={sanitized}" + ); + } + + ensure_lark_send_success(retry_status, &retry_response, "after token refresh")?; + return Ok(()); + } + + ensure_lark_send_success(status, &response, "without token refresh")?; + Ok(()) + } + + async fn send_image_target_with_retry( + &self, + message_url: &str, + recipient: &str, + image_target: &str, + ) -> anyhow::Result<()> { + let upload_url = self.image_upload_url(); + let (image_bytes, file_name, _mime) = + self.resolve_outgoing_image_target(image_target).await?; + + let mut token = self.get_tenant_access_token().await?; + let (status, mut upload_response) = self + .upload_image_once(&upload_url, &token, image_bytes.clone(), &file_name) + .await?; + + if should_refresh_lark_tenant_token(status, &upload_response) { + self.invalidate_token().await; + token = self.get_tenant_access_token().await?; + let (retry_status, retry_response) = self + .upload_image_once(&upload_url, &token, image_bytes, &file_name) + .await?; + upload_response = retry_response; + + if should_refresh_lark_tenant_token(retry_status, &upload_response) { + let sanitized = sanitize_lark_body(&upload_response); + anyhow::bail!( + "Lark image upload failed after token refresh: status={retry_status}, body={sanitized}" + ); + } + + ensure_lark_send_success( + retry_status, + &upload_response, + "image upload after token refresh", + )?; + } else { + ensure_lark_send_success( + status, + &upload_response, + "image upload without token refresh", + )?; + } + + let image_key = upload_response + .pointer("/data/image_key") + .and_then(|value| value.as_str()) + .map(str::trim) + .filter(|value| !value.is_empty()) + .ok_or_else(|| anyhow::anyhow!("Lark image upload response missing data.image_key"))?; + + let (send_status, send_response) = self + .send_image_once(message_url, &token, recipient, image_key) + .await?; + if should_refresh_lark_tenant_token(send_status, &send_response) { + self.invalidate_token().await; + let new_token = self.get_tenant_access_token().await?; + let (retry_status, retry_response) = self + .send_image_once(message_url, &new_token, recipient, image_key) + .await?; + if should_refresh_lark_tenant_token(retry_status, &retry_response) { + let sanitized = sanitize_lark_body(&retry_response); + anyhow::bail!( + "Lark image send failed after token refresh: status={retry_status}, body={sanitized}" + ); + } + ensure_lark_send_success( + retry_status, + &retry_response, + "image send after token refresh", + )?; + return Ok(()); + } + + ensure_lark_send_success( + send_status, + &send_response, + "image send without token refresh", + )?; + Ok(()) + } + async fn send_text_once( &self, url: &str, @@ -1509,37 +1900,41 @@ impl Channel for LarkChannel { } async fn send(&self, message: &SendMessage) -> anyhow::Result<()> { - let token = self.get_tenant_access_token().await?; let url = self.send_message_url(); + let (text_content, image_targets) = parse_outgoing_content(&message.content); - let content = serde_json::json!({ "text": message.content }).to_string(); - let body = serde_json::json!({ - "receive_id": message.recipient, - "msg_type": "text", - "content": content, - }); - - let (status, response) = self.send_text_once(&url, &token, &body).await?; - - if should_refresh_lark_tenant_token(status, &response) { - // Token expired/invalid, invalidate and retry once. - self.invalidate_token().await; - let new_token = self.get_tenant_access_token().await?; - let (retry_status, retry_response) = - self.send_text_once(&url, &new_token, &body).await?; - - if should_refresh_lark_tenant_token(retry_status, &retry_response) { - let sanitized = sanitize_lark_body(&retry_response); - anyhow::bail!( - "Lark send failed after token refresh: status={retry_status}, body={sanitized}" - ); - } - - ensure_lark_send_success(retry_status, &retry_response, "after token refresh")?; - return Ok(()); + if !text_content.is_empty() { + let content = serde_json::json!({ "text": text_content }).to_string(); + let body = serde_json::json!({ + "receive_id": message.recipient, + "msg_type": "text", + "content": content, + }); + self.send_text_with_retry(&url, &body).await?; + } + + for image_target in image_targets { + if let Err(err) = self + .send_image_target_with_retry(&url, &message.recipient, &image_target) + .await + { + tracing::warn!( + "Lark image send failed for target '{}': {err}", + display_image_target(&image_target) + ); + let fallback = serde_json::json!({ + "text": format!("Image: {}", display_image_target(&image_target)) + }) + .to_string(); + let body = serde_json::json!({ + "receive_id": message.recipient, + "msg_type": "text", + "content": fallback, + }); + let _ = self.send_text_with_retry(&url, &body).await; + } } - ensure_lark_send_success(status, &response, "without token refresh")?; Ok(()) } @@ -2117,6 +2512,38 @@ mod tests { assert_eq!(ch.name(), "lark"); } + #[test] + fn lark_parse_outgoing_content_extracts_image_markers_and_local_path_lines() { + let temp = tempfile::tempdir().expect("temp dir"); + let image_path = temp.path().join("capture.png"); + std::fs::write(&image_path, b"png-bytes").expect("write image"); + + let input = format!( + "处理好了\n[IMAGE:https://cdn.example.com/a.png]\n{}\n/path/does/not/exist.png", + image_path.display() + ); + let (text, images) = parse_outgoing_content(&input); + + assert_eq!(text, "处理好了\n/path/does/not/exist.png"); + assert_eq!( + images, + vec![ + "https://cdn.example.com/a.png".to_string(), + image_path.display().to_string() + ] + ); + } + + #[test] + fn lark_parse_outgoing_content_extracts_data_uri_lines() { + let data_uri = "data:image/png;base64,aGVsbG8="; + let input = format!("这是一张图\n{data_uri}"); + let (text, images) = parse_outgoing_content(&input); + + assert_eq!(text, "这是一张图"); + assert_eq!(images, vec![data_uri.to_string()]); + } + #[test] fn lark_ws_activity_refreshes_heartbeat_watchdog() { assert!(should_refresh_last_recv(&WsMsg::Binary( diff --git a/src/channels/mod.rs b/src/channels/mod.rs index 47f7c4d8e..a717a7d80 100644 --- a/src/channels/mod.rs +++ b/src/channels/mod.rs @@ -543,6 +543,24 @@ fn channel_delivery_instructions(channel_name: &str) -> Option<&'static str> { - You can combine text and media in one response — text is sent first, then each attachment.\n\ - Use tool results silently: answer the latest user message directly, and do not narrate delayed/internal tool execution bookkeeping.", ), + "lark" | "feishu" => Some( + "When responding on Lark/Feishu:\n\ + - For image attachments, use markers: [IMAGE:]\n\ + - Keep normal text outside markers and never wrap markers in code fences.\n\ + - Prefer one marker per line to keep delivery deterministic.\n\ + - If you include both text and images, put text first, then image markers.\n\ + - Be concise and direct. Skip filler phrases.\n\ + - Use tool results silently: answer the latest user message directly, and do not narrate delayed/internal tool execution bookkeeping.", + ), + "qq" => Some( + "When responding on QQ:\n\ + - For image attachments, use markers: [IMAGE:]\n\ + - Keep normal text outside markers and never wrap markers in code fences.\n\ + - Prefer one marker per line to keep delivery deterministic.\n\ + - If you include both text and images, put text first, then image markers.\n\ + - Be concise and direct. Skip filler phrases.\n\ + - Use tool results silently: answer the latest user message directly, and do not narrate delayed/internal tool execution bookkeeping.", + ), "bluebubbles" => Some( "You are responding on iMessage via BlueBubbles. Always complete your research before replying — use as many tool calls as needed to get a full, accurate answer.\n\ \n\ diff --git a/src/channels/qq.rs b/src/channels/qq.rs index 23937e421..3fbf474da 100644 --- a/src/channels/qq.rs +++ b/src/channels/qq.rs @@ -1,10 +1,12 @@ use super::traits::{Channel, ChannelMessage, SendMessage}; use crate::config::schema::QQEnvironment; use async_trait::async_trait; +use base64::Engine; use futures_util::{SinkExt, StreamExt}; use ring::signature::Ed25519KeyPair; use serde_json::{json, Map, Value}; use std::collections::HashSet; +use std::path::Path; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; @@ -29,6 +31,11 @@ fn is_remote_media_url(url: &str) -> bool { trimmed.starts_with("https://") || trimmed.starts_with("http://") } +fn is_data_image_uri(target: &str) -> bool { + let lower = target.trim().to_ascii_lowercase(); + lower.starts_with("data:image/") && lower.contains(";base64,") +} + fn is_image_filename(filename: &str) -> bool { let lower = filename.to_ascii_lowercase(); lower.ends_with(".png") @@ -42,6 +49,25 @@ fn is_image_filename(filename: &str) -> bool { || lower.ends_with(".svg") } +#[derive(Debug, Clone, PartialEq, Eq)] +enum OutgoingImageTarget { + RemoteUrl(String), + LocalPath(String), + DataUri(String), +} + +impl OutgoingImageTarget { + fn display_target(&self) -> &str { + match self { + Self::RemoteUrl(url) | Self::LocalPath(url) | Self::DataUri(url) => url, + } + } + + fn is_inline_data(&self) -> bool { + matches!(self, Self::DataUri(_)) + } +} + fn extract_image_marker_from_attachment(attachment: &serde_json::Value) -> Option { let url = attachment.get("url").and_then(|u| u.as_str())?.trim(); if url.is_empty() { @@ -75,21 +101,97 @@ fn parse_image_marker_line(line: &str) -> Option<&str> { Some(marker) } -fn parse_outgoing_content(content: &str) -> (String, Vec) { +fn parse_outgoing_image_target( + candidate: &str, + allow_extensionless_remote_url: bool, +) -> Option { + let trimmed = candidate.trim(); + if trimmed.is_empty() || trimmed.contains('\0') { + return None; + } + + let normalized = trimmed.trim_matches(|c| matches!(c, '`' | '"' | '\'')); + let normalized = normalized.strip_prefix("file://").unwrap_or(normalized); + if normalized.is_empty() { + return None; + } + + if is_data_image_uri(normalized) { + return Some(OutgoingImageTarget::DataUri(normalized.to_string())); + } + + if is_remote_media_url(normalized) { + if allow_extensionless_remote_url || is_image_filename(normalized) { + return Some(OutgoingImageTarget::RemoteUrl(normalized.to_string())); + } + return None; + } + + if !is_image_filename(normalized) { + return None; + } + + let path = Path::new(normalized); + if !path.is_file() { + return None; + } + + Some(OutgoingImageTarget::LocalPath(normalized.to_string())) +} + +fn parse_outgoing_content(content: &str) -> (String, Vec) { let mut passthrough_lines = Vec::new(); - let mut image_urls = Vec::new(); + let mut image_targets = Vec::new(); for line in content.lines() { if let Some(marker_target) = parse_image_marker_line(line) { - if is_remote_media_url(marker_target) { - image_urls.push(marker_target.to_string()); + if let Some(parsed) = parse_outgoing_image_target(marker_target, true) { + image_targets.push(parsed); continue; } } + + if let Some(parsed) = parse_outgoing_image_target(line, false) { + if matches!( + parsed, + OutgoingImageTarget::LocalPath(_) | OutgoingImageTarget::DataUri(_) + ) { + image_targets.push(parsed); + continue; + } + } + passthrough_lines.push(line); } - (passthrough_lines.join("\n").trim().to_string(), image_urls) + ( + passthrough_lines.join("\n").trim().to_string(), + image_targets, + ) +} + +fn decode_data_image_payload(data_uri: &str) -> anyhow::Result { + let trimmed = data_uri.trim(); + let (header, payload) = trimmed + .split_once(',') + .ok_or_else(|| anyhow::anyhow!("invalid data URI: missing comma separator"))?; + + let lower_header = header.to_ascii_lowercase(); + if !lower_header.starts_with("data:image/") { + anyhow::bail!("unsupported data URI mime (expected image/*): {header}"); + } + if !lower_header.contains(";base64") { + anyhow::bail!("unsupported data URI encoding (expected base64): {header}"); + } + + let decoded = base64::engine::general_purpose::STANDARD + .decode(payload.trim()) + .map_err(|e| anyhow::anyhow!("invalid data URI base64 payload: {e}"))?; + if decoded.is_empty() { + anyhow::bail!("image payload is empty"); + } + + Ok(base64::engine::general_purpose::STANDARD.encode(decoded)) } fn compose_message_content(payload: &serde_json::Value) -> Option { @@ -480,6 +582,48 @@ impl QQChannel { Ok(file_info.to_string()) } + async fn upload_media_file_data( + &self, + token: &str, + files_url: &str, + file_data_base64: &str, + ) -> anyhow::Result { + ensure_https(files_url)?; + + let upload_body = json!({ + "file_type": 1, + "file_data": file_data_base64, + "srv_send_msg": false + }); + + let resp = self + .http_client() + .post(files_url) + .header("Authorization", format!("QQBot {token}")) + .json(&upload_body) + .send() + .await?; + + if !resp.status().is_success() { + let status = resp.status(); + let err = resp.text().await.unwrap_or_default(); + let sanitized = crate::providers::sanitize_api_error(&err); + anyhow::bail!("QQ upload media(file_data) failed ({status}): {sanitized}"); + } + + let payload: Value = resp.json().await?; + let file_info = payload + .get("file_info") + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) + .ok_or_else(|| { + anyhow::anyhow!("QQ upload media(file_data) response missing file_info") + })?; + + Ok(file_info.to_string()) + } + /// Fetch an access token from QQ's OAuth2 endpoint. async fn fetch_access_token(&self) -> anyhow::Result<(String, u64)> { let body = json!({ @@ -627,15 +771,68 @@ impl Channel for QQChannel { } } - for image_url in image_urls { - let file_info = self - .upload_media_file_info(&token, &files_url, &image_url) - .await?; - let media_body = build_media_message_body(&file_info, passive_msg_id, msg_seq); - self.post_json(&token, &message_url, &media_body, "send message") - .await?; - if passive_msg_id.is_some() { - msg_seq += 1; + for image_target in image_urls { + let file_info = match &image_target { + OutgoingImageTarget::RemoteUrl(image_url) => { + self.upload_media_file_info(&token, &files_url, image_url) + .await + } + OutgoingImageTarget::LocalPath(path) => match tokio::fs::read(path).await { + Ok(bytes) => { + if bytes.is_empty() { + Err(anyhow::anyhow!("QQ local image payload is empty: {path}")) + } else { + let encoded = base64::engine::general_purpose::STANDARD.encode(bytes); + self.upload_media_file_data(&token, &files_url, &encoded) + .await + } + } + Err(e) => Err(anyhow::anyhow!("QQ local image read failed ({path}): {e}")), + }, + OutgoingImageTarget::DataUri(data_uri) => { + match decode_data_image_payload(data_uri) { + Ok(encoded) => { + self.upload_media_file_data(&token, &files_url, &encoded) + .await + } + Err(err) => Err(err), + } + } + }; + + match file_info { + Ok(file_info) => { + let media_body = build_media_message_body(&file_info, passive_msg_id, msg_seq); + self.post_json(&token, &message_url, &media_body, "send message") + .await?; + if passive_msg_id.is_some() { + msg_seq += 1; + } + } + Err(err) => { + tracing::warn!( + "QQ: failed to upload image target '{}': {err}", + if image_target.is_inline_data() { + "[inline image data]" + } else { + image_target.display_target() + } + ); + let fallback_text = if image_target.is_inline_data() { + "Image attachment upload failed".to_string() + } else { + format!("Image: {}", image_target.display_target()) + }; + if let Some(body) = + build_text_message_body(&fallback_text, passive_msg_id, msg_seq) + { + self.post_json(&token, &message_url, &body, "send message") + .await?; + if passive_msg_id.is_some() { + msg_seq += 1; + } + } + } } } @@ -1073,12 +1270,26 @@ allowed_users = ["user1"] assert_eq!( images, vec![ - "https://cdn.example.com/a.png".to_string(), - "http://cdn.example.com/b.jpg".to_string() + OutgoingImageTarget::RemoteUrl("https://cdn.example.com/a.png".to_string()), + OutgoingImageTarget::RemoteUrl("http://cdn.example.com/b.jpg".to_string()) ] ); } + #[test] + fn test_parse_outgoing_content_accepts_marker_remote_url_without_extension() { + let input = "hello\n[IMAGE:https://multimedia.nt.qq.com.cn/download?appid=1406]\nbye"; + let (text, images) = parse_outgoing_content(input); + + assert_eq!(text, "hello\nbye"); + assert_eq!( + images, + vec![OutgoingImageTarget::RemoteUrl( + "https://multimedia.nt.qq.com.cn/download?appid=1406".to_string() + )] + ); + } + #[test] fn test_parse_outgoing_content_keeps_non_remote_image_marker_as_text() { let input = "[IMAGE:/tmp/a.png]\nhello"; @@ -1088,6 +1299,38 @@ allowed_users = ["user1"] assert!(images.is_empty()); } + #[test] + fn test_parse_outgoing_content_extracts_existing_local_path_lines() { + let temp = tempfile::tempdir().expect("temp dir"); + let local_path = temp.path().join("capture.png"); + std::fs::write(&local_path, b"png-bytes").expect("write local image"); + + let input = format!("done\n{}\nnext", local_path.display()); + let (text, images) = parse_outgoing_content(&input); + + assert_eq!(text, "done\nnext"); + assert_eq!( + images, + vec![OutgoingImageTarget::LocalPath( + local_path.display().to_string() + )] + ); + } + + #[test] + fn test_parse_outgoing_content_extracts_data_uri_markers() { + let input = "hello\n[IMAGE:data:image/png;base64,aGVsbG8=]\nbye"; + let (text, images) = parse_outgoing_content(input); + + assert_eq!(text, "hello\nbye"); + assert_eq!( + images, + vec![OutgoingImageTarget::DataUri( + "data:image/png;base64,aGVsbG8=".to_string() + )] + ); + } + #[test] fn test_build_text_message_body_with_passive_fields() { let body = build_text_message_body("hello", Some("msg-123"), 2).expect("text body"); diff --git a/src/config/schema.rs b/src/config/schema.rs index 451bcaf94..2f5fc3e97 100644 --- a/src/config/schema.rs +++ b/src/config/schema.rs @@ -139,6 +139,7 @@ const SUPPORTED_PROXY_SERVICE_KEYS: &[&str] = &[ "tool.browser", "tool.composio", "tool.http_request", + "tool.multimodal", "tool.pushover", "memory.embeddings", "tunnel.custom", diff --git a/src/gateway/mod.rs b/src/gateway/mod.rs index 8dbfd574a..4d2757441 100644 --- a/src/gateway/mod.rs +++ b/src/gateway/mod.rs @@ -1068,9 +1068,16 @@ async fn prepare_gateway_messages_for_provider( messages.push(ChatMessage::system(system_prompt)); messages.extend(user_messages); - let multimodal_config = state.config.lock().multimodal.clone(); - let prepared = - crate::multimodal::prepare_messages_for_provider(&messages, &multimodal_config).await?; + let (multimodal_config, provider_hint) = { + let config = state.config.lock(); + (config.multimodal.clone(), config.default_provider.clone()) + }; + let prepared = crate::multimodal::prepare_messages_for_provider_with_provider_hint( + &messages, + &multimodal_config, + provider_hint.as_deref(), + ) + .await?; Ok(prepared.messages) } diff --git a/src/multimodal.rs b/src/multimodal.rs index 50722dc6b..9d43a629d 100644 --- a/src/multimodal.rs +++ b/src/multimodal.rs @@ -8,6 +8,10 @@ use std::path::Path; const IMAGE_MARKER_PREFIX: &str = "[IMAGE:"; const OPTIMIZED_IMAGE_MAX_DIMENSION: u32 = 512; const OPTIMIZED_IMAGE_TARGET_BYTES: usize = 256 * 1024; +const REMOTE_FETCH_MULTIMODAL_SERVICE_KEY: &str = "tool.multimodal"; +const REMOTE_FETCH_TOOL_SERVICE_KEY: &str = "tool.http_request"; +const REMOTE_FETCH_QQ_SERVICE_KEY: &str = "channel.qq"; +const REMOTE_FETCH_LEGACY_SERVICE_KEY: &str = "provider.ollama"; const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &[ "image/png", "image/jpeg", @@ -118,6 +122,14 @@ pub fn extract_ollama_image_payload(image_ref: &str) -> Option { pub async fn prepare_messages_for_provider( messages: &[ChatMessage], config: &MultimodalConfig, +) -> anyhow::Result { + prepare_messages_for_provider_with_provider_hint(messages, config, None).await +} + +pub async fn prepare_messages_for_provider_with_provider_hint( + messages: &[ChatMessage], + config: &MultimodalConfig, + provider_hint: Option<&str>, ) -> anyhow::Result { let (max_images, max_image_size_mb) = config.effective_limits(); let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024); @@ -138,8 +150,6 @@ pub async fn prepare_messages_for_provider( }); } - let remote_client = build_runtime_proxy_client_with_timeouts("provider.ollama", 30, 10); - let mut normalized_messages = Vec::with_capacity(messages.len()); for message in messages { if message.role != "user" { @@ -156,7 +166,7 @@ pub async fn prepare_messages_for_provider( let mut normalized_refs = Vec::with_capacity(refs.len()); for reference in refs { let data_uri = - normalize_image_reference(&reference, config, max_bytes, &remote_client).await?; + normalize_image_reference(&reference, config, max_bytes, provider_hint).await?; normalized_refs.push(data_uri); } @@ -198,7 +208,7 @@ async fn normalize_image_reference( source: &str, config: &MultimodalConfig, max_bytes: usize, - remote_client: &Client, + provider_hint: Option<&str>, ) -> anyhow::Result { if source.starts_with("data:") { return normalize_data_uri(source, max_bytes).await; @@ -212,7 +222,7 @@ async fn normalize_image_reference( .into()); } - return normalize_remote_image(source, max_bytes, remote_client).await; + return normalize_remote_image(source, max_bytes, provider_hint).await; } normalize_local_image(source, max_bytes).await @@ -266,24 +276,59 @@ async fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result, +) -> anyhow::Result { + let service_keys = build_remote_fetch_service_keys(source, provider_hint); + let mut failures = Vec::new(); + + for service_key in service_keys { + let client = build_runtime_proxy_client_with_timeouts(&service_key, 30, 10); + match normalize_remote_image_once(source, max_bytes, &client).await { + Ok(normalized) => return Ok(normalized), + Err(error) => { + let reason = error.to_string(); + tracing::debug!( + service_key = %service_key, + source = %source, + "multimodal remote fetch attempt failed: {reason}" + ); + failures.push(format!("{service_key}: {reason}")); + } + } + } + + Err(MultimodalError::RemoteFetchFailed { + input: source.to_string(), + reason: format!( + "{}; hint: when proxy.scope='services', include one of channel.qq/tool.multimodal/tool.http_request/provider.* as needed", + failures.join(" | ") + ), + } + .into()) +} + +async fn normalize_remote_image_once( source: &str, max_bytes: usize, remote_client: &Client, ) -> anyhow::Result { - let response = remote_client.get(source).send().await.map_err(|error| { - MultimodalError::RemoteFetchFailed { - input: source.to_string(), - reason: error.to_string(), - } - })?; + let mut request = remote_client + .get(source) + .header(reqwest::header::USER_AGENT, "ZeroClaw/1.0"); + if source_looks_like_qq_media(source) { + request = request.header(reqwest::header::REFERER, "https://qq.com/"); + } + + let response = request + .send() + .await + .map_err(|error| anyhow::anyhow!("error sending request for url ({source}): {error}"))?; let status = response.status(); if !status.is_success() { - return Err(MultimodalError::RemoteFetchFailed { - input: source.to_string(), - reason: format!("HTTP {status}"), - } - .into()); + anyhow::bail!("HTTP {status}"); } if let Some(content_length) = response.content_length() { @@ -300,10 +345,7 @@ async fn normalize_remote_image( let bytes = response .bytes() .await - .map_err(|error| MultimodalError::RemoteFetchFailed { - input: source.to_string(), - reason: error.to_string(), - })?; + .map_err(|error| anyhow::anyhow!("failed to read response body: {error}"))?; validate_size(source, bytes.len(), max_bytes)?; @@ -325,6 +367,72 @@ async fn normalize_remote_image( )) } +fn normalize_provider_service_key_hint(provider_hint: Option<&str>) -> Option { + let raw = provider_hint + .map(str::trim) + .filter(|candidate| !candidate.is_empty())? + .split('#') + .next() + .unwrap_or_default() + .trim() + .to_ascii_lowercase(); + + if raw.is_empty() { + return None; + } + + let candidate = if raw.starts_with("provider.") { + raw + } else { + format!("provider.{raw}") + }; + + if !candidate + .chars() + .all(|ch| ch.is_ascii_lowercase() || ch.is_ascii_digit() || matches!(ch, '.' | '_' | '-')) + { + return None; + } + + Some(candidate) +} + +fn source_looks_like_qq_media(source: &str) -> bool { + let Ok(parsed) = reqwest::Url::parse(source) else { + return false; + }; + + let Some(host) = parsed.host_str() else { + return false; + }; + + let host = host.to_ascii_lowercase(); + host == "multimedia.nt.qq.com.cn" || host.ends_with(".qq.com.cn") || host.ends_with(".qq.com") +} + +fn push_service_key_once(keys: &mut Vec, key: String) { + if !key.trim().is_empty() && !keys.iter().any(|existing| existing == &key) { + keys.push(key); + } +} + +fn build_remote_fetch_service_keys(source: &str, provider_hint: Option<&str>) -> Vec { + let mut keys = Vec::new(); + + if source_looks_like_qq_media(source) { + push_service_key_once(&mut keys, REMOTE_FETCH_QQ_SERVICE_KEY.to_string()); + } + + if let Some(provider_service_key) = normalize_provider_service_key_hint(provider_hint) { + push_service_key_once(&mut keys, provider_service_key); + } + + push_service_key_once(&mut keys, REMOTE_FETCH_MULTIMODAL_SERVICE_KEY.to_string()); + push_service_key_once(&mut keys, REMOTE_FETCH_TOOL_SERVICE_KEY.to_string()); + push_service_key_once(&mut keys, REMOTE_FETCH_LEGACY_SERVICE_KEY.to_string()); + keys +} + async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result { let path = Path::new(source); if !path.exists() || !path.is_file() { @@ -681,6 +789,63 @@ mod tests { assert!(optimized_image.height() <= OPTIMIZED_IMAGE_MAX_DIMENSION); } + #[test] + fn normalize_provider_service_key_hint_builds_provider_prefix() { + assert_eq!( + normalize_provider_service_key_hint(Some("openai")), + Some("provider.openai".to_string()) + ); + assert_eq!( + normalize_provider_service_key_hint(Some("provider.gemini")), + Some("provider.gemini".to_string()) + ); + assert_eq!(normalize_provider_service_key_hint(Some(" ")), None); + assert_eq!(normalize_provider_service_key_hint(None), None); + assert_eq!( + normalize_provider_service_key_hint(Some("openai#fast-route")), + Some("provider.openai".to_string()) + ); + assert_eq!( + normalize_provider_service_key_hint(Some("provider.gemini#img")), + Some("provider.gemini".to_string()) + ); + assert_eq!( + normalize_provider_service_key_hint(Some("custom:https://api.example.com/v1")), + None + ); + } + + #[test] + fn build_remote_fetch_service_keys_prefers_qq_channel_for_qq_media_hosts() { + let keys = build_remote_fetch_service_keys( + "https://multimedia.nt.qq.com.cn/download?appid=1406", + Some("openai"), + ); + assert_eq!( + keys, + vec![ + "channel.qq".to_string(), + "provider.openai".to_string(), + "tool.multimodal".to_string(), + "tool.http_request".to_string(), + "provider.ollama".to_string(), + ] + ); + } + + #[test] + fn build_remote_fetch_service_keys_deduplicates_service_candidates() { + let keys = build_remote_fetch_service_keys("https://example.com/a.png", Some("ollama")); + assert_eq!( + keys, + vec![ + "provider.ollama".to_string(), + "tool.multimodal".to_string(), + "tool.http_request".to_string(), + ] + ); + } + #[test] fn extract_ollama_image_payload_supports_data_uris() { let payload = extract_ollama_image_payload("data:image/png;base64,abcd==") diff --git a/src/providers/openai_codex.rs b/src/providers/openai_codex.rs index 6009e66dd..aeafd20af 100644 --- a/src/providers/openai_codex.rs +++ b/src/providers/openai_codex.rs @@ -1098,7 +1098,12 @@ impl Provider for OpenAiCodexProvider { // Normalize images: convert file paths to data URIs let config = crate::config::MultimodalConfig::default(); - let prepared = crate::multimodal::prepare_messages_for_provider(&messages, &config).await?; + let prepared = crate::multimodal::prepare_messages_for_provider_with_provider_hint( + &messages, + &config, + Some("openai"), + ) + .await?; let (instructions, input) = build_responses_input(&prepared.messages); self.send_responses_request(input, instructions, model) @@ -1113,7 +1118,12 @@ impl Provider for OpenAiCodexProvider { ) -> anyhow::Result { // Normalize image markers: convert file paths to data URIs let config = crate::config::MultimodalConfig::default(); - let prepared = crate::multimodal::prepare_messages_for_provider(messages, &config).await?; + let prepared = crate::multimodal::prepare_messages_for_provider_with_provider_hint( + messages, + &config, + Some("openai"), + ) + .await?; let (instructions, input) = build_responses_input(&prepared.messages); self.send_responses_request(input, instructions, model)