fix(channels): robust qq/feishu image delivery and multimodal proxy fetch routing

This commit is contained in:
Chummy 2026-03-02 14:49:51 +08:00 committed by Chum Yin
parent f18fac5b26
commit 61398eb900
10 changed files with 952 additions and 72 deletions

View File

@ -622,6 +622,11 @@ Notes:
- Remote URL only when `allow_remote_fetch = true`
- Allowed MIME types: `image/png`, `image/jpeg`, `image/webp`, `image/gif`, `image/bmp`.
- When the active provider does not support vision, requests fail with a structured capability error (`capability=vision`) instead of silently dropping images.
- In `proxy.scope = "services"` mode, remote image fetch uses service-key routing. For best compatibility include relevant selectors/keys such as:
- `channel.qq` (QQ media hosts like `multimedia.nt.qq.com.cn`)
- `tool.multimodal` (dedicated multimodal fetch path)
- `tool.http_request` (compatibility fallback path)
- `provider.*` or the active provider key (for example `provider.openai`)
## `[browser]`

View File

@ -113,14 +113,14 @@ Use when only part of the system should use proxy (for example specific provider
### 5.1 Target specific services
```json
{"action":"set","enabled":true,"scope":"services","services":["provider.openai","tool.http_request","channel.telegram"],"all_proxy":"socks5h://127.0.0.1:1080","no_proxy":["localhost","127.0.0.1",".internal"]}
{"action":"set","enabled":true,"scope":"services","services":["provider.openai","tool.multimodal","tool.http_request","channel.telegram"],"all_proxy":"socks5h://127.0.0.1:1080","no_proxy":["localhost","127.0.0.1",".internal"]}
{"action":"get"}
```
### 5.2 Target by selectors
```json
{"action":"set","enabled":true,"scope":"services","services":["provider.*","tool.*"],"http_proxy":"http://127.0.0.1:7890"}
{"action":"set","enabled":true,"scope":"services","services":["provider.*","tool.*","channel.qq"],"http_proxy":"http://127.0.0.1:7890"}
{"action":"get"}
```

View File

@ -1146,8 +1146,12 @@ pub async fn run_tool_call_loop(
.into());
}
let prepared_messages =
multimodal::prepare_messages_for_provider(history, multimodal_config).await?;
let prepared_messages = multimodal::prepare_messages_for_provider_with_provider_hint(
history,
multimodal_config,
Some(provider_name),
)
.await?;
let mut request_messages = prepared_messages.messages.clone();
if let Some(prompt) = missing_tool_call_retry_prompt.take() {
request_messages.push(ChatMessage::user(prompt));

View File

@ -5,6 +5,7 @@ use base64::Engine;
use futures_util::{SinkExt, StreamExt};
use prost::Message as ProstMessage;
use std::collections::HashMap;
use std::path::Path;
use std::sync::{Arc, RwLock as StdRwLock};
use std::time::{Duration, Instant};
use tokio::sync::RwLock;
@ -304,6 +305,146 @@ fn parse_image_key_value(content: &serde_json::Value) -> Option<String> {
}
}
fn is_image_filename(path_like: &str) -> bool {
let normalized = path_like
.split('?')
.next()
.unwrap_or(path_like)
.split('#')
.next()
.unwrap_or(path_like)
.to_ascii_lowercase();
normalized.ends_with(".png")
|| normalized.ends_with(".jpg")
|| normalized.ends_with(".jpeg")
|| normalized.ends_with(".gif")
|| normalized.ends_with(".webp")
|| normalized.ends_with(".bmp")
|| normalized.ends_with(".heic")
|| normalized.ends_with(".heif")
|| normalized.ends_with(".svg")
}
fn parse_image_marker_line(line: &str) -> Option<&str> {
let trimmed = line.trim();
let marker = trimmed.strip_prefix("[IMAGE:")?.strip_suffix(']')?.trim();
if marker.is_empty() {
return None;
}
Some(marker)
}
fn is_data_image_uri(target: &str) -> bool {
let lower = target.trim().to_ascii_lowercase();
lower.starts_with("data:image/") && lower.contains(";base64,")
}
fn extract_local_image_path_line(line: &str) -> Option<String> {
let trimmed = line.trim();
if trimmed.is_empty() {
return None;
}
let candidate = trimmed.trim_matches(|c| matches!(c, '`' | '"' | '\''));
let candidate = candidate.strip_prefix("file://").unwrap_or(candidate);
if candidate.is_empty() || candidate.contains('\0') {
return None;
}
if !is_image_filename(candidate) {
return None;
}
let path = Path::new(candidate);
if !path.is_file() {
return None;
}
Some(candidate.to_string())
}
fn parse_outgoing_content(content: &str) -> (String, Vec<String>) {
let mut text_lines = Vec::new();
let mut image_targets = Vec::new();
for line in content.lines() {
if let Some(marker_target) = parse_image_marker_line(line) {
image_targets.push(marker_target.to_string());
continue;
}
let trimmed = line.trim();
if is_data_image_uri(trimmed) {
image_targets.push(trimmed.to_string());
continue;
}
if let Some(local_path) = extract_local_image_path_line(line) {
image_targets.push(local_path);
continue;
}
text_lines.push(line);
}
(text_lines.join("\n").trim().to_string(), image_targets)
}
fn decode_data_image_uri(source: &str) -> anyhow::Result<(Vec<u8>, String)> {
let trimmed = source.trim();
let (header, payload) = trimmed
.split_once(',')
.ok_or_else(|| anyhow::anyhow!("invalid data URI: missing comma separator"))?;
let lower_header = header.to_ascii_lowercase();
if !lower_header.starts_with("data:image/") {
anyhow::bail!("unsupported data URI mime (expected image/*): {header}");
}
if !lower_header.contains(";base64") {
anyhow::bail!("unsupported data URI encoding (expected base64): {header}");
}
let mime = header
.trim_start_matches("data:")
.split(';')
.next()
.unwrap_or("image/png")
.trim()
.to_ascii_lowercase();
let bytes = base64::engine::general_purpose::STANDARD
.decode(payload.trim())
.map_err(|e| anyhow::anyhow!("invalid data URI base64 payload: {e}"))?;
if bytes.is_empty() {
anyhow::bail!("image payload is empty");
}
Ok((bytes, mime))
}
fn image_extension_from_mime(mime: &str) -> &'static str {
match mime {
"image/jpeg" => "jpg",
"image/gif" => "gif",
"image/webp" => "webp",
"image/bmp" => "bmp",
"image/svg+xml" => "svg",
"image/heic" => "heic",
"image/heif" => "heif",
_ => "png",
}
}
fn display_image_target(target: &str) -> String {
let trimmed = target.trim();
if is_data_image_uri(trimmed) {
"[inline image data]".to_string()
} else {
trimmed.to_string()
}
}
fn extract_lark_token_ttl_seconds(body: &serde_json::Value) -> u64 {
let ttl = body
.get("expire")
@ -1207,6 +1348,256 @@ impl LarkChannel {
}
}
fn image_upload_url(&self) -> String {
format!("{}/im/v1/images", self.api_base())
}
async fn send_image_once(
&self,
url: &str,
token: &str,
recipient: &str,
image_key: &str,
) -> anyhow::Result<(reqwest::StatusCode, serde_json::Value)> {
let content = serde_json::json!({ "image_key": image_key }).to_string();
let body = serde_json::json!({
"receive_id": recipient,
"msg_type": "image",
"content": content,
});
self.send_text_once(url, token, &body).await
}
async fn upload_image_once(
&self,
url: &str,
token: &str,
bytes: Vec<u8>,
file_name: &str,
) -> anyhow::Result<(reqwest::StatusCode, serde_json::Value)> {
let part = reqwest::multipart::Part::bytes(bytes).file_name(file_name.to_string());
let form = reqwest::multipart::Form::new()
.text("image_type", "message")
.part("image", part);
let resp = self
.http_client()
.post(url)
.header("Authorization", format!("Bearer {token}"))
.multipart(form)
.send()
.await?;
let status = resp.status();
let raw = resp.text().await.unwrap_or_default();
let parsed = serde_json::from_str::<serde_json::Value>(&raw)
.unwrap_or_else(|_| serde_json::json!({ "raw": raw }));
Ok((status, parsed))
}
async fn resolve_outgoing_image_target(
&self,
target: &str,
) -> anyhow::Result<(Vec<u8>, String, String)> {
let trimmed = target.trim();
if is_data_image_uri(trimmed) {
let (bytes, mime) = decode_data_image_uri(trimmed)?;
let ext = image_extension_from_mime(&mime);
return Ok((bytes, format!("image.{ext}"), mime));
}
if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
let resp = self.http_client().get(trimmed).send().await?;
if !resp.status().is_success() {
let status = resp.status();
let body = resp.text().await.unwrap_or_default();
let sanitized = crate::providers::sanitize_api_error(&body);
anyhow::bail!(
"failed to fetch remote image {trimmed}: status={status}, body={sanitized}"
);
}
let content_type = resp
.headers()
.get(reqwest::header::CONTENT_TYPE)
.and_then(|value| value.to_str().ok())
.and_then(|value| value.split(';').next())
.map(str::trim)
.map(|value| value.to_ascii_lowercase());
let path_like = trimmed
.split('?')
.next()
.unwrap_or(trimmed)
.split('#')
.next()
.unwrap_or(trimmed);
let guessed_mime = mime_guess::from_path(path_like)
.first_raw()
.unwrap_or("image/png")
.to_string();
let mime = content_type.unwrap_or(guessed_mime);
if !mime.starts_with("image/") {
anyhow::bail!("remote target is not an image: {trimmed}");
}
let file_name = path_like
.rsplit('/')
.next()
.filter(|value| !value.trim().is_empty())
.map(ToOwned::to_owned)
.unwrap_or_else(|| format!("image.{}", image_extension_from_mime(&mime)));
let bytes = resp.bytes().await?.to_vec();
if bytes.is_empty() {
anyhow::bail!("remote image payload is empty: {trimmed}");
}
return Ok((bytes, file_name, mime));
}
let local_path = trimmed.strip_prefix("file://").unwrap_or(trimmed);
let path = Path::new(local_path);
if !path.is_file() {
anyhow::bail!("local image path not found: {local_path}");
}
let mime = mime_guess::from_path(path)
.first_raw()
.unwrap_or("image/png")
.to_string();
if !mime.starts_with("image/") {
anyhow::bail!("local image path is not an image: {local_path}");
}
let bytes = tokio::fs::read(path)
.await
.map_err(|e| anyhow::anyhow!("failed to read local image {local_path}: {e}"))?;
if bytes.is_empty() {
anyhow::bail!("local image payload is empty: {local_path}");
}
let file_name = path
.file_name()
.and_then(|name| name.to_str())
.filter(|name| !name.trim().is_empty())
.map(ToOwned::to_owned)
.unwrap_or_else(|| format!("image.{}", image_extension_from_mime(&mime)));
Ok((bytes, file_name, mime))
}
async fn send_text_with_retry(
&self,
url: &str,
body: &serde_json::Value,
) -> anyhow::Result<()> {
let token = self.get_tenant_access_token().await?;
let (status, response) = self.send_text_once(url, &token, body).await?;
if should_refresh_lark_tenant_token(status, &response) {
self.invalidate_token().await;
let new_token = self.get_tenant_access_token().await?;
let (retry_status, retry_response) = self.send_text_once(url, &new_token, body).await?;
if should_refresh_lark_tenant_token(retry_status, &retry_response) {
let sanitized = sanitize_lark_body(&retry_response);
anyhow::bail!(
"Lark send failed after token refresh: status={retry_status}, body={sanitized}"
);
}
ensure_lark_send_success(retry_status, &retry_response, "after token refresh")?;
return Ok(());
}
ensure_lark_send_success(status, &response, "without token refresh")?;
Ok(())
}
async fn send_image_target_with_retry(
&self,
message_url: &str,
recipient: &str,
image_target: &str,
) -> anyhow::Result<()> {
let upload_url = self.image_upload_url();
let (image_bytes, file_name, _mime) =
self.resolve_outgoing_image_target(image_target).await?;
let mut token = self.get_tenant_access_token().await?;
let (status, mut upload_response) = self
.upload_image_once(&upload_url, &token, image_bytes.clone(), &file_name)
.await?;
if should_refresh_lark_tenant_token(status, &upload_response) {
self.invalidate_token().await;
token = self.get_tenant_access_token().await?;
let (retry_status, retry_response) = self
.upload_image_once(&upload_url, &token, image_bytes, &file_name)
.await?;
upload_response = retry_response;
if should_refresh_lark_tenant_token(retry_status, &upload_response) {
let sanitized = sanitize_lark_body(&upload_response);
anyhow::bail!(
"Lark image upload failed after token refresh: status={retry_status}, body={sanitized}"
);
}
ensure_lark_send_success(
retry_status,
&upload_response,
"image upload after token refresh",
)?;
} else {
ensure_lark_send_success(
status,
&upload_response,
"image upload without token refresh",
)?;
}
let image_key = upload_response
.pointer("/data/image_key")
.and_then(|value| value.as_str())
.map(str::trim)
.filter(|value| !value.is_empty())
.ok_or_else(|| anyhow::anyhow!("Lark image upload response missing data.image_key"))?;
let (send_status, send_response) = self
.send_image_once(message_url, &token, recipient, image_key)
.await?;
if should_refresh_lark_tenant_token(send_status, &send_response) {
self.invalidate_token().await;
let new_token = self.get_tenant_access_token().await?;
let (retry_status, retry_response) = self
.send_image_once(message_url, &new_token, recipient, image_key)
.await?;
if should_refresh_lark_tenant_token(retry_status, &retry_response) {
let sanitized = sanitize_lark_body(&retry_response);
anyhow::bail!(
"Lark image send failed after token refresh: status={retry_status}, body={sanitized}"
);
}
ensure_lark_send_success(
retry_status,
&retry_response,
"image send after token refresh",
)?;
return Ok(());
}
ensure_lark_send_success(
send_status,
&send_response,
"image send without token refresh",
)?;
Ok(())
}
async fn send_text_once(
&self,
url: &str,
@ -1509,37 +1900,41 @@ impl Channel for LarkChannel {
}
async fn send(&self, message: &SendMessage) -> anyhow::Result<()> {
let token = self.get_tenant_access_token().await?;
let url = self.send_message_url();
let (text_content, image_targets) = parse_outgoing_content(&message.content);
let content = serde_json::json!({ "text": message.content }).to_string();
let body = serde_json::json!({
"receive_id": message.recipient,
"msg_type": "text",
"content": content,
});
let (status, response) = self.send_text_once(&url, &token, &body).await?;
if should_refresh_lark_tenant_token(status, &response) {
// Token expired/invalid, invalidate and retry once.
self.invalidate_token().await;
let new_token = self.get_tenant_access_token().await?;
let (retry_status, retry_response) =
self.send_text_once(&url, &new_token, &body).await?;
if should_refresh_lark_tenant_token(retry_status, &retry_response) {
let sanitized = sanitize_lark_body(&retry_response);
anyhow::bail!(
"Lark send failed after token refresh: status={retry_status}, body={sanitized}"
);
}
ensure_lark_send_success(retry_status, &retry_response, "after token refresh")?;
return Ok(());
if !text_content.is_empty() {
let content = serde_json::json!({ "text": text_content }).to_string();
let body = serde_json::json!({
"receive_id": message.recipient,
"msg_type": "text",
"content": content,
});
self.send_text_with_retry(&url, &body).await?;
}
for image_target in image_targets {
if let Err(err) = self
.send_image_target_with_retry(&url, &message.recipient, &image_target)
.await
{
tracing::warn!(
"Lark image send failed for target '{}': {err}",
display_image_target(&image_target)
);
let fallback = serde_json::json!({
"text": format!("Image: {}", display_image_target(&image_target))
})
.to_string();
let body = serde_json::json!({
"receive_id": message.recipient,
"msg_type": "text",
"content": fallback,
});
let _ = self.send_text_with_retry(&url, &body).await;
}
}
ensure_lark_send_success(status, &response, "without token refresh")?;
Ok(())
}
@ -2117,6 +2512,38 @@ mod tests {
assert_eq!(ch.name(), "lark");
}
#[test]
fn lark_parse_outgoing_content_extracts_image_markers_and_local_path_lines() {
let temp = tempfile::tempdir().expect("temp dir");
let image_path = temp.path().join("capture.png");
std::fs::write(&image_path, b"png-bytes").expect("write image");
let input = format!(
"处理好了\n[IMAGE:https://cdn.example.com/a.png]\n{}\n/path/does/not/exist.png",
image_path.display()
);
let (text, images) = parse_outgoing_content(&input);
assert_eq!(text, "处理好了\n/path/does/not/exist.png");
assert_eq!(
images,
vec![
"https://cdn.example.com/a.png".to_string(),
image_path.display().to_string()
]
);
}
#[test]
fn lark_parse_outgoing_content_extracts_data_uri_lines() {
let data_uri = "data:image/png;base64,aGVsbG8=";
let input = format!("这是一张图\n{data_uri}");
let (text, images) = parse_outgoing_content(&input);
assert_eq!(text, "这是一张图");
assert_eq!(images, vec![data_uri.to_string()]);
}
#[test]
fn lark_ws_activity_refreshes_heartbeat_watchdog() {
assert!(should_refresh_last_recv(&WsMsg::Binary(

View File

@ -543,6 +543,24 @@ fn channel_delivery_instructions(channel_name: &str) -> Option<&'static str> {
- You can combine text and media in one response text is sent first, then each attachment.\n\
- Use tool results silently: answer the latest user message directly, and do not narrate delayed/internal tool execution bookkeeping.",
),
"lark" | "feishu" => Some(
"When responding on Lark/Feishu:\n\
- For image attachments, use markers: [IMAGE:<path-or-url-or-data-uri>]\n\
- Keep normal text outside markers and never wrap markers in code fences.\n\
- Prefer one marker per line to keep delivery deterministic.\n\
- If you include both text and images, put text first, then image markers.\n\
- Be concise and direct. Skip filler phrases.\n\
- Use tool results silently: answer the latest user message directly, and do not narrate delayed/internal tool execution bookkeeping.",
),
"qq" => Some(
"When responding on QQ:\n\
- For image attachments, use markers: [IMAGE:<path-or-url-or-data-uri>]\n\
- Keep normal text outside markers and never wrap markers in code fences.\n\
- Prefer one marker per line to keep delivery deterministic.\n\
- If you include both text and images, put text first, then image markers.\n\
- Be concise and direct. Skip filler phrases.\n\
- Use tool results silently: answer the latest user message directly, and do not narrate delayed/internal tool execution bookkeeping.",
),
"bluebubbles" => Some(
"You are responding on iMessage via BlueBubbles. Always complete your research before replying — use as many tool calls as needed to get a full, accurate answer.\n\
\n\

View File

@ -1,10 +1,12 @@
use super::traits::{Channel, ChannelMessage, SendMessage};
use crate::config::schema::QQEnvironment;
use async_trait::async_trait;
use base64::Engine;
use futures_util::{SinkExt, StreamExt};
use ring::signature::Ed25519KeyPair;
use serde_json::{json, Map, Value};
use std::collections::HashSet;
use std::path::Path;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use tokio::sync::RwLock;
@ -29,6 +31,11 @@ fn is_remote_media_url(url: &str) -> bool {
trimmed.starts_with("https://") || trimmed.starts_with("http://")
}
fn is_data_image_uri(target: &str) -> bool {
let lower = target.trim().to_ascii_lowercase();
lower.starts_with("data:image/") && lower.contains(";base64,")
}
fn is_image_filename(filename: &str) -> bool {
let lower = filename.to_ascii_lowercase();
lower.ends_with(".png")
@ -42,6 +49,25 @@ fn is_image_filename(filename: &str) -> bool {
|| lower.ends_with(".svg")
}
#[derive(Debug, Clone, PartialEq, Eq)]
enum OutgoingImageTarget {
RemoteUrl(String),
LocalPath(String),
DataUri(String),
}
impl OutgoingImageTarget {
fn display_target(&self) -> &str {
match self {
Self::RemoteUrl(url) | Self::LocalPath(url) | Self::DataUri(url) => url,
}
}
fn is_inline_data(&self) -> bool {
matches!(self, Self::DataUri(_))
}
}
fn extract_image_marker_from_attachment(attachment: &serde_json::Value) -> Option<String> {
let url = attachment.get("url").and_then(|u| u.as_str())?.trim();
if url.is_empty() {
@ -75,21 +101,97 @@ fn parse_image_marker_line(line: &str) -> Option<&str> {
Some(marker)
}
fn parse_outgoing_content(content: &str) -> (String, Vec<String>) {
fn parse_outgoing_image_target(
candidate: &str,
allow_extensionless_remote_url: bool,
) -> Option<OutgoingImageTarget> {
let trimmed = candidate.trim();
if trimmed.is_empty() || trimmed.contains('\0') {
return None;
}
let normalized = trimmed.trim_matches(|c| matches!(c, '`' | '"' | '\''));
let normalized = normalized.strip_prefix("file://").unwrap_or(normalized);
if normalized.is_empty() {
return None;
}
if is_data_image_uri(normalized) {
return Some(OutgoingImageTarget::DataUri(normalized.to_string()));
}
if is_remote_media_url(normalized) {
if allow_extensionless_remote_url || is_image_filename(normalized) {
return Some(OutgoingImageTarget::RemoteUrl(normalized.to_string()));
}
return None;
}
if !is_image_filename(normalized) {
return None;
}
let path = Path::new(normalized);
if !path.is_file() {
return None;
}
Some(OutgoingImageTarget::LocalPath(normalized.to_string()))
}
fn parse_outgoing_content(content: &str) -> (String, Vec<OutgoingImageTarget>) {
let mut passthrough_lines = Vec::new();
let mut image_urls = Vec::new();
let mut image_targets = Vec::new();
for line in content.lines() {
if let Some(marker_target) = parse_image_marker_line(line) {
if is_remote_media_url(marker_target) {
image_urls.push(marker_target.to_string());
if let Some(parsed) = parse_outgoing_image_target(marker_target, true) {
image_targets.push(parsed);
continue;
}
}
if let Some(parsed) = parse_outgoing_image_target(line, false) {
if matches!(
parsed,
OutgoingImageTarget::LocalPath(_) | OutgoingImageTarget::DataUri(_)
) {
image_targets.push(parsed);
continue;
}
}
passthrough_lines.push(line);
}
(passthrough_lines.join("\n").trim().to_string(), image_urls)
(
passthrough_lines.join("\n").trim().to_string(),
image_targets,
)
}
fn decode_data_image_payload(data_uri: &str) -> anyhow::Result<String> {
let trimmed = data_uri.trim();
let (header, payload) = trimmed
.split_once(',')
.ok_or_else(|| anyhow::anyhow!("invalid data URI: missing comma separator"))?;
let lower_header = header.to_ascii_lowercase();
if !lower_header.starts_with("data:image/") {
anyhow::bail!("unsupported data URI mime (expected image/*): {header}");
}
if !lower_header.contains(";base64") {
anyhow::bail!("unsupported data URI encoding (expected base64): {header}");
}
let decoded = base64::engine::general_purpose::STANDARD
.decode(payload.trim())
.map_err(|e| anyhow::anyhow!("invalid data URI base64 payload: {e}"))?;
if decoded.is_empty() {
anyhow::bail!("image payload is empty");
}
Ok(base64::engine::general_purpose::STANDARD.encode(decoded))
}
fn compose_message_content(payload: &serde_json::Value) -> Option<String> {
@ -480,6 +582,48 @@ impl QQChannel {
Ok(file_info.to_string())
}
async fn upload_media_file_data(
&self,
token: &str,
files_url: &str,
file_data_base64: &str,
) -> anyhow::Result<String> {
ensure_https(files_url)?;
let upload_body = json!({
"file_type": 1,
"file_data": file_data_base64,
"srv_send_msg": false
});
let resp = self
.http_client()
.post(files_url)
.header("Authorization", format!("QQBot {token}"))
.json(&upload_body)
.send()
.await?;
if !resp.status().is_success() {
let status = resp.status();
let err = resp.text().await.unwrap_or_default();
let sanitized = crate::providers::sanitize_api_error(&err);
anyhow::bail!("QQ upload media(file_data) failed ({status}): {sanitized}");
}
let payload: Value = resp.json().await?;
let file_info = payload
.get("file_info")
.and_then(Value::as_str)
.map(str::trim)
.filter(|value| !value.is_empty())
.ok_or_else(|| {
anyhow::anyhow!("QQ upload media(file_data) response missing file_info")
})?;
Ok(file_info.to_string())
}
/// Fetch an access token from QQ's OAuth2 endpoint.
async fn fetch_access_token(&self) -> anyhow::Result<(String, u64)> {
let body = json!({
@ -627,15 +771,68 @@ impl Channel for QQChannel {
}
}
for image_url in image_urls {
let file_info = self
.upload_media_file_info(&token, &files_url, &image_url)
.await?;
let media_body = build_media_message_body(&file_info, passive_msg_id, msg_seq);
self.post_json(&token, &message_url, &media_body, "send message")
.await?;
if passive_msg_id.is_some() {
msg_seq += 1;
for image_target in image_urls {
let file_info = match &image_target {
OutgoingImageTarget::RemoteUrl(image_url) => {
self.upload_media_file_info(&token, &files_url, image_url)
.await
}
OutgoingImageTarget::LocalPath(path) => match tokio::fs::read(path).await {
Ok(bytes) => {
if bytes.is_empty() {
Err(anyhow::anyhow!("QQ local image payload is empty: {path}"))
} else {
let encoded = base64::engine::general_purpose::STANDARD.encode(bytes);
self.upload_media_file_data(&token, &files_url, &encoded)
.await
}
}
Err(e) => Err(anyhow::anyhow!("QQ local image read failed ({path}): {e}")),
},
OutgoingImageTarget::DataUri(data_uri) => {
match decode_data_image_payload(data_uri) {
Ok(encoded) => {
self.upload_media_file_data(&token, &files_url, &encoded)
.await
}
Err(err) => Err(err),
}
}
};
match file_info {
Ok(file_info) => {
let media_body = build_media_message_body(&file_info, passive_msg_id, msg_seq);
self.post_json(&token, &message_url, &media_body, "send message")
.await?;
if passive_msg_id.is_some() {
msg_seq += 1;
}
}
Err(err) => {
tracing::warn!(
"QQ: failed to upload image target '{}': {err}",
if image_target.is_inline_data() {
"[inline image data]"
} else {
image_target.display_target()
}
);
let fallback_text = if image_target.is_inline_data() {
"Image attachment upload failed".to_string()
} else {
format!("Image: {}", image_target.display_target())
};
if let Some(body) =
build_text_message_body(&fallback_text, passive_msg_id, msg_seq)
{
self.post_json(&token, &message_url, &body, "send message")
.await?;
if passive_msg_id.is_some() {
msg_seq += 1;
}
}
}
}
}
@ -1073,12 +1270,26 @@ allowed_users = ["user1"]
assert_eq!(
images,
vec![
"https://cdn.example.com/a.png".to_string(),
"http://cdn.example.com/b.jpg".to_string()
OutgoingImageTarget::RemoteUrl("https://cdn.example.com/a.png".to_string()),
OutgoingImageTarget::RemoteUrl("http://cdn.example.com/b.jpg".to_string())
]
);
}
#[test]
fn test_parse_outgoing_content_accepts_marker_remote_url_without_extension() {
let input = "hello\n[IMAGE:https://multimedia.nt.qq.com.cn/download?appid=1406]\nbye";
let (text, images) = parse_outgoing_content(input);
assert_eq!(text, "hello\nbye");
assert_eq!(
images,
vec![OutgoingImageTarget::RemoteUrl(
"https://multimedia.nt.qq.com.cn/download?appid=1406".to_string()
)]
);
}
#[test]
fn test_parse_outgoing_content_keeps_non_remote_image_marker_as_text() {
let input = "[IMAGE:/tmp/a.png]\nhello";
@ -1088,6 +1299,38 @@ allowed_users = ["user1"]
assert!(images.is_empty());
}
#[test]
fn test_parse_outgoing_content_extracts_existing_local_path_lines() {
let temp = tempfile::tempdir().expect("temp dir");
let local_path = temp.path().join("capture.png");
std::fs::write(&local_path, b"png-bytes").expect("write local image");
let input = format!("done\n{}\nnext", local_path.display());
let (text, images) = parse_outgoing_content(&input);
assert_eq!(text, "done\nnext");
assert_eq!(
images,
vec![OutgoingImageTarget::LocalPath(
local_path.display().to_string()
)]
);
}
#[test]
fn test_parse_outgoing_content_extracts_data_uri_markers() {
let input = "hello\n[IMAGE:data:image/png;base64,aGVsbG8=]\nbye";
let (text, images) = parse_outgoing_content(input);
assert_eq!(text, "hello\nbye");
assert_eq!(
images,
vec![OutgoingImageTarget::DataUri(
"data:image/png;base64,aGVsbG8=".to_string()
)]
);
}
#[test]
fn test_build_text_message_body_with_passive_fields() {
let body = build_text_message_body("hello", Some("msg-123"), 2).expect("text body");

View File

@ -139,6 +139,7 @@ const SUPPORTED_PROXY_SERVICE_KEYS: &[&str] = &[
"tool.browser",
"tool.composio",
"tool.http_request",
"tool.multimodal",
"tool.pushover",
"memory.embeddings",
"tunnel.custom",

View File

@ -1068,9 +1068,16 @@ async fn prepare_gateway_messages_for_provider(
messages.push(ChatMessage::system(system_prompt));
messages.extend(user_messages);
let multimodal_config = state.config.lock().multimodal.clone();
let prepared =
crate::multimodal::prepare_messages_for_provider(&messages, &multimodal_config).await?;
let (multimodal_config, provider_hint) = {
let config = state.config.lock();
(config.multimodal.clone(), config.default_provider.clone())
};
let prepared = crate::multimodal::prepare_messages_for_provider_with_provider_hint(
&messages,
&multimodal_config,
provider_hint.as_deref(),
)
.await?;
Ok(prepared.messages)
}

View File

@ -8,6 +8,10 @@ use std::path::Path;
const IMAGE_MARKER_PREFIX: &str = "[IMAGE:";
const OPTIMIZED_IMAGE_MAX_DIMENSION: u32 = 512;
const OPTIMIZED_IMAGE_TARGET_BYTES: usize = 256 * 1024;
const REMOTE_FETCH_MULTIMODAL_SERVICE_KEY: &str = "tool.multimodal";
const REMOTE_FETCH_TOOL_SERVICE_KEY: &str = "tool.http_request";
const REMOTE_FETCH_QQ_SERVICE_KEY: &str = "channel.qq";
const REMOTE_FETCH_LEGACY_SERVICE_KEY: &str = "provider.ollama";
const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &[
"image/png",
"image/jpeg",
@ -118,6 +122,14 @@ pub fn extract_ollama_image_payload(image_ref: &str) -> Option<String> {
pub async fn prepare_messages_for_provider(
messages: &[ChatMessage],
config: &MultimodalConfig,
) -> anyhow::Result<PreparedMessages> {
prepare_messages_for_provider_with_provider_hint(messages, config, None).await
}
pub async fn prepare_messages_for_provider_with_provider_hint(
messages: &[ChatMessage],
config: &MultimodalConfig,
provider_hint: Option<&str>,
) -> anyhow::Result<PreparedMessages> {
let (max_images, max_image_size_mb) = config.effective_limits();
let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024);
@ -138,8 +150,6 @@ pub async fn prepare_messages_for_provider(
});
}
let remote_client = build_runtime_proxy_client_with_timeouts("provider.ollama", 30, 10);
let mut normalized_messages = Vec::with_capacity(messages.len());
for message in messages {
if message.role != "user" {
@ -156,7 +166,7 @@ pub async fn prepare_messages_for_provider(
let mut normalized_refs = Vec::with_capacity(refs.len());
for reference in refs {
let data_uri =
normalize_image_reference(&reference, config, max_bytes, &remote_client).await?;
normalize_image_reference(&reference, config, max_bytes, provider_hint).await?;
normalized_refs.push(data_uri);
}
@ -198,7 +208,7 @@ async fn normalize_image_reference(
source: &str,
config: &MultimodalConfig,
max_bytes: usize,
remote_client: &Client,
provider_hint: Option<&str>,
) -> anyhow::Result<String> {
if source.starts_with("data:") {
return normalize_data_uri(source, max_bytes).await;
@ -212,7 +222,7 @@ async fn normalize_image_reference(
.into());
}
return normalize_remote_image(source, max_bytes, remote_client).await;
return normalize_remote_image(source, max_bytes, provider_hint).await;
}
normalize_local_image(source, max_bytes).await
@ -266,24 +276,59 @@ async fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result<St
}
async fn normalize_remote_image(
source: &str,
max_bytes: usize,
provider_hint: Option<&str>,
) -> anyhow::Result<String> {
let service_keys = build_remote_fetch_service_keys(source, provider_hint);
let mut failures = Vec::new();
for service_key in service_keys {
let client = build_runtime_proxy_client_with_timeouts(&service_key, 30, 10);
match normalize_remote_image_once(source, max_bytes, &client).await {
Ok(normalized) => return Ok(normalized),
Err(error) => {
let reason = error.to_string();
tracing::debug!(
service_key = %service_key,
source = %source,
"multimodal remote fetch attempt failed: {reason}"
);
failures.push(format!("{service_key}: {reason}"));
}
}
}
Err(MultimodalError::RemoteFetchFailed {
input: source.to_string(),
reason: format!(
"{}; hint: when proxy.scope='services', include one of channel.qq/tool.multimodal/tool.http_request/provider.* as needed",
failures.join(" | ")
),
}
.into())
}
async fn normalize_remote_image_once(
source: &str,
max_bytes: usize,
remote_client: &Client,
) -> anyhow::Result<String> {
let response = remote_client.get(source).send().await.map_err(|error| {
MultimodalError::RemoteFetchFailed {
input: source.to_string(),
reason: error.to_string(),
}
})?;
let mut request = remote_client
.get(source)
.header(reqwest::header::USER_AGENT, "ZeroClaw/1.0");
if source_looks_like_qq_media(source) {
request = request.header(reqwest::header::REFERER, "https://qq.com/");
}
let response = request
.send()
.await
.map_err(|error| anyhow::anyhow!("error sending request for url ({source}): {error}"))?;
let status = response.status();
if !status.is_success() {
return Err(MultimodalError::RemoteFetchFailed {
input: source.to_string(),
reason: format!("HTTP {status}"),
}
.into());
anyhow::bail!("HTTP {status}");
}
if let Some(content_length) = response.content_length() {
@ -300,10 +345,7 @@ async fn normalize_remote_image(
let bytes = response
.bytes()
.await
.map_err(|error| MultimodalError::RemoteFetchFailed {
input: source.to_string(),
reason: error.to_string(),
})?;
.map_err(|error| anyhow::anyhow!("failed to read response body: {error}"))?;
validate_size(source, bytes.len(), max_bytes)?;
@ -325,6 +367,72 @@ async fn normalize_remote_image(
))
}
fn normalize_provider_service_key_hint(provider_hint: Option<&str>) -> Option<String> {
let raw = provider_hint
.map(str::trim)
.filter(|candidate| !candidate.is_empty())?
.split('#')
.next()
.unwrap_or_default()
.trim()
.to_ascii_lowercase();
if raw.is_empty() {
return None;
}
let candidate = if raw.starts_with("provider.") {
raw
} else {
format!("provider.{raw}")
};
if !candidate
.chars()
.all(|ch| ch.is_ascii_lowercase() || ch.is_ascii_digit() || matches!(ch, '.' | '_' | '-'))
{
return None;
}
Some(candidate)
}
fn source_looks_like_qq_media(source: &str) -> bool {
let Ok(parsed) = reqwest::Url::parse(source) else {
return false;
};
let Some(host) = parsed.host_str() else {
return false;
};
let host = host.to_ascii_lowercase();
host == "multimedia.nt.qq.com.cn" || host.ends_with(".qq.com.cn") || host.ends_with(".qq.com")
}
fn push_service_key_once(keys: &mut Vec<String>, key: String) {
if !key.trim().is_empty() && !keys.iter().any(|existing| existing == &key) {
keys.push(key);
}
}
fn build_remote_fetch_service_keys(source: &str, provider_hint: Option<&str>) -> Vec<String> {
let mut keys = Vec::new();
if source_looks_like_qq_media(source) {
push_service_key_once(&mut keys, REMOTE_FETCH_QQ_SERVICE_KEY.to_string());
}
if let Some(provider_service_key) = normalize_provider_service_key_hint(provider_hint) {
push_service_key_once(&mut keys, provider_service_key);
}
push_service_key_once(&mut keys, REMOTE_FETCH_MULTIMODAL_SERVICE_KEY.to_string());
push_service_key_once(&mut keys, REMOTE_FETCH_TOOL_SERVICE_KEY.to_string());
push_service_key_once(&mut keys, REMOTE_FETCH_LEGACY_SERVICE_KEY.to_string());
keys
}
async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result<String> {
let path = Path::new(source);
if !path.exists() || !path.is_file() {
@ -681,6 +789,63 @@ mod tests {
assert!(optimized_image.height() <= OPTIMIZED_IMAGE_MAX_DIMENSION);
}
#[test]
fn normalize_provider_service_key_hint_builds_provider_prefix() {
assert_eq!(
normalize_provider_service_key_hint(Some("openai")),
Some("provider.openai".to_string())
);
assert_eq!(
normalize_provider_service_key_hint(Some("provider.gemini")),
Some("provider.gemini".to_string())
);
assert_eq!(normalize_provider_service_key_hint(Some(" ")), None);
assert_eq!(normalize_provider_service_key_hint(None), None);
assert_eq!(
normalize_provider_service_key_hint(Some("openai#fast-route")),
Some("provider.openai".to_string())
);
assert_eq!(
normalize_provider_service_key_hint(Some("provider.gemini#img")),
Some("provider.gemini".to_string())
);
assert_eq!(
normalize_provider_service_key_hint(Some("custom:https://api.example.com/v1")),
None
);
}
#[test]
fn build_remote_fetch_service_keys_prefers_qq_channel_for_qq_media_hosts() {
let keys = build_remote_fetch_service_keys(
"https://multimedia.nt.qq.com.cn/download?appid=1406",
Some("openai"),
);
assert_eq!(
keys,
vec![
"channel.qq".to_string(),
"provider.openai".to_string(),
"tool.multimodal".to_string(),
"tool.http_request".to_string(),
"provider.ollama".to_string(),
]
);
}
#[test]
fn build_remote_fetch_service_keys_deduplicates_service_candidates() {
let keys = build_remote_fetch_service_keys("https://example.com/a.png", Some("ollama"));
assert_eq!(
keys,
vec![
"provider.ollama".to_string(),
"tool.multimodal".to_string(),
"tool.http_request".to_string(),
]
);
}
#[test]
fn extract_ollama_image_payload_supports_data_uris() {
let payload = extract_ollama_image_payload("data:image/png;base64,abcd==")

View File

@ -1098,7 +1098,12 @@ impl Provider for OpenAiCodexProvider {
// Normalize images: convert file paths to data URIs
let config = crate::config::MultimodalConfig::default();
let prepared = crate::multimodal::prepare_messages_for_provider(&messages, &config).await?;
let prepared = crate::multimodal::prepare_messages_for_provider_with_provider_hint(
&messages,
&config,
Some("openai"),
)
.await?;
let (instructions, input) = build_responses_input(&prepared.messages);
self.send_responses_request(input, instructions, model)
@ -1113,7 +1118,12 @@ impl Provider for OpenAiCodexProvider {
) -> anyhow::Result<String> {
// Normalize image markers: convert file paths to data URIs
let config = crate::config::MultimodalConfig::default();
let prepared = crate::multimodal::prepare_messages_for_provider(messages, &config).await?;
let prepared = crate::multimodal::prepare_messages_for_provider_with_provider_hint(
messages,
&config,
Some("openai"),
)
.await?;
let (instructions, input) = build_responses_input(&prepared.messages);
self.send_responses_request(input, instructions, model)