feat: add vision/multimodal support for Telegram + Bedrock

- channels/telegram.rs: support photo messages in parse_update_message;
  add resolve_photo_data_uri() to look up the file via the Telegram getFile
  API, download it, and downscale it to at most 512px before base64 encoding
- providers/bedrock.rs: add parse_user_content_blocks() to extract
  [IMAGE:data:...] markers and build proper Bedrock image content blocks;
  apply to both chat() and chat_with_system() paths; set vision: true
  in provider capabilities
- Cargo.toml: add image crate v0.25 (jpeg/png) for server-side resize
This commit is contained in:
Shawn Zhang 2026-02-20 16:24:43 +00:00 committed by Chummy
parent 645515145e
commit 7bf825eb34
4 changed files with 277 additions and 18 deletions

78
Cargo.lock generated
View File

@ -633,6 +633,12 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "byteorder-lite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
[[package]]
name = "bytes"
version = "1.11.1"
@ -1795,6 +1801,15 @@ version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "fdeflate"
version = "0.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
dependencies = [
"simd-adler32",
]
[[package]]
name = "fiat-crypto"
version = "0.2.9"
@ -2628,6 +2643,21 @@ version = "3.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "365a784774bb381e8c19edb91190a90d7f2625e057b55de2bc0f6b57bc779ff2"
[[package]]
name = "image"
version = "0.25.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a"
dependencies = [
"bytemuck",
"byteorder-lite",
"moxcms",
"num-traits",
"png",
"zune-core",
"zune-jpeg",
]
[[package]]
name = "imap-proto"
version = "0.16.6"
@ -3557,6 +3587,16 @@ dependencies = [
"uuid",
]
[[package]]
name = "moxcms"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97"
dependencies = [
"num-traits",
"pxfm",
]
[[package]]
name = "multimap"
version = "0.10.1"
@ -4133,6 +4173,19 @@ dependencies = [
"plotters-backend",
]
[[package]]
name = "png"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
dependencies = [
"bitflags 2.11.0",
"crc32fast",
"fdeflate",
"flate2",
"miniz_oxide",
]
[[package]]
name = "polling"
version = "3.11.0"
@ -4495,6 +4548,15 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"
[[package]]
name = "pxfm"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7186d3822593aa4393561d186d1393b3923e9d6163d3fbfd6e825e3e6cf3e6a8"
dependencies = [
"num-traits",
]
[[package]]
name = "quinn"
version = "0.11.9"
@ -7613,6 +7675,7 @@ dependencies = [
"hmac",
"hostname",
"http-body-util",
"image",
"landlock",
"lettre",
"libc",
@ -7815,3 +7878,18 @@ name = "zmij"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
[[package]]
name = "zune-core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
[[package]]
name = "zune-jpeg"
version = "0.5.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe"
dependencies = [
"zune-core",
]

View File

@ -51,6 +51,7 @@ prometheus = { version = "0.14", default-features = false }
# Base64 encoding (screenshots, image data)
base64 = "0.22"
image = { version = "0.25", default-features = false, features = ["jpeg", "png"] }
# URL encoding for web search
urlencoding = "2.1"

View File

@ -756,10 +756,29 @@ Allowlist Telegram username (without '@') or numeric user ID.",
}
}
fn parse_update_message(&self, update: &serde_json::Value) -> Option<ChannelMessage> {
fn parse_update_message(&self, update: &serde_json::Value) -> Option<(ChannelMessage, Option<String>)> {
let message = update.get("message")?;
let text = message.get("text").and_then(serde_json::Value::as_str)?;
// Support both text messages and photo messages (with optional caption)
let text_opt = message.get("text").and_then(serde_json::Value::as_str);
let caption_opt = message.get("caption").and_then(serde_json::Value::as_str);
// Extract file_id from photo (highest resolution = last element)
let photo_file_id = message.get("photo")
.and_then(serde_json::Value::as_array)
.and_then(|photos| photos.last())
.and_then(|p| p.get("file_id"))
.and_then(serde_json::Value::as_str)
.map(|s| s.to_string());
// Require at least text, caption, or photo
let text = match (text_opt, caption_opt, &photo_file_id) {
(Some(t), _, _) => t.to_string(),
(None, Some(c), Some(_)) => c.to_string(),
(None, Some(c), None) => c.to_string(),
(None, None, Some(_)) => String::new(), // will be filled with image marker later
(None, None, None) => return None,
};
let username = message
.get("from")
@ -793,7 +812,7 @@ Allowlist Telegram username (without '@') or numeric user ID.",
if self.mention_only && is_group {
let bot_username = self.bot_username.lock();
if let Some(ref bot_username) = *bot_username {
if !Self::contains_bot_mention(text, bot_username) {
if !Self::contains_bot_mention(&text, bot_username) {
return None;
}
} else {
@ -828,12 +847,12 @@ Allowlist Telegram username (without '@') or numeric user ID.",
let content = if self.mention_only && is_group {
let bot_username = self.bot_username.lock();
let bot_username = bot_username.as_ref()?;
Self::normalize_incoming_content(text, bot_username)?
Self::normalize_incoming_content(&text, bot_username)?
} else {
text.to_string()
};
Some(ChannelMessage {
Some((ChannelMessage {
id: format!("telegram_{chat_id}_{message_id}"),
sender: sender_identity,
reply_target,
@ -844,7 +863,52 @@ Allowlist Telegram username (without '@') or numeric user ID.",
.unwrap_or_default()
.as_secs(),
thread_ts: None,
})
}, photo_file_id))
}
/// Download a Telegram photo by `file_id`, downscale it to fit within 512px on
/// its longest side, and return it as a base64-encoded JPEG data URI.
///
/// # Errors
/// Fails when `getFile` returns no `file_path`, when either HTTP request
/// fails, when the download answers with a non-success status, or when the
/// payload cannot be decoded / re-encoded as an image.
async fn resolve_photo_data_uri(&self, file_id: &str) -> anyhow::Result<String> {
    use base64::Engine as _;

    // Longest side, in pixels, allowed after downscaling (keeps the payload
    // small enough for the model context).
    const MAX_DIMENSION: u32 = 512;

    // Step 1: call getFile to get file_path
    let get_file_url = self.api_url(&format!("getFile?file_id={}", file_id));
    let resp = self.http_client().get(&get_file_url).send().await?;
    let json: serde_json::Value = resp.json().await?;
    let file_path = json
        .get("result")
        .and_then(|r| r.get("file_path"))
        .and_then(|p| p.as_str())
        .ok_or_else(|| anyhow::anyhow!("getFile: no file_path in response"))?
        .to_string();

    // Step 2: download the actual file.
    // NOTE(review): the host is hard-coded here while step 1 goes through
    // `api_url` — confirm whether a custom API base should apply to downloads.
    let download_url = format!(
        "https://api.telegram.org/file/bot{}/{}",
        self.bot_token, file_path
    );
    let bytes = async {
        self.http_client()
            .get(&download_url)
            .send()
            .await?
            // Reject 4xx/5xx here instead of feeding an error body to the decoder.
            .error_for_status()?
            .bytes()
            .await
    }
    .await
    // The URL embeds the bot token; strip it so it cannot leak via error text.
    .map_err(|e| e.without_url())?;

    // Step 3: decode, downscale and re-encode off the async executor, since
    // image processing is CPU-bound.
    let resized_bytes = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<u8>> {
        let img = image::load_from_memory(&bytes)?;
        let img = if img.width() > MAX_DIMENSION || img.height() > MAX_DIMENSION {
            // `thumbnail` preserves aspect ratio while fitting inside the box.
            img.thumbnail(MAX_DIMENSION, MAX_DIMENSION)
        } else {
            img
        };
        // JPEG cannot carry alpha; flatten to RGB8 so inputs with an alpha
        // channel or 16-bit samples encode instead of erroring.
        let rgb = image::DynamicImage::ImageRgb8(img.into_rgb8());
        let mut buf = Vec::new();
        rgb.write_to(
            &mut std::io::Cursor::new(&mut buf),
            image::ImageFormat::Jpeg,
        )?;
        Ok(buf)
    })
    .await??;

    let b64 = base64::engine::general_purpose::STANDARD.encode(&resized_bytes);
    Ok(format!("data:image/jpeg;base64,{}", b64))
}
async fn send_text_chunks(
@ -1794,10 +1858,23 @@ Ensure only one `zeroclaw` process is using this bot token."
offset = uid + 1;
}
let Some(msg) = self.parse_update_message(update) else {
let Some((mut msg, photo_file_id)) = self.parse_update_message(update) else {
self.handle_unauthorized_message(update).await;
continue;
};
// Resolve photo file_id to data URI and inject as IMAGE marker
if let Some(file_id) = photo_file_id {
if let Ok(data_uri) = self.resolve_photo_data_uri(&file_id).await {
let image_marker = format!("[IMAGE:{}]", data_uri);
if msg.content.is_empty() {
msg.content = image_marker;
} else {
msg.content = format!("{}\n{}", msg.content, image_marker);
}
}
}
// Send "typing" indicator immediately when we receive a message
let typing_body = serde_json::json!({
"chat_id": &msg.reply_target,
@ -2164,7 +2241,7 @@ mod tests {
});
let msg = ch
.parse_update_message(&update)
.parse_update_message(&update).map(|(m,_)|m)
.expect("message should parse");
assert_eq!(msg.sender, "alice");
@ -2191,7 +2268,7 @@ mod tests {
});
let msg = ch
.parse_update_message(&update)
.parse_update_message(&update).map(|(m,_)|m)
.expect("numeric allowlist should pass");
assert_eq!(msg.sender, "555");
@ -2218,7 +2295,7 @@ mod tests {
});
let msg = ch
.parse_update_message(&update)
.parse_update_message(&update).map(|(m,_)|m)
.expect("message with thread_id should parse");
assert_eq!(msg.sender, "alice");
@ -2831,7 +2908,7 @@ mod tests {
});
let parsed = ch
.parse_update_message(&update)
.parse_update_message(&update).map(|(m,_)|m)
.expect("mention should parse");
assert_eq!(parsed.content, "Hi status please");

View File

@ -190,6 +190,24 @@ enum ContentBlock {
ToolUse(ToolUseWrapper),
ToolResult(ToolResultWrapper),
CachePointBlock(CachePointWrapper),
Image(ImageWrapper),
}
/// Payload of the `ContentBlock::Image` variant: nests the image data under
/// an `image` field.
#[derive(Debug, Serialize, Deserialize)]
struct ImageWrapper {
/// Format and source data of the image.
image: ImageBlock,
}
/// An image's format name together with its encoded data.
#[derive(Debug, Serialize, Deserialize)]
struct ImageBlock {
/// Format name; `parse_user_content_blocks` emits "jpeg", "png", "gif" or "webp".
format: String,
/// Where the image data comes from.
source: ImageSource,
}
/// Inline image data for an [`ImageBlock`].
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
struct ImageSource {
/// Base64 text of the image (the part of the data URI after `base64,`),
/// stored as a string rather than raw bytes.
bytes: String,
}
#[derive(Debug, Serialize, Deserialize)]
@ -438,11 +456,10 @@ impl BedrockProvider {
}
}
_ => {
let content_blocks = Self::parse_user_content_blocks(&msg.content);
converse_messages.push(ConverseMessage {
role: "user".to_string(),
content: vec![ContentBlock::Text(TextBlock {
text: msg.content.clone(),
})],
content: content_blocks,
});
}
}
@ -456,6 +473,69 @@ impl BedrockProvider {
(system, converse_messages)
}
/// Parse user message content, extracting `[IMAGE:data:...]` markers into image blocks.
///
/// * Text between markers becomes text blocks; whitespace-only runs are dropped.
/// * A marker holding a `data:<mime>;base64,<payload>` URI becomes an image
///   block. Unrecognised MIME types are labelled `jpeg`.
/// * A marker holding anything else is kept as an `[image: ...]` text reference.
/// * An unterminated marker (no closing `]`) ends the scan; the rest of the
///   input, marker included, is emitted exactly once as text.
///
/// Always returns at least one block: if nothing was produced, the original
/// `content` is returned as a single text block.
fn parse_user_content_blocks(content: &str) -> Vec<ContentBlock> {
    const MARKER: &str = "[IMAGE:";

    let mut blocks: Vec<ContentBlock> = Vec::new();
    let mut remaining = content;
    // Runs for every user message, so keep it at debug level.
    tracing::debug!(
        "parse_user_content_blocks called, len={}, has_image={}",
        content.len(),
        content.contains(MARKER)
    );

    while let Some(start) = remaining.find(MARKER) {
        let after = &remaining[start + MARKER.len()..];
        // Look for the terminator BEFORE emitting anything: with no closing
        // bracket we leave `remaining` untouched so the trailing-text step
        // below emits it once, rather than duplicating the leading text.
        let Some(end) = after.find(']') else {
            break;
        };

        // Add any text before the marker
        let text_before = &remaining[..start];
        if !text_before.trim().is_empty() {
            blocks.push(ContentBlock::Text(TextBlock {
                text: text_before.to_string(),
            }));
        }

        let src = &after[..end];
        remaining = &after[end + 1..];

        // Only base64 data URIs can be turned into image blocks.
        let data_uri = src
            .strip_prefix("data:")
            .and_then(|rest| rest.split_once(';'))
            .and_then(|(mime, tail)| tail.strip_prefix("base64,").map(|b64| (mime, b64)));

        match data_uri {
            Some((mime, b64)) => {
                let format = match mime {
                    "image/jpeg" | "image/jpg" => "jpeg",
                    "image/png" => "png",
                    "image/gif" => "gif",
                    "image/webp" => "webp",
                    _ => "jpeg",
                };
                blocks.push(ContentBlock::Image(ImageWrapper {
                    image: ImageBlock {
                        format: format.to_string(),
                        source: ImageSource {
                            bytes: b64.to_string(),
                        },
                    },
                }));
            }
            // Non-data-uri image: just include as text reference
            None => blocks.push(ContentBlock::Text(TextBlock {
                text: format!("[image: {}]", src),
            })),
        }
    }

    // Add any remaining text (including an unterminated marker, verbatim)
    if !remaining.trim().is_empty() {
        blocks.push(ContentBlock::Text(TextBlock {
            text: remaining.to_string(),
        }));
    }

    if blocks.is_empty() {
        blocks.push(ContentBlock::Text(TextBlock {
            text: content.to_string(),
        }));
    }

    blocks
}
/// Parse assistant message containing structured tool calls.
fn parse_assistant_tool_call_message(content: &str) -> Option<Vec<ContentBlock>> {
let value = serde_json::from_str::<serde_json::Value>(content).ok()?;
@ -584,6 +664,31 @@ impl BedrockProvider {
request_body: &ConverseRequest,
) -> anyhow::Result<ConverseResponse> {
let payload = serde_json::to_vec(request_body)?;
// Debug: log image blocks in payload (truncated)
if let Ok(debug_val) = serde_json::from_slice::<serde_json::Value>(&payload) {
if let Some(msgs) = debug_val.get("messages").and_then(|m| m.as_array()) {
for msg in msgs {
if let Some(content) = msg.get("content").and_then(|c| c.as_array()) {
for block in content {
if block.get("image").is_some() {
let mut b = block.clone();
if let Some(img) = b.get_mut("image") {
if let Some(src) = img.get_mut("source") {
if let Some(bytes) = src.get_mut("bytes") {
if let Some(s) = bytes.as_str() {
*bytes = serde_json::json!(format!("<base64 {} chars>", s.len()));
}
}
}
}
tracing::info!("Bedrock image block: {}", serde_json::to_string(&b).unwrap_or_default());
}
}
}
}
}
}
let url = Self::endpoint_url(&credentials.region, model);
let canonical_uri = Self::canonical_uri(model);
let now = chrono::Utc::now();
@ -639,7 +744,7 @@ impl Provider for BedrockProvider {
fn capabilities(&self) -> ProviderCapabilities {
ProviderCapabilities {
native_tool_calling: true,
vision: false,
vision: true,
}
}
@ -688,9 +793,7 @@ impl Provider for BedrockProvider {
system,
messages: vec![ConverseMessage {
role: "user".to_string(),
content: vec![ContentBlock::Text(TextBlock {
text: message.to_string(),
})],
content: Self::parse_user_content_blocks(message),
}],
inference_config: Some(InferenceConfig {
max_tokens: DEFAULT_MAX_TOKENS,