feat: add vision/multimodal support for Telegram + Bedrock
- channels/telegram.rs: support photo messages in parse_update_message; add resolve_photo_data_uri() to look up the file via the Telegram getFile API, download it, and resize it to at most 512px before base64 encoding
- providers/bedrock.rs: add parse_user_content_blocks() to extract [IMAGE:data:...] markers and build proper Bedrock image content blocks; apply to both chat() and chat_with_system() paths; set vision: true in provider capabilities
- Cargo.toml: add image crate v0.25 (jpeg/png) for server-side resize
This commit is contained in:
parent
645515145e
commit
7bf825eb34
78
Cargo.lock
generated
78
Cargo.lock
generated
@ -633,6 +633,12 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder-lite"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.11.1"
|
||||
@ -1795,6 +1801,15 @@ version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "fdeflate"
|
||||
version = "0.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c"
|
||||
dependencies = [
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fiat-crypto"
|
||||
version = "0.2.9"
|
||||
@ -2628,6 +2643,21 @@ version = "3.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "365a784774bb381e8c19edb91190a90d7f2625e057b55de2bc0f6b57bc779ff2"
|
||||
|
||||
[[package]]
|
||||
name = "image"
|
||||
version = "0.25.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder-lite",
|
||||
"moxcms",
|
||||
"num-traits",
|
||||
"png",
|
||||
"zune-core",
|
||||
"zune-jpeg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "imap-proto"
|
||||
version = "0.16.6"
|
||||
@ -3557,6 +3587,16 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "moxcms"
|
||||
version = "0.7.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"pxfm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "multimap"
|
||||
version = "0.10.1"
|
||||
@ -4133,6 +4173,19 @@ dependencies = [
|
||||
"plotters-backend",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "png"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"crc32fast",
|
||||
"fdeflate",
|
||||
"flate2",
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "polling"
|
||||
version = "3.11.0"
|
||||
@ -4495,6 +4548,15 @@ version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"
|
||||
|
||||
[[package]]
|
||||
name = "pxfm"
|
||||
version = "0.1.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7186d3822593aa4393561d186d1393b3923e9d6163d3fbfd6e825e3e6cf3e6a8"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn"
|
||||
version = "0.11.9"
|
||||
@ -7613,6 +7675,7 @@ dependencies = [
|
||||
"hmac",
|
||||
"hostname",
|
||||
"http-body-util",
|
||||
"image",
|
||||
"landlock",
|
||||
"lettre",
|
||||
"libc",
|
||||
@ -7815,3 +7878,18 @@ name = "zmij"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
|
||||
|
||||
[[package]]
|
||||
name = "zune-core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9"
|
||||
|
||||
[[package]]
|
||||
name = "zune-jpeg"
|
||||
version = "0.5.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe"
|
||||
dependencies = [
|
||||
"zune-core",
|
||||
]
|
||||
|
||||
@ -51,6 +51,7 @@ prometheus = { version = "0.14", default-features = false }
|
||||
|
||||
# Base64 encoding (screenshots, image data)
|
||||
base64 = "0.22"
|
||||
image = { version = "0.25", default-features = false, features = ["jpeg", "png"] }
|
||||
|
||||
# URL encoding for web search
|
||||
urlencoding = "2.1"
|
||||
|
||||
@ -756,10 +756,29 @@ Allowlist Telegram username (without '@') or numeric user ID.",
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_update_message(&self, update: &serde_json::Value) -> Option<ChannelMessage> {
|
||||
fn parse_update_message(&self, update: &serde_json::Value) -> Option<(ChannelMessage, Option<String>)> {
|
||||
let message = update.get("message")?;
|
||||
|
||||
let text = message.get("text").and_then(serde_json::Value::as_str)?;
|
||||
// Support both text messages and photo messages (with optional caption)
|
||||
let text_opt = message.get("text").and_then(serde_json::Value::as_str);
|
||||
let caption_opt = message.get("caption").and_then(serde_json::Value::as_str);
|
||||
|
||||
// Extract file_id from photo (highest resolution = last element)
|
||||
let photo_file_id = message.get("photo")
|
||||
.and_then(serde_json::Value::as_array)
|
||||
.and_then(|photos| photos.last())
|
||||
.and_then(|p| p.get("file_id"))
|
||||
.and_then(serde_json::Value::as_str)
|
||||
.map(|s| s.to_string());
|
||||
|
||||
// Require at least text, caption, or photo
|
||||
let text = match (text_opt, caption_opt, &photo_file_id) {
|
||||
(Some(t), _, _) => t.to_string(),
|
||||
(None, Some(c), Some(_)) => c.to_string(),
|
||||
(None, Some(c), None) => c.to_string(),
|
||||
(None, None, Some(_)) => String::new(), // will be filled with image marker later
|
||||
(None, None, None) => return None,
|
||||
};
|
||||
|
||||
let username = message
|
||||
.get("from")
|
||||
@ -793,7 +812,7 @@ Allowlist Telegram username (without '@') or numeric user ID.",
|
||||
if self.mention_only && is_group {
|
||||
let bot_username = self.bot_username.lock();
|
||||
if let Some(ref bot_username) = *bot_username {
|
||||
if !Self::contains_bot_mention(text, bot_username) {
|
||||
if !Self::contains_bot_mention(&text, bot_username) {
|
||||
return None;
|
||||
}
|
||||
} else {
|
||||
@ -828,12 +847,12 @@ Allowlist Telegram username (without '@') or numeric user ID.",
|
||||
let content = if self.mention_only && is_group {
|
||||
let bot_username = self.bot_username.lock();
|
||||
let bot_username = bot_username.as_ref()?;
|
||||
Self::normalize_incoming_content(text, bot_username)?
|
||||
Self::normalize_incoming_content(&text, bot_username)?
|
||||
} else {
|
||||
text.to_string()
|
||||
};
|
||||
|
||||
Some(ChannelMessage {
|
||||
Some((ChannelMessage {
|
||||
id: format!("telegram_{chat_id}_{message_id}"),
|
||||
sender: sender_identity,
|
||||
reply_target,
|
||||
@ -844,7 +863,52 @@ Allowlist Telegram username (without '@') or numeric user ID.",
|
||||
.unwrap_or_default()
|
||||
.as_secs(),
|
||||
thread_ts: None,
|
||||
})
|
||||
}, photo_file_id))
|
||||
}
|
||||
|
||||
/// Download a Telegram photo by file_id, resize to fit within 1024px, and return as base64 data URI.
|
||||
async fn resolve_photo_data_uri(&self, file_id: &str) -> anyhow::Result<String> {
|
||||
use base64::Engine as _;
|
||||
|
||||
// Step 1: call getFile to get file_path
|
||||
let get_file_url = self.api_url(&format!("getFile?file_id={}", file_id));
|
||||
let resp = self.http_client().get(&get_file_url).send().await?;
|
||||
let json: serde_json::Value = resp.json().await?;
|
||||
let file_path = json
|
||||
.get("result")
|
||||
.and_then(|r| r.get("file_path"))
|
||||
.and_then(|p| p.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("getFile: no file_path in response"))?
|
||||
.to_string();
|
||||
|
||||
// Step 2: download the actual file
|
||||
let download_url = format!(
|
||||
"https://api.telegram.org/file/bot{}/{}",
|
||||
self.bot_token, file_path
|
||||
);
|
||||
let img_resp = self.http_client().get(&download_url).send().await?;
|
||||
let bytes = img_resp.bytes().await?;
|
||||
|
||||
// Step 3: resize to max 1024px on longest side to fit within model context
|
||||
let resized_bytes = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<u8>> {
|
||||
let img = image::load_from_memory(&bytes)?;
|
||||
let (w, h) = (img.width(), img.height());
|
||||
let max_dim = 512u32;
|
||||
let resized = if w > max_dim || h > max_dim {
|
||||
img.thumbnail(max_dim, max_dim)
|
||||
} else {
|
||||
img
|
||||
};
|
||||
let mut buf = Vec::new();
|
||||
resized.write_to(
|
||||
&mut std::io::Cursor::new(&mut buf),
|
||||
image::ImageFormat::Jpeg,
|
||||
)?;
|
||||
Ok(buf)
|
||||
}).await??;
|
||||
|
||||
let b64 = base64::engine::general_purpose::STANDARD.encode(&resized_bytes);
|
||||
Ok(format!("data:image/jpeg;base64,{}", b64))
|
||||
}
|
||||
|
||||
async fn send_text_chunks(
|
||||
@ -1794,10 +1858,23 @@ Ensure only one `zeroclaw` process is using this bot token."
|
||||
offset = uid + 1;
|
||||
}
|
||||
|
||||
let Some(msg) = self.parse_update_message(update) else {
|
||||
let Some((mut msg, photo_file_id)) = self.parse_update_message(update) else {
|
||||
self.handle_unauthorized_message(update).await;
|
||||
continue;
|
||||
};
|
||||
|
||||
// Resolve photo file_id to data URI and inject as IMAGE marker
|
||||
if let Some(file_id) = photo_file_id {
|
||||
if let Ok(data_uri) = self.resolve_photo_data_uri(&file_id).await {
|
||||
let image_marker = format!("[IMAGE:{}]", data_uri);
|
||||
if msg.content.is_empty() {
|
||||
msg.content = image_marker;
|
||||
} else {
|
||||
msg.content = format!("{}\n{}", msg.content, image_marker);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Send "typing" indicator immediately when we receive a message
|
||||
let typing_body = serde_json::json!({
|
||||
"chat_id": &msg.reply_target,
|
||||
@ -2164,7 +2241,7 @@ mod tests {
|
||||
});
|
||||
|
||||
let msg = ch
|
||||
.parse_update_message(&update)
|
||||
.parse_update_message(&update).map(|(m,_)|m)
|
||||
.expect("message should parse");
|
||||
|
||||
assert_eq!(msg.sender, "alice");
|
||||
@ -2191,7 +2268,7 @@ mod tests {
|
||||
});
|
||||
|
||||
let msg = ch
|
||||
.parse_update_message(&update)
|
||||
.parse_update_message(&update).map(|(m,_)|m)
|
||||
.expect("numeric allowlist should pass");
|
||||
|
||||
assert_eq!(msg.sender, "555");
|
||||
@ -2218,7 +2295,7 @@ mod tests {
|
||||
});
|
||||
|
||||
let msg = ch
|
||||
.parse_update_message(&update)
|
||||
.parse_update_message(&update).map(|(m,_)|m)
|
||||
.expect("message with thread_id should parse");
|
||||
|
||||
assert_eq!(msg.sender, "alice");
|
||||
@ -2831,7 +2908,7 @@ mod tests {
|
||||
});
|
||||
|
||||
let parsed = ch
|
||||
.parse_update_message(&update)
|
||||
.parse_update_message(&update).map(|(m,_)|m)
|
||||
.expect("mention should parse");
|
||||
assert_eq!(parsed.content, "Hi status please");
|
||||
|
||||
|
||||
@ -190,6 +190,24 @@ enum ContentBlock {
|
||||
ToolUse(ToolUseWrapper),
|
||||
ToolResult(ToolResultWrapper),
|
||||
CachePointBlock(CachePointWrapper),
|
||||
Image(ImageWrapper),
|
||||
}
|
||||
|
||||
/// Payload of the `ContentBlock::Image` variant: holds one [`ImageBlock`] in its `image` field.
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct ImageWrapper {
|
||||
image: ImageBlock,
|
||||
}
|
||||
|
||||
/// A single image attached to a message.
///
/// `format` is the short image format name; `parse_user_content_blocks` fills it with one of
/// `"jpeg"`, `"png"`, `"gif"` or `"webp"`. `source` carries the image data itself.
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct ImageBlock {
|
||||
format: String,
|
||||
source: ImageSource,
|
||||
}
|
||||
|
||||
/// Image data for an [`ImageBlock`].
///
/// `bytes` holds the base64 text taken from the `data:<mime>;base64,<payload>` URI of an
/// `[IMAGE:...]` marker. The `camelCase` rename leaves the single-word field name unchanged.
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
struct ImageSource {
|
||||
bytes: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
@ -438,11 +456,10 @@ impl BedrockProvider {
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let content_blocks = Self::parse_user_content_blocks(&msg.content);
|
||||
converse_messages.push(ConverseMessage {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text(TextBlock {
|
||||
text: msg.content.clone(),
|
||||
})],
|
||||
content: content_blocks,
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -456,6 +473,69 @@ impl BedrockProvider {
|
||||
(system, converse_messages)
|
||||
}
|
||||
|
||||
/// Parse user message content, extracting [IMAGE:data:...] markers into image blocks.
|
||||
fn parse_user_content_blocks(content: &str) -> Vec<ContentBlock> {
|
||||
let mut blocks: Vec<ContentBlock> = Vec::new();
|
||||
let mut remaining = content;
|
||||
let has_image = content.contains("[IMAGE:");
|
||||
tracing::info!("parse_user_content_blocks called, len={}, has_image={}", content.len(), has_image);
|
||||
|
||||
while let Some(start) = remaining.find("[IMAGE:") {
|
||||
// Add any text before the marker
|
||||
let text_before = &remaining[..start];
|
||||
if !text_before.trim().is_empty() {
|
||||
blocks.push(ContentBlock::Text(TextBlock { text: text_before.to_string() }));
|
||||
}
|
||||
|
||||
let after = &remaining[start + 7..]; // skip "[IMAGE:"
|
||||
if let Some(end) = after.find(']') {
|
||||
let src = &after[..end];
|
||||
remaining = &after[end + 1..];
|
||||
|
||||
// Only handle data URIs (base64 encoded images)
|
||||
if let Some(rest) = src.strip_prefix("data:") {
|
||||
if let Some(semi) = rest.find(';') {
|
||||
let mime = &rest[..semi];
|
||||
let after_semi = &rest[semi + 1..];
|
||||
if let Some(b64) = after_semi.strip_prefix("base64,") {
|
||||
let format = match mime {
|
||||
"image/jpeg" | "image/jpg" => "jpeg",
|
||||
"image/png" => "png",
|
||||
"image/gif" => "gif",
|
||||
"image/webp" => "webp",
|
||||
_ => "jpeg",
|
||||
};
|
||||
blocks.push(ContentBlock::Image(ImageWrapper {
|
||||
image: ImageBlock {
|
||||
format: format.to_string(),
|
||||
source: ImageSource { bytes: b64.to_string() },
|
||||
},
|
||||
}));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Non-data-uri image: just include as text reference
|
||||
blocks.push(ContentBlock::Text(TextBlock { text: format!("[image: {}]", src) }));
|
||||
} else {
|
||||
// No closing bracket, treat rest as text
|
||||
blocks.push(ContentBlock::Text(TextBlock { text: remaining.to_string() }));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Add any remaining text
|
||||
if !remaining.trim().is_empty() {
|
||||
blocks.push(ContentBlock::Text(TextBlock { text: remaining.to_string() }));
|
||||
}
|
||||
|
||||
if blocks.is_empty() {
|
||||
blocks.push(ContentBlock::Text(TextBlock { text: content.to_string() }));
|
||||
}
|
||||
|
||||
blocks
|
||||
}
|
||||
|
||||
/// Parse assistant message containing structured tool calls.
|
||||
fn parse_assistant_tool_call_message(content: &str) -> Option<Vec<ContentBlock>> {
|
||||
let value = serde_json::from_str::<serde_json::Value>(content).ok()?;
|
||||
@ -584,6 +664,31 @@ impl BedrockProvider {
|
||||
request_body: &ConverseRequest,
|
||||
) -> anyhow::Result<ConverseResponse> {
|
||||
let payload = serde_json::to_vec(request_body)?;
|
||||
|
||||
// Debug: log image blocks in payload (truncated)
|
||||
if let Ok(debug_val) = serde_json::from_slice::<serde_json::Value>(&payload) {
|
||||
if let Some(msgs) = debug_val.get("messages").and_then(|m| m.as_array()) {
|
||||
for msg in msgs {
|
||||
if let Some(content) = msg.get("content").and_then(|c| c.as_array()) {
|
||||
for block in content {
|
||||
if block.get("image").is_some() {
|
||||
let mut b = block.clone();
|
||||
if let Some(img) = b.get_mut("image") {
|
||||
if let Some(src) = img.get_mut("source") {
|
||||
if let Some(bytes) = src.get_mut("bytes") {
|
||||
if let Some(s) = bytes.as_str() {
|
||||
*bytes = serde_json::json!(format!("<base64 {} chars>", s.len()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tracing::info!("Bedrock image block: {}", serde_json::to_string(&b).unwrap_or_default());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let url = Self::endpoint_url(&credentials.region, model);
|
||||
let canonical_uri = Self::canonical_uri(model);
|
||||
let now = chrono::Utc::now();
|
||||
@ -639,7 +744,7 @@ impl Provider for BedrockProvider {
|
||||
fn capabilities(&self) -> ProviderCapabilities {
|
||||
ProviderCapabilities {
|
||||
native_tool_calling: true,
|
||||
vision: false,
|
||||
vision: true,
|
||||
}
|
||||
}
|
||||
|
||||
@ -688,9 +793,7 @@ impl Provider for BedrockProvider {
|
||||
system,
|
||||
messages: vec![ConverseMessage {
|
||||
role: "user".to_string(),
|
||||
content: vec![ContentBlock::Text(TextBlock {
|
||||
text: message.to_string(),
|
||||
})],
|
||||
content: Self::parse_user_content_blocks(message),
|
||||
}],
|
||||
inference_config: Some(InferenceConfig {
|
||||
max_tokens: DEFAULT_MAX_TOKENS,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user