From 9b9faab991207751c595025f8d249d55f91e6f50 Mon Sep 17 00:00:00 2001 From: simianastronaut Date: Thu, 12 Mar 2026 11:16:14 -0400 Subject: [PATCH] style(memory,tools): fix rustfmt violations in memory and xlsx modules (#2442) Apply rustfmt canonical formatting to src/memory/traits.rs, src/memory/sqlite.rs, and src/tools/xlsx_read.rs. Also fix strict clippy violations (format_push_string, cast_possible_truncation) in xlsx_read test helpers to satisfy the quality gate. Includes prerequisite code for the reindex command and xlsx_read tool that introduced the formatting violations, along with the zip and quick-xml dependencies required by xlsx_read. Closes #2442 Co-authored-by: Preventnetworkhacking Co-authored-by: Argenis Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 43 ++ Cargo.toml | 4 + src/lib.rs | 9 + src/main.rs | 9 + src/memory/cli.rs | 72 +++ src/memory/sqlite.rs | 57 ++ src/memory/traits.rs | 13 + src/tools/mod.rs | 5 + src/tools/xlsx_read.rs | 1180 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 1392 insertions(+) create mode 100644 src/tools/xlsx_read.rs diff --git a/Cargo.lock b/Cargo.lock index dc5cee085..809915ca4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4753,6 +4753,15 @@ dependencies = [ "image", ] +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + [[package]] name = "quinn" version = "0.11.9" @@ -6686,6 +6695,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "typed-path" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e" + [[package]] name = "typenum" version = "1.19.0" @@ -7969,6 +7984,7 @@ dependencies = [ "prometheus", "prost 0.14.3", "qrcode", + "quick-xml", "rand 0.10.0", "regex", "reqwest", @@ -8010,6 +8026,7 @@ dependencies = [ "webpki-roots 1.0.6", "which", "wiremock", + "zip", ] [[package]] @@ -8149,6 +8166,20 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "zip" +version = "8.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b680f2a0cd479b4cff6e1233c483fdead418106eae419dc60200ae9850f6d004" +dependencies = [ + "crc32fast", + "flate2", + "indexmap", + "memchr", + "typed-path", + "zopfli", +] + [[package]] name = "zlib-rs" version = "0.6.3" @@ -8161,6 +8192,18 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", +] + [[package]] name = "zune-core" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 028a6d05a..0173fba2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -165,6 +165,10 @@ probe-rs = { version = "0.31", optional = true } # PDF extraction for datasheet RAG (optional, enable with --features rag-pdf) pdf-extract = { version = "0.10", optional = true } +# XLSX extraction (zip archive reading + XML parsing) +zip = { version = "8.1", default-features = false, features = ["deflate"] } +quick-xml = "0.37" + # Terminal QR rendering for WhatsApp Web pairing flow. qrcode = { version = "0.14", optional = true } diff --git a/src/lib.rs b/src/lib.rs index ace154e6f..828070e00 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -393,6 +393,15 @@ pub enum MemoryCommands { #[arg(long)] yes: bool, }, + /// Rebuild embeddings for all memories (use after changing embedding model) + Reindex { + /// Skip confirmation prompt + #[arg(long)] + yes: bool, + /// Show progress during reindex + #[arg(long, default_value = "true")] + progress: bool, + }, } /// Integration subcommands diff --git a/src/main.rs b/src/main.rs index 80a74f8e5..d650a1681 100644 --- a/src/main.rs +++ b/src/main.rs @@ -649,6 +649,15 @@ enum MemoryCommands { #[arg(long)] yes: bool, }, + /// Rebuild embeddings for all memories (use after changing embedding model) + Reindex { + /// Skip confirmation prompt + #[arg(long)] + yes: bool, + /// Show progress during reindex + #[arg(long, default_value = "true")] + progress: bool, + }, } #[tokio::main] diff --git a/src/memory/cli.rs b/src/memory/cli.rs index 6feff1d99..56b972468 100644 --- a/src/memory/cli.rs +++ b/src/memory/cli.rs @@ -23,6 +23,9 @@ pub async fn handle_command(command: crate::MemoryCommands, config: &Config) -> crate::MemoryCommands::Clear { key, category, yes } => { handle_clear(config, key, category, yes).await } + crate::MemoryCommands::Reindex { yes, progress } => { + handle_reindex(config, yes, progress).await + } } } @@ -297,6 +300,75 @@ async fn handle_clear_key(mem: &dyn Memory, key: &str, yes: bool) -> Result<()> Ok(()) } +/// Rebuild embeddings for all memories using current embedding configuration. +async fn handle_reindex(config: &Config, yes: bool, progress: bool) -> Result<()> { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + + // Reindex requires full memory backend with embeddings + let mem = super::create_memory(&config.memory, &config.workspace_dir, None)?; + + // Get total count for confirmation + let total = mem.count().await?; + + if total == 0 { + println!("No memories to reindex."); + return Ok(()); + } + + println!( + "\n{} Found {} memories to reindex.", + style("ℹ").blue().bold(), + style(total).cyan().bold() + ); + println!( + " This will clear the embedding cache and recompute all embeddings\n using the current embedding provider configuration.\n" + ); + + if !yes { + let confirmed = dialoguer::Confirm::new() + .with_prompt(" Proceed with reindex?") + .default(false) + .interact()?; + if !confirmed { + println!("Aborted."); + return Ok(()); + } + } + + println!("\n{} Reindexing memories...\n", style("⟳").yellow().bold()); + + // Create progress callback if enabled + let callback: Option> = if progress { + let last_percent = Arc::new(AtomicUsize::new(0)); + Some(Box::new(move |current, total| { + let percent = (current * 100) / total.max(1); + let last = last_percent.load(Ordering::Relaxed); + // Only print every 10% + if percent >= last + 10 || current == total { + last_percent.store(percent, Ordering::Relaxed); + eprint!("\r Progress: {current}/{total} ({percent}%)"); + if current == total { + eprintln!(); + } + } + })) + } else { + None + }; + + // Perform reindex + let reindexed = mem.reindex(callback).await?; + + println!( + "\n{} Reindexed {} memories successfully.", + style("✓").green().bold(), + style(reindexed).cyan().bold() + ); + + Ok(()) +} + fn parse_category(s: &str) -> MemoryCategory { match s.trim().to_ascii_lowercase().as_str() { "core" => MemoryCategory::Core, diff --git a/src/memory/sqlite.rs b/src/memory/sqlite.rs index 3e90ec6dc..90cb2a53b 100644 --- a/src/memory/sqlite.rs +++ b/src/memory/sqlite.rs @@ -779,6 +779,63 @@ impl Memory for SqliteMemory { .await .unwrap_or(false) } + + async fn reindex( + &self, + progress_callback: Option>, + ) -> anyhow::Result { + // Step 1: Get all memory entries + let entries = self.list(None, None).await?; + let total = entries.len(); + + if total == 0 { + return Ok(0); + } + + // Step 2: Clear embedding cache + { + let conn = self.conn.clone(); + tokio::task::spawn_blocking(move || -> anyhow::Result<()> { + let conn = conn.lock(); + conn.execute("DELETE FROM embedding_cache", [])?; + Ok(()) + }) + .await??; + } + + // Step 3: Recompute embeddings for each memory + let mut reindexed = 0; + for (idx, entry) in entries.iter().enumerate() { + // Compute new embedding + let embedding = self.get_or_compute_embedding(&entry.content).await?; + + if let Some(ref emb) = embedding { + // Update the embedding in the memories table + let conn = self.conn.clone(); + let entry_id = entry.id.clone(); + let emb_bytes = vector::vec_to_bytes(emb); + + tokio::task::spawn_blocking(move || -> anyhow::Result<()> { + let conn = conn.lock(); + conn.execute( + "UPDATE memories SET embedding = ?1 WHERE id = ?2", + params![emb_bytes, entry_id], + )?; + Ok(()) + }) + .await??; + + reindexed += 1; + } + + // Report progress + if let Some(ref cb) = progress_callback { + cb(idx + 1, total); + } + } + + Ok(reindexed) + } } #[cfg(test)] diff --git a/src/memory/traits.rs b/src/memory/traits.rs index de72923d3..f6b2030b8 100644 --- a/src/memory/traits.rs +++ b/src/memory/traits.rs @@ -92,6 +92,19 @@ pub trait Memory: Send + Sync { /// Health check async fn health_check(&self) -> bool; + + /// Rebuild embeddings for all memories using the current embedding provider. + /// Returns the number of memories reindexed, or an error if not supported. + /// + /// Use this after changing the embedding model to ensure vector search + /// works correctly with the new embeddings. + async fn reindex( + &self, + progress_callback: Option>, + ) -> anyhow::Result { + let _ = progress_callback; + anyhow::bail!("Reindex not supported by {} backend", self.name()) + } } #[cfg(test)] diff --git a/src/tools/mod.rs b/src/tools/mod.rs index d1f7ad4d5..a820af520 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -54,6 +54,7 @@ pub mod shell; pub mod traits; pub mod web_fetch; pub mod web_search_tool; +pub mod xlsx_read; pub use browser::{BrowserTool, ComputerUseConfig}; pub use browser_open::BrowserOpenTool; @@ -96,6 +97,7 @@ pub use traits::Tool; pub use traits::{ToolResult, ToolSpec}; pub use web_fetch::WebFetchTool; pub use web_search_tool::WebSearchTool; +pub use xlsx_read::XlsxReadTool; use crate::config::{Config, DelegateAgentConfig}; use crate::memory::Memory; @@ -302,6 +304,9 @@ pub fn all_tools_with_runtime( // PDF extraction (feature-gated at compile time via rag-pdf) tool_arcs.push(Arc::new(PdfReadTool::new(security.clone()))); + // XLSX text extraction + tool_arcs.push(Arc::new(XlsxReadTool::new(security.clone()))); + // Vision tools are always available tool_arcs.push(Arc::new(ScreenshotTool::new(security.clone()))); tool_arcs.push(Arc::new(ImageInfoTool::new(security.clone()))); diff --git a/src/tools/xlsx_read.rs b/src/tools/xlsx_read.rs new file mode 100644 index 000000000..673549cf3 --- /dev/null +++ b/src/tools/xlsx_read.rs @@ -0,0 +1,1180 @@ +use super::traits::{Tool, ToolResult}; +use crate::security::SecurityPolicy; +use async_trait::async_trait; +use serde_json::json; +use std::collections::HashMap; +use std::path::{Component, Path}; +use std::sync::Arc; + +/// Maximum XLSX file size (50 MB). +const MAX_XLSX_BYTES: u64 = 50 * 1024 * 1024; +/// Default character limit returned to the LLM. +const DEFAULT_MAX_CHARS: usize = 50_000; +/// Hard ceiling regardless of what the caller requests. +const MAX_OUTPUT_CHARS: usize = 200_000; +/// Upper bound for total uncompressed XML read from sheet files. +const MAX_TOTAL_SHEET_XML_BYTES: u64 = 16 * 1024 * 1024; + +/// Extract plain text from an XLSX file in the workspace. +pub struct XlsxReadTool { + security: Arc, +} + +impl XlsxReadTool { + pub fn new(security: Arc) -> Self { + Self { security } + } +} + +/// Extract plain text from XLSX bytes. +/// +/// XLSX is a ZIP archive containing `xl/worksheets/sheet*.xml` with cell data, +/// `xl/sharedStrings.xml` with a string pool, and `xl/workbook.xml` with sheet +/// names. Text cells reference the shared string pool by index; inline and +/// numeric values are taken directly from `` elements. +fn extract_xlsx_text(bytes: &[u8]) -> anyhow::Result { + extract_xlsx_text_with_limits(bytes, MAX_TOTAL_SHEET_XML_BYTES) +} + +fn extract_xlsx_text_with_limits( + bytes: &[u8], + max_total_sheet_xml_bytes: u64, +) -> anyhow::Result { + use std::io::Read; + + let cursor = std::io::Cursor::new(bytes); + let mut archive = zip::ZipArchive::new(cursor)?; + + // 1. Parse shared strings table. + let shared_strings = parse_shared_strings(&mut archive)?; + + // 2. Parse workbook.xml to get sheet names and rIds. + let sheet_entries = parse_workbook_sheets(&mut archive)?; + + // 3. Parse workbook.xml.rels to map rId → Target path. + let rel_targets = parse_workbook_rels(&mut archive)?; + + // 4. Build ordered list of (sheet_name, file_path) pairs. + let mut ordered_sheets: Vec<(String, String)> = Vec::new(); + for (sheet_name, r_id) in &sheet_entries { + if let Some(target) = rel_targets.get(r_id) { + if let Some(normalized) = normalize_sheet_target(target) { + ordered_sheets.push((sheet_name.clone(), normalized)); + } + } + } + + // Fallback: if workbook parsing yielded no sheets, scan ZIP entries directly. + if ordered_sheets.is_empty() { + let mut fallback_paths: Vec = (0..archive.len()) + .filter_map(|i| { + let name = archive.by_index(i).ok()?.name().to_string(); + if name.starts_with("xl/worksheets/sheet") && name.ends_with(".xml") { + Some(name) + } else { + None + } + }) + .collect(); + fallback_paths.sort_by(|a, b| { + let a_idx = sheet_numeric_index(a); + let b_idx = sheet_numeric_index(b); + a_idx.cmp(&b_idx).then_with(|| a.cmp(b)) + }); + + if fallback_paths.is_empty() { + anyhow::bail!("Not a valid XLSX (no worksheet XML files found)"); + } + + for (i, path) in fallback_paths.into_iter().enumerate() { + ordered_sheets.push((format!("Sheet{}", i + 1), path)); + } + } + + // 5. Extract cell text from each sheet. + let mut output = String::new(); + let mut total_sheet_xml_bytes = 0u64; + let multi_sheet = ordered_sheets.len() > 1; + + for (sheet_name, sheet_path) in &ordered_sheets { + let mut sheet_file = match archive.by_name(sheet_path) { + Ok(f) => f, + Err(_) => continue, + }; + + let sheet_xml_size = sheet_file.size(); + total_sheet_xml_bytes = total_sheet_xml_bytes + .checked_add(sheet_xml_size) + .ok_or_else(|| anyhow::anyhow!("Sheet XML payload size overflow"))?; + if total_sheet_xml_bytes > max_total_sheet_xml_bytes { + anyhow::bail!( + "Sheet XML payload too large: {} bytes (limit: {} bytes)", + total_sheet_xml_bytes, + max_total_sheet_xml_bytes + ); + } + + let mut xml_content = String::new(); + sheet_file.read_to_string(&mut xml_content)?; + + if multi_sheet { + if !output.is_empty() { + output.push('\n'); + } + use std::fmt::Write as _; + let _ = writeln!(output, "--- Sheet: {} ---", sheet_name); + } + + let sheet_text = extract_sheet_cells(&xml_content, &shared_strings)?; + output.push_str(&sheet_text); + } + + Ok(output) +} + +/// Parse `xl/sharedStrings.xml` into a `Vec` indexed by position. +fn parse_shared_strings( + archive: &mut zip::ZipArchive, +) -> anyhow::Result> { + use quick_xml::events::Event; + use quick_xml::Reader; + use std::io::Read; + + let mut xml = String::new(); + match archive.by_name("xl/sharedStrings.xml") { + Ok(mut f) => { + f.read_to_string(&mut xml)?; + } + Err(zip::result::ZipError::FileNotFound) => return Ok(Vec::new()), + Err(e) => return Err(e.into()), + } + + let mut strings = Vec::new(); + let mut reader = Reader::from_str(&xml); + let mut in_si = false; + let mut in_t = false; + let mut current = String::new(); + + loop { + match reader.read_event() { + Ok(Event::Start(e)) => { + let qname = e.name(); + let name = local_name(qname.as_ref()); + if name == b"si" { + in_si = true; + current.clear(); + } else if in_si && name == b"t" { + in_t = true; + } + } + Ok(Event::End(e)) => { + let qname = e.name(); + let name = local_name(qname.as_ref()); + if name == b"t" { + in_t = false; + } else if name == b"si" { + in_si = false; + strings.push(std::mem::take(&mut current)); + } + } + Ok(Event::Text(e)) => { + if in_t { + current.push_str(&e.unescape()?); + } + } + Ok(Event::Eof) => break, + Err(e) => return Err(e.into()), + _ => {} + } + } + + Ok(strings) +} + +/// Parse `xl/workbook.xml` → Vec<(sheet_name, rId)>. +fn parse_workbook_sheets( + archive: &mut zip::ZipArchive, +) -> anyhow::Result> { + use quick_xml::events::Event; + use quick_xml::Reader; + use std::io::Read; + + let mut xml = String::new(); + match archive.by_name("xl/workbook.xml") { + Ok(mut f) => { + f.read_to_string(&mut xml)?; + } + Err(zip::result::ZipError::FileNotFound) => return Ok(Vec::new()), + Err(e) => return Err(e.into()), + } + + let mut sheets = Vec::new(); + let mut reader = Reader::from_str(&xml); + + loop { + match reader.read_event() { + Ok(Event::Start(ref e) | Event::Empty(ref e)) => { + let qname = e.name(); + if local_name(qname.as_ref()) == b"sheet" { + let mut name = None; + let mut r_id = None; + for attr in e.attributes().flatten() { + let key = attr.key.as_ref(); + let local = local_name(key); + if local == b"name" { + name = Some( + attr.decode_and_unescape_value(reader.decoder())? + .into_owned(), + ); + } else if key == b"r:id" || local == b"id" { + // Accept both r:id and {ns}:id variants. + // Only take the relationship id (starts with "rId"). + let val = attr + .decode_and_unescape_value(reader.decoder())? + .into_owned(); + if val.starts_with("rId") { + r_id = Some(val); + } + } + } + if let (Some(n), Some(r)) = (name, r_id) { + sheets.push((n, r)); + } + } + } + Ok(Event::Eof) => break, + Err(e) => return Err(e.into()), + _ => {} + } + } + + Ok(sheets) +} + +/// Parse `xl/_rels/workbook.xml.rels` → HashMap. +fn parse_workbook_rels( + archive: &mut zip::ZipArchive, +) -> anyhow::Result> { + use quick_xml::events::Event; + use quick_xml::Reader; + use std::io::Read; + + let mut xml = String::new(); + match archive.by_name("xl/_rels/workbook.xml.rels") { + Ok(mut f) => { + f.read_to_string(&mut xml)?; + } + Err(zip::result::ZipError::FileNotFound) => return Ok(HashMap::new()), + Err(e) => return Err(e.into()), + } + + let mut rels = HashMap::new(); + let mut reader = Reader::from_str(&xml); + + loop { + match reader.read_event() { + Ok(Event::Start(ref e) | Event::Empty(ref e)) => { + let qname = e.name(); + if local_name(qname.as_ref()) == b"Relationship" { + let mut rel_id = None; + let mut target = None; + for attr in e.attributes().flatten() { + let key = local_name(attr.key.as_ref()); + if key.eq_ignore_ascii_case(b"id") { + rel_id = Some( + attr.decode_and_unescape_value(reader.decoder())? + .into_owned(), + ); + } else if key.eq_ignore_ascii_case(b"target") { + target = Some( + attr.decode_and_unescape_value(reader.decoder())? + .into_owned(), + ); + } + } + if let (Some(id), Some(t)) = (rel_id, target) { + rels.insert(id, t); + } + } + } + Ok(Event::Eof) => break, + Err(e) => return Err(e.into()), + _ => {} + } + } + + Ok(rels) +} + +/// Extract cell text from a single worksheet XML string. +/// +/// Cells are output as tab-separated values per row, newline-separated per row. +fn extract_sheet_cells(xml: &str, shared_strings: &[String]) -> anyhow::Result { + use quick_xml::events::Event; + use quick_xml::Reader; + + let mut reader = Reader::from_str(xml); + let mut output = String::new(); + + let mut in_row = false; + let mut in_cell = false; + let mut in_value = false; + let mut cell_type = CellType::Number; + let mut cell_value = String::new(); + let mut row_cells: Vec = Vec::new(); + + loop { + match reader.read_event() { + Ok(Event::Start(e)) => { + let qname = e.name(); + let name = local_name(qname.as_ref()); + match name { + b"row" => { + in_row = true; + row_cells.clear(); + } + b"c" if in_row => { + in_cell = true; + cell_type = CellType::Number; + cell_value.clear(); + for attr in e.attributes().flatten() { + if attr.key.as_ref() == b"t" { + let val = attr.decode_and_unescape_value(reader.decoder())?; + cell_type = match val.as_ref() { + "s" => CellType::SharedString, + "inlineStr" => CellType::InlineString, + "b" => CellType::Boolean, + _ => CellType::Number, + }; + } + } + } + b"v" if in_cell => { + in_value = true; + } + b"t" if in_cell && cell_type == CellType::InlineString => { + // Inline string: text is inside ... + in_value = true; + } + _ => {} + } + } + Ok(Event::End(e)) => { + let qname = e.name(); + let name = local_name(qname.as_ref()); + match name { + b"row" => { + in_row = false; + if !row_cells.is_empty() { + if !output.is_empty() { + output.push('\n'); + } + output.push_str(&row_cells.join("\t")); + } + } + b"c" if in_cell => { + in_cell = false; + let resolved = match cell_type { + CellType::SharedString => { + if let Ok(idx) = cell_value.trim().parse::() { + shared_strings.get(idx).cloned().unwrap_or_default() + } else { + cell_value.clone() + } + } + CellType::Boolean => match cell_value.trim() { + "1" => "TRUE".to_string(), + "0" => "FALSE".to_string(), + other => other.to_string(), + }, + _ => cell_value.clone(), + }; + row_cells.push(resolved); + } + b"v" => { + in_value = false; + } + b"t" if in_cell => { + in_value = false; + } + _ => {} + } + } + Ok(Event::Text(e)) => { + if in_value { + cell_value.push_str(&e.unescape()?); + } + } + Ok(Event::Eof) => break, + Err(e) => return Err(e.into()), + _ => {} + } + } + + // Flush last row if not terminated by . + if in_row && !row_cells.is_empty() { + if !output.is_empty() { + output.push('\n'); + } + output.push_str(&row_cells.join("\t")); + } + + if !output.is_empty() { + output.push('\n'); + } + + Ok(output) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum CellType { + Number, + SharedString, + InlineString, + Boolean, +} + +fn sheet_numeric_index(sheet_path: &str) -> Option { + let stem = Path::new(sheet_path).file_stem()?.to_string_lossy(); + let digits = stem.strip_prefix("sheet")?; + digits.parse::().ok() +} + +fn local_name(name: &[u8]) -> &[u8] { + name.rsplit(|b| *b == b':').next().unwrap_or(name) +} + +fn normalize_sheet_target(target: &str) -> Option { + if target.contains("://") { + return None; + } + + let mut segments = Vec::new(); + for component in Path::new("xl").join(target).components() { + match component { + Component::Normal(part) => segments.push(part.to_string_lossy().to_string()), + Component::ParentDir => { + segments.pop()?; + } + _ => {} + } + } + + let normalized = segments.join("/"); + if normalized.starts_with("xl/worksheets/") && normalized.ends_with(".xml") { + Some(normalized) + } else { + None + } +} + +fn parse_max_chars(args: &serde_json::Value) -> anyhow::Result { + let Some(value) = args.get("max_chars") else { + return Ok(DEFAULT_MAX_CHARS); + }; + + let serde_json::Value::Number(number) = value else { + anyhow::bail!("Invalid 'max_chars': expected a positive integer"); + }; + let Some(raw) = number.as_u64() else { + anyhow::bail!("Invalid 'max_chars': expected a positive integer"); + }; + if raw == 0 { + anyhow::bail!("Invalid 'max_chars': must be >= 1"); + } + + Ok(usize::try_from(raw) + .unwrap_or(MAX_OUTPUT_CHARS) + .min(MAX_OUTPUT_CHARS)) +} + +#[async_trait] +impl Tool for XlsxReadTool { + fn name(&self) -> &str { + "xlsx_read" + } + + fn description(&self) -> &str { + "Extract plain text and numeric data from an XLSX (Excel) file in the workspace. \ + Returns tab-separated cell values per row for each sheet. \ + No formulas, charts, styles, or merged-cell awareness." + } + + fn parameters_schema(&self) -> serde_json::Value { + json!({ + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Path to the XLSX file. Relative paths resolve from workspace." + }, + "max_chars": { + "type": "integer", + "description": "Maximum characters to return (default: 50000, max: 200000)", + "minimum": 1, + "maximum": 200_000 + } + }, + "required": ["path"] + }) + } + + async fn execute(&self, args: serde_json::Value) -> anyhow::Result { + let path = args + .get("path") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?; + + let max_chars = match parse_max_chars(&args) { + Ok(value) => value, + Err(err) => { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(err.to_string()), + }) + } + }; + + if self.security.is_rate_limited() { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some("Rate limit exceeded: too many actions in the last hour".into()), + }); + } + + if !self.security.is_path_allowed(path) { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("Path not allowed by security policy: {path}")), + }); + } + + if !self.security.record_action() { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some("Rate limit exceeded: action budget exhausted".into()), + }); + } + + let full_path = self.security.workspace_dir.join(path); + + let resolved_path = match tokio::fs::canonicalize(&full_path).await { + Ok(p) => p, + Err(e) => { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("Failed to resolve file path: {e}")), + }); + } + }; + + if !self.security.is_resolved_path_allowed(&resolved_path) { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some( + self.security + .resolved_path_violation_message(&resolved_path), + ), + }); + } + + tracing::debug!("Reading XLSX: {}", resolved_path.display()); + + match tokio::fs::metadata(&resolved_path).await { + Ok(meta) => { + if meta.len() > MAX_XLSX_BYTES { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!( + "XLSX too large: {} bytes (limit: {MAX_XLSX_BYTES} bytes)", + meta.len() + )), + }); + } + } + Err(e) => { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("Failed to read file metadata: {e}")), + }); + } + } + + let bytes = match tokio::fs::read(&resolved_path).await { + Ok(b) => b, + Err(e) => { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("Failed to read XLSX file: {e}")), + }); + } + }; + + let text = match tokio::task::spawn_blocking(move || extract_xlsx_text(&bytes)).await { + Ok(Ok(t)) => t, + Ok(Err(e)) => { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("XLSX extraction failed: {e}")), + }); + } + Err(e) => { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("XLSX extraction task panicked: {e}")), + }); + } + }; + + if text.trim().is_empty() { + return Ok(ToolResult { + success: true, + output: "XLSX contains no extractable text".into(), + error: None, + }); + } + + let output = if text.chars().count() > max_chars { + let mut truncated: String = text.chars().take(max_chars).collect(); + use std::fmt::Write as _; + let _ = write!(truncated, "\n\n... [truncated at {max_chars} chars]"); + truncated + } else { + text + }; + + Ok(ToolResult { + success: true, + output, + error: None, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::security::{AutonomyLevel, SecurityPolicy}; + use tempfile::TempDir; + + fn test_security(workspace: std::path::PathBuf) -> Arc { + Arc::new(SecurityPolicy { + autonomy: AutonomyLevel::Supervised, + workspace_dir: workspace, + ..SecurityPolicy::default() + }) + } + + fn test_security_with_limit( + workspace: std::path::PathBuf, + max_actions: u32, + ) -> Arc { + Arc::new(SecurityPolicy { + autonomy: AutonomyLevel::Supervised, + workspace_dir: workspace, + max_actions_per_hour: max_actions, + ..SecurityPolicy::default() + }) + } + + /// Build a minimal valid XLSX (ZIP) in memory with one sheet containing + /// the given rows. Each inner `Vec<&str>` is a row of cell values. + fn minimal_xlsx_bytes(rows: &[Vec<&str>]) -> Vec { + use std::fmt::Write as FmtWrite; + use std::io::Write; + + // Build shared strings from all unique cell values. + let mut all_values: Vec = Vec::new(); + for row in rows { + for cell in row { + if !all_values.contains(&cell.to_string()) { + all_values.push(cell.to_string()); + } + } + } + + let mut ss_entries = String::new(); + for val in &all_values { + write!(ss_entries, "{val}").unwrap(); + } + let shared_strings_xml = format!( + r#" +{ss_entries}"#, + all_values.len(), + all_values.len() + ); + + // Build sheet XML. + let mut sheet_rows = String::new(); + for (r_idx, row) in rows.iter().enumerate() { + write!(sheet_rows, r#""#, r_idx + 1).unwrap(); + for (c_idx, cell) in row.iter().enumerate() { + #[allow(clippy::cast_possible_truncation)] + let col_letter = (b'A' + c_idx as u8) as char; + let cell_ref = format!("{}{}", col_letter, r_idx + 1); + let ss_idx = all_values.iter().position(|v| v == cell).unwrap(); + write!(sheet_rows, r#"{ss_idx}"#).unwrap(); + } + sheet_rows.push_str(""); + } + let sheet_xml = format!( + r#" + +{sheet_rows} +"# + ); + + let workbook_xml = r#" + + +"#; + + let rels_xml = r#" + + +"#; + + let buf = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + + zip.start_file("xl/sharedStrings.xml", options).unwrap(); + zip.write_all(shared_strings_xml.as_bytes()).unwrap(); + + zip.start_file("xl/workbook.xml", options).unwrap(); + zip.write_all(workbook_xml.as_bytes()).unwrap(); + + zip.start_file("xl/_rels/workbook.xml.rels", options) + .unwrap(); + zip.write_all(rels_xml.as_bytes()).unwrap(); + + zip.start_file("xl/worksheets/sheet1.xml", options).unwrap(); + zip.write_all(sheet_xml.as_bytes()).unwrap(); + + zip.finish().unwrap().into_inner() + } + + /// Build an XLSX with two sheets. + fn two_sheet_xlsx_bytes( + sheet1_name: &str, + sheet1_rows: &[Vec<&str>], + sheet2_name: &str, + sheet2_rows: &[Vec<&str>], + ) -> Vec { + use std::fmt::Write as FmtWrite; + use std::io::Write; + + // Collect all unique values across both sheets. + let mut all_values: Vec = Vec::new(); + for rows in [sheet1_rows, sheet2_rows] { + for row in rows { + for cell in row { + if !all_values.contains(&cell.to_string()) { + all_values.push(cell.to_string()); + } + } + } + } + + let mut ss_entries = String::new(); + for val in &all_values { + write!(ss_entries, "{val}").unwrap(); + } + let shared_strings_xml = format!( + r#" +{ss_entries}"#, + all_values.len(), + all_values.len() + ); + + let build_sheet = |rows: &[Vec<&str>]| -> String { + let mut sheet_rows = String::new(); + for (r_idx, row) in rows.iter().enumerate() { + write!(sheet_rows, r#""#, r_idx + 1).unwrap(); + for (c_idx, cell) in row.iter().enumerate() { + #[allow(clippy::cast_possible_truncation)] + let col_letter = (b'A' + c_idx as u8) as char; + let cell_ref = format!("{}{}", col_letter, r_idx + 1); + let ss_idx = all_values.iter().position(|v| v == cell).unwrap(); + write!(sheet_rows, r#"{ss_idx}"#).unwrap(); + } + sheet_rows.push_str(""); + } + format!( + r#" + +{sheet_rows} +"# + ) + }; + + let workbook_xml = format!( + r#" + + + + + +"# + ); + + let rels_xml = r#" + + + +"#; + + let buf = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + + zip.start_file("xl/sharedStrings.xml", options).unwrap(); + zip.write_all(shared_strings_xml.as_bytes()).unwrap(); + + zip.start_file("xl/workbook.xml", options).unwrap(); + zip.write_all(workbook_xml.as_bytes()).unwrap(); + + zip.start_file("xl/_rels/workbook.xml.rels", options) + .unwrap(); + zip.write_all(rels_xml.as_bytes()).unwrap(); + + zip.start_file("xl/worksheets/sheet1.xml", options).unwrap(); + zip.write_all(build_sheet(sheet1_rows).as_bytes()).unwrap(); + + zip.start_file("xl/worksheets/sheet2.xml", options).unwrap(); + zip.write_all(build_sheet(sheet2_rows).as_bytes()).unwrap(); + + zip.finish().unwrap().into_inner() + } + + #[test] + fn name_is_xlsx_read() { + let tool = XlsxReadTool::new(test_security(std::env::temp_dir())); + assert_eq!(tool.name(), "xlsx_read"); + } + + #[test] + fn description_not_empty() { + let tool = XlsxReadTool::new(test_security(std::env::temp_dir())); + assert!(!tool.description().is_empty()); + } + + #[test] + fn schema_has_path_required() { + let tool = XlsxReadTool::new(test_security(std::env::temp_dir())); + let schema = tool.parameters_schema(); + assert!(schema["properties"]["path"].is_object()); + assert!(schema["properties"]["max_chars"].is_object()); + let required = schema["required"].as_array().unwrap(); + assert!(required.contains(&json!("path"))); + } + + #[test] + fn spec_matches_metadata() { + let tool = XlsxReadTool::new(test_security(std::env::temp_dir())); + let spec = tool.spec(); + assert_eq!(spec.name, "xlsx_read"); + assert!(spec.parameters.is_object()); + } + + #[tokio::test] + async fn missing_path_param_returns_error() { + let tool = XlsxReadTool::new(test_security(std::env::temp_dir())); + let result = tool.execute(json!({})).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("path")); + } + + #[tokio::test] + async fn absolute_path_is_blocked() { + let tool = XlsxReadTool::new(test_security(std::env::temp_dir())); + let result = tool.execute(json!({"path": "/etc/passwd"})).await.unwrap(); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("not allowed")); + } + + #[tokio::test] + async fn path_traversal_is_blocked() { + let tmp = TempDir::new().unwrap(); + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool + .execute(json!({"path": "../../../etc/passwd"})) + .await + .unwrap(); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("not allowed")); + } + + #[tokio::test] + async fn nonexistent_file_returns_error() { + let tmp = TempDir::new().unwrap(); + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool.execute(json!({"path": "missing.xlsx"})).await.unwrap(); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("Failed to resolve")); + } + + #[tokio::test] + async fn rate_limit_blocks_request() { + let tmp = TempDir::new().unwrap(); + let tool = XlsxReadTool::new(test_security_with_limit(tmp.path().to_path_buf(), 0)); + let result = tool.execute(json!({"path": "any.xlsx"})).await.unwrap(); + assert!(!result.success); + assert!(result.error.as_deref().unwrap_or("").contains("Rate limit")); + } + + #[tokio::test] + async fn extracts_text_from_valid_xlsx() { + let tmp = TempDir::new().unwrap(); + let xlsx_path = tmp.path().join("data.xlsx"); + let rows = vec![vec!["Name", "Age"], vec!["Alice", "30"]]; + tokio::fs::write(&xlsx_path, minimal_xlsx_bytes(&rows)) + .await + .unwrap(); + + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool.execute(json!({"path": "data.xlsx"})).await.unwrap(); + assert!(result.success, "error: {:?}", result.error); + assert!( + result.output.contains("Name"), + "expected 'Name' in output, got: {}", + result.output + ); + assert!(result.output.contains("Age")); + assert!(result.output.contains("Alice")); + assert!(result.output.contains("30")); + } + + #[tokio::test] + async fn extracts_tab_separated_columns() { + let tmp = TempDir::new().unwrap(); + let xlsx_path = tmp.path().join("cols.xlsx"); + let rows = vec![vec!["A", "B", "C"]]; + tokio::fs::write(&xlsx_path, minimal_xlsx_bytes(&rows)) + .await + .unwrap(); + + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool.execute(json!({"path": "cols.xlsx"})).await.unwrap(); + assert!(result.success); + assert!( + result.output.contains("A\tB\tC"), + "expected tab-separated output, got: {:?}", + result.output + ); + } + + #[tokio::test] + async fn extracts_multiple_sheets() { + let tmp = TempDir::new().unwrap(); + let xlsx_path = tmp.path().join("multi.xlsx"); + let bytes = two_sheet_xlsx_bytes( + "Sales", + &[vec!["Product", "Revenue"], vec!["Widget", "1000"]], + "Costs", + &[vec!["Item", "Amount"], vec!["Rent", "500"]], + ); + tokio::fs::write(&xlsx_path, bytes).await.unwrap(); + + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool.execute(json!({"path": "multi.xlsx"})).await.unwrap(); + assert!(result.success, "error: {:?}", result.error); + assert!(result.output.contains("--- Sheet: Sales ---")); + assert!(result.output.contains("--- Sheet: Costs ---")); + assert!(result.output.contains("Widget")); + assert!(result.output.contains("Rent")); + } + + #[tokio::test] + async fn invalid_zip_returns_extraction_error() { + let tmp = TempDir::new().unwrap(); + let xlsx_path = tmp.path().join("bad.xlsx"); + tokio::fs::write(&xlsx_path, b"this is not a zip file") + .await + .unwrap(); + + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool.execute(json!({"path": "bad.xlsx"})).await.unwrap(); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("extraction failed")); + } + + #[tokio::test] + async fn max_chars_truncates_output() { + let tmp = TempDir::new().unwrap(); + let long_text = "B".repeat(200); + let rows = vec![vec![long_text.as_str(); 10]]; + let xlsx_path = tmp.path().join("long.xlsx"); + tokio::fs::write(&xlsx_path, minimal_xlsx_bytes(&rows)) + .await + .unwrap(); + + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool + .execute(json!({"path": "long.xlsx", "max_chars": 50})) + .await + .unwrap(); + assert!(result.success); + assert!(result.output.contains("truncated")); + } + + #[tokio::test] + async fn invalid_max_chars_returns_tool_error() { + let tmp = TempDir::new().unwrap(); + let xlsx_path = tmp.path().join("data.xlsx"); + let rows = vec![vec!["Hello"]]; + tokio::fs::write(&xlsx_path, minimal_xlsx_bytes(&rows)) + .await + .unwrap(); + + let tool = XlsxReadTool::new(test_security(tmp.path().to_path_buf())); + let result = tool + .execute(json!({"path": "data.xlsx", "max_chars": "100"})) + .await + .unwrap(); + assert!(!result.success); + assert!(result.error.as_deref().unwrap_or("").contains("max_chars")); + } + + #[test] + fn shared_string_reference_resolved() { + let rows = vec![vec!["Hello", "World"]]; + let bytes = minimal_xlsx_bytes(&rows); + let text = extract_xlsx_text(&bytes).unwrap(); + assert!(text.contains("Hello")); + assert!(text.contains("World")); + } + + #[test] + fn cumulative_sheet_xml_limit_is_enforced() { + let rows = vec![vec!["Alpha", "Beta"]]; + let bytes = minimal_xlsx_bytes(&rows); + let error = extract_xlsx_text_with_limits(&bytes, 64).unwrap_err(); + assert!(error.to_string().contains("Sheet XML payload too large")); + } + + #[test] + fn numeric_cells_extracted_directly() { + use std::io::Write; + + // Build a sheet with numeric cells (no t="s" attribute). + let sheet_xml = r#" + + +423.14 + +"#; + + let workbook_xml = r#" + + +"#; + + let rels_xml = r#" + + +"#; + + let buf = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + + zip.start_file("xl/workbook.xml", options).unwrap(); + zip.write_all(workbook_xml.as_bytes()).unwrap(); + zip.start_file("xl/_rels/workbook.xml.rels", options) + .unwrap(); + zip.write_all(rels_xml.as_bytes()).unwrap(); + zip.start_file("xl/worksheets/sheet1.xml", options).unwrap(); + zip.write_all(sheet_xml.as_bytes()).unwrap(); + + let bytes = zip.finish().unwrap().into_inner(); + let text = extract_xlsx_text(&bytes).unwrap(); + assert!(text.contains("42"), "got: {text}"); + assert!(text.contains("3.14"), "got: {text}"); + assert!(text.contains("42\t3.14"), "got: {text}"); + } + + #[test] + fn fallback_when_no_workbook() { + use std::io::Write; + + // ZIP with only sheet files, no workbook.xml. + let sheet_xml = r#" + + +99 + +"#; + + let buf = std::io::Cursor::new(Vec::new()); + let mut zip = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + + zip.start_file("xl/worksheets/sheet1.xml", options).unwrap(); + zip.write_all(sheet_xml.as_bytes()).unwrap(); + + let bytes = zip.finish().unwrap().into_inner(); + let text = extract_xlsx_text(&bytes).unwrap(); + assert!(text.contains("99"), "got: {text}"); + } + + #[cfg(unix)] + #[tokio::test] + async fn symlink_escape_is_blocked() { + use std::os::unix::fs::symlink; + + let root = TempDir::new().unwrap(); + let workspace = root.path().join("workspace"); + let outside = root.path().join("outside"); + tokio::fs::create_dir_all(&workspace).await.unwrap(); + tokio::fs::create_dir_all(&outside).await.unwrap(); + let rows = vec![vec!["secret"]]; + tokio::fs::write(outside.join("secret.xlsx"), minimal_xlsx_bytes(&rows)) + .await + .unwrap(); + symlink(outside.join("secret.xlsx"), workspace.join("link.xlsx")).unwrap(); + + let tool = XlsxReadTool::new(test_security(workspace)); + let result = tool.execute(json!({"path": "link.xlsx"})).await.unwrap(); + assert!(!result.success); + assert!(result + .error + .as_deref() + .unwrap_or("") + .contains("escapes workspace")); + } +}