diff --git a/docs/config-reference.md b/docs/config-reference.md index 7b5bedcdf..5cd964ef7 100644 --- a/docs/config-reference.md +++ b/docs/config-reference.md @@ -456,12 +456,52 @@ Notes: | `allowed_domains` | `[]` | Allowed domains for HTTP requests (exact/subdomain match, or `"*"` for all public domains) | | `max_response_size` | `1000000` | Maximum response size in bytes (default: 1 MB) | | `timeout_secs` | `30` | Request timeout in seconds | +| `user_agent` | `ZeroClaw/1.0` | User-Agent header for outbound HTTP requests | Notes: - Deny-by-default: if `allowed_domains` is empty, all HTTP requests are rejected. - Use exact domain or subdomain matching (e.g. `"api.example.com"`, `"example.com"`), or `"*"` to allow any public domain. - Local/private targets are still blocked even when `"*"` is configured. +- Shell `curl`/`wget` are classified as high-risk and may be blocked by autonomy policy. Prefer `http_request` for direct HTTP calls. + +## `[web_fetch]` + +| Key | Default | Purpose | +|---|---|---| +| `enabled` | `false` | Enable `web_fetch` for page-to-text extraction | +| `provider` | `fast_html2md` | Fetch/render backend: `fast_html2md`, `nanohtml2text`, `firecrawl` | +| `api_key` | unset | API key for provider backends that require it (e.g. `firecrawl`) | +| `api_url` | unset | Optional API URL override (self-hosted/alternate endpoint) | +| `allowed_domains` | `["*"]` | Domain allowlist (`"*"` allows all public domains) | +| `blocked_domains` | `[]` | Denylist applied before allowlist | +| `max_response_size` | `500000` | Maximum returned payload size in bytes | +| `timeout_secs` | `30` | Request timeout in seconds | +| `user_agent` | `ZeroClaw/1.0` | User-Agent header for fetch requests | + +Notes: + +- `web_fetch` is optimized for summarization/data extraction from web pages. +- Redirect targets are revalidated against allow/deny domain policy. +- Local/private network targets remain blocked even when `allowed_domains = ["*"]`. 
+ +## `[web_search]` + +| Key | Default | Purpose | +|---|---|---| +| `enabled` | `false` | Enable `web_search_tool` | +| `provider` | `duckduckgo` | Search backend: `duckduckgo`, `brave`, `firecrawl` | +| `api_key` | unset | Generic provider key (used by `firecrawl`, fallback for `brave`) | +| `api_url` | unset | Optional API URL override | +| `brave_api_key` | unset | Dedicated Brave key (required for `provider = "brave"` unless `api_key` is set) | +| `max_results` | `5` | Maximum search results returned (clamped to 1-10) | +| `timeout_secs` | `15` | Request timeout in seconds | +| `user_agent` | `ZeroClaw/1.0` | User-Agent header for search requests | + +Notes: + +- If DuckDuckGo returns `403`/`429` in your network, switch provider to `brave` or `firecrawl`. +- `web_search` finds candidate URLs; pair it with `web_fetch` for page content extraction. ## `[gateway]` diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 903ab409c..c72826fee 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -192,6 +192,97 @@ zeroclaw channel doctor Then verify channel-specific credentials + allowlist fields in config. +## Web Access Issues + +### `curl`/`wget` blocked in shell tool + +Symptom: + +- tool output includes `Command blocked: high-risk command is disallowed by policy` +- model says `curl`/`wget` is blocked + +Why this happens: + +- `curl`/`wget` are high-risk shell commands and may be blocked by autonomy policy. 
+ +Preferred fix: + +- use purpose-built tools instead of shell fetch: + - `http_request` for direct API/HTTP calls + - `web_fetch` for page content extraction/summarization + +Minimal config: + +```toml +[http_request] +enabled = true +allowed_domains = ["*"] + +[web_fetch] +enabled = true +provider = "fast_html2md" +allowed_domains = ["*"] +``` + +### `web_search_tool` fails with `403`/`429` + +Symptom: + +- tool output includes `DuckDuckGo search failed with status: 403` (or `429`) + +Why this happens: + +- some networks/proxies/rate limits block DuckDuckGo HTML search endpoint traffic. + +Fix options: + +1. Switch provider to Brave (recommended when you have an API key): + +```toml +[web_search] +enabled = true +provider = "brave" +brave_api_key = "<your-brave-api-key>" +``` + +2. Switch provider to Firecrawl (if enabled in your build): + +```toml +[web_search] +enabled = true +provider = "firecrawl" +api_key = "<your-firecrawl-api-key>" +``` + +3. Keep DuckDuckGo for search, but use `web_fetch` to read pages once you have URLs. 
+ +### `web_fetch`/`http_request` says host is not allowed + +Symptom: + +- errors like `Host '<host>' is not in http_request.allowed_domains` +- or `web_fetch tool is enabled but no allowed_domains are configured` + +Fix: + +- include exact domains or `"*"` for public internet access: + +```toml +[http_request] +enabled = true +allowed_domains = ["*"] + +[web_fetch] +enabled = true +allowed_domains = ["*"] +blocked_domains = [] +``` + +Security notes: + +- local/private network targets are blocked even with `"*"` +- keep explicit domain allowlists in production environments when possible + ## Service Mode ### Service installed but not running diff --git a/src/security/policy.rs b/src/security/policy.rs index 3c4c40a66..819e151a7 100644 --- a/src/security/policy.rs +++ b/src/security/policy.rs @@ -700,6 +700,13 @@ impl SecurityPolicy { if risk == CommandRiskLevel::High { if self.block_high_risk_commands { + let lower = command.to_ascii_lowercase(); + if lower.contains("curl") || lower.contains("wget") { + return Err( + "Command blocked: high-risk command is disallowed by policy. Shell curl/wget are blocked; use `http_request` or `web_fetch` with configured allowed_domains." 
+ .into(), + ); + } return Err("Command blocked: high-risk command is disallowed by policy".into()); } if self.autonomy == AutonomyLevel::Supervised && !approved { diff --git a/src/tools/web_search_tool.rs b/src/tools/web_search_tool.rs index b869d26ce..28ccb3494 100644 --- a/src/tools/web_search_tool.rs +++ b/src/tools/web_search_tool.rs @@ -2,6 +2,7 @@ use super::traits::{Tool, ToolResult}; use crate::security::SecurityPolicy; use async_trait::async_trait; use regex::Regex; +use reqwest::StatusCode; use serde_json::json; use std::sync::Arc; use std::time::Duration; @@ -19,6 +20,18 @@ pub struct WebSearchTool { } impl WebSearchTool { + fn duckduckgo_status_hint(status: StatusCode) -> &'static str { + match status { + StatusCode::FORBIDDEN | StatusCode::TOO_MANY_REQUESTS => { + " DuckDuckGo may be blocking this network. Try [web_search].provider = \"brave\" with [web_search].brave_api_key, or set provider = \"firecrawl\"." + } + StatusCode::SERVICE_UNAVAILABLE | StatusCode::BAD_GATEWAY | StatusCode::GATEWAY_TIMEOUT => { + " DuckDuckGo may be temporarily unavailable. Retry later or switch providers." + } + _ => "", + } + } + pub fn new( security: Arc<SecurityPolicy>, provider: String, @@ -48,12 +61,18 @@ impl WebSearchTool { .user_agent(self.user_agent.as_str()) .build()?; - let response = client.get(&search_url).send().await?; + let response = client.get(&search_url).send().await.map_err(|e| { + anyhow::anyhow!( + "DuckDuckGo search request failed: {e}. Check outbound network/proxy settings, or switch [web_search].provider to \"brave\"/\"firecrawl\"." 
+ ) + })?; if !response.status().is_success() { + let status = response.status(); anyhow::bail!( - "DuckDuckGo search failed with status: {}", - response.status() + "DuckDuckGo search failed with status: {}.{}", + status, + Self::duckduckgo_status_hint(status) ); } @@ -484,6 +503,20 @@ mod tests { assert!(!result.contains("rut=test")); } + #[test] + fn duckduckgo_status_hint_for_403_mentions_provider_switch() { + let hint = WebSearchTool::duckduckgo_status_hint(StatusCode::FORBIDDEN); + assert!(hint.contains("provider")); + assert!(hint.contains("brave")); + } + + #[test] + fn duckduckgo_status_hint_for_500_is_empty() { + assert!( + WebSearchTool::duckduckgo_status_hint(StatusCode::INTERNAL_SERVER_ERROR).is_empty() + ); + } + #[test] fn test_constructor_clamps_web_search_limits() { let tool = WebSearchTool::new(