fix(web): improve web access guidance and search failure diagnostics

This commit is contained in:
argenis de la rosa 2026-02-25 18:40:44 -05:00 committed by Argenis
parent 6ce47af3d6
commit ffaf927690
4 changed files with 174 additions and 3 deletions

View File

@ -456,12 +456,52 @@ Notes:
| `allowed_domains` | `[]` | Allowed domains for HTTP requests (exact/subdomain match, or `"*"` for all public domains) |
| `max_response_size` | `1000000` | Maximum response size in bytes (default: 1 MB) |
| `timeout_secs` | `30` | Request timeout in seconds |
| `user_agent` | `ZeroClaw/1.0` | User-Agent header for outbound HTTP requests |
Notes:
- Deny-by-default: if `allowed_domains` is empty, all HTTP requests are rejected.
- Use exact domain or subdomain matching (e.g. `"api.example.com"`, `"example.com"`), or `"*"` to allow any public domain.
- Local/private targets are still blocked even when `"*"` is configured.
- Shell `curl`/`wget` are classified as high-risk and may be blocked by autonomy policy. Prefer `http_request` for direct HTTP calls.
## `[web_fetch]`
| Key | Default | Purpose |
|---|---|---|
| `enabled` | `false` | Enable `web_fetch` for page-to-text extraction |
| `provider` | `fast_html2md` | Fetch/render backend: `fast_html2md`, `nanohtml2text`, `firecrawl` |
| `api_key` | unset | API key for provider backends that require it (e.g. `firecrawl`) |
| `api_url` | unset | Optional API URL override (self-hosted/alternate endpoint) |
| `allowed_domains` | `["*"]` | Domain allowlist (`"*"` allows all public domains) |
| `blocked_domains` | `[]` | Denylist applied before allowlist |
| `max_response_size` | `500000` | Maximum returned payload size in bytes |
| `timeout_secs` | `30` | Request timeout in seconds |
| `user_agent` | `ZeroClaw/1.0` | User-Agent header for fetch requests |
Notes:
- `web_fetch` is optimized for summarization/data extraction from web pages.
- Redirect targets are revalidated against allow/deny domain policy.
- Local/private network targets remain blocked even when `allowed_domains = ["*"]`.
## `[web_search]`
| Key | Default | Purpose |
|---|---|---|
| `enabled` | `false` | Enable `web_search_tool` |
| `provider` | `duckduckgo` | Search backend: `duckduckgo`, `brave`, `firecrawl` |
| `api_key` | unset | Generic provider key (used by `firecrawl`, fallback for `brave`) |
| `api_url` | unset | Optional API URL override |
| `brave_api_key` | unset | Dedicated Brave key (required for `provider = "brave"` unless `api_key` is set) |
| `max_results` | `5` | Maximum search results returned (clamped to 1-10) |
| `timeout_secs` | `15` | Request timeout in seconds |
| `user_agent` | `ZeroClaw/1.0` | User-Agent header for search requests |
Notes:
- If DuckDuckGo returns `403`/`429` in your network, switch provider to `brave` or `firecrawl`.
- `web_search` finds candidate URLs; pair it with `web_fetch` for page content extraction.
## `[gateway]`

View File

@ -192,6 +192,97 @@ zeroclaw channel doctor
Then verify channel-specific credentials + allowlist fields in config.
## Web Access Issues
### `curl`/`wget` blocked in shell tool
Symptom:
- tool output includes `Command blocked: high-risk command is disallowed by policy`
- model says `curl`/`wget` is blocked
Why this happens:
- `curl`/`wget` are high-risk shell commands and may be blocked by autonomy policy.
Preferred fix:
- use purpose-built tools instead of fetching via shell commands:
- `http_request` for direct API/HTTP calls
- `web_fetch` for page content extraction/summarization
Minimal config:
```toml
[http_request]
enabled = true
allowed_domains = ["*"]
[web_fetch]
enabled = true
provider = "fast_html2md"
allowed_domains = ["*"]
```
### `web_search_tool` fails with `403`/`429`
Symptom:
- tool output includes `DuckDuckGo search failed with status: 403` (or `429`)
Why this happens:
- some networks, proxies, or rate limiters block traffic to DuckDuckGo's HTML search endpoint.
Fix options:
1. Switch provider to Brave (recommended when you have an API key):
```toml
[web_search]
enabled = true
provider = "brave"
brave_api_key = "<SECRET>"
```
2. Switch provider to Firecrawl (if enabled in your build):
```toml
[web_search]
enabled = true
provider = "firecrawl"
api_key = "<SECRET>"
```
3. Keep DuckDuckGo for search, but use `web_fetch` to read pages once you have URLs.
### `web_fetch`/`http_request` says host is not allowed
Symptom:
- errors like `Host '<domain>' is not in http_request.allowed_domains`
- or `web_fetch tool is enabled but no allowed_domains are configured`
Fix:
- include exact domains or `"*"` for public internet access:
```toml
[http_request]
enabled = true
allowed_domains = ["*"]
[web_fetch]
enabled = true
allowed_domains = ["*"]
blocked_domains = []
```
Security notes:
- local/private network targets are blocked even with `"*"`
- keep explicit domain allowlists in production environments when possible
## Service Mode
### Service installed but not running

View File

@ -700,6 +700,13 @@ impl SecurityPolicy {
if risk == CommandRiskLevel::High {
if self.block_high_risk_commands {
let lower = command.to_ascii_lowercase();
if lower.contains("curl") || lower.contains("wget") {
return Err(
"Command blocked: high-risk command is disallowed by policy. Shell curl/wget are blocked; use `http_request` or `web_fetch` with configured allowed_domains."
.into(),
);
}
return Err("Command blocked: high-risk command is disallowed by policy".into());
}
if self.autonomy == AutonomyLevel::Supervised && !approved {

View File

@ -2,6 +2,7 @@ use super::traits::{Tool, ToolResult};
use crate::security::SecurityPolicy;
use async_trait::async_trait;
use regex::Regex;
use reqwest::StatusCode;
use serde_json::json;
use std::sync::Arc;
use std::time::Duration;
@ -19,6 +20,18 @@ pub struct WebSearchTool {
}
impl WebSearchTool {
/// Maps a failed DuckDuckGo HTTP status to an actionable troubleshooting hint.
///
/// Non-empty hints begin with a leading space so the caller can append them
/// directly after the status code in an error message; statuses without a
/// known remediation yield an empty string.
fn duckduckgo_status_hint(status: StatusCode) -> &'static str {
    // Access denied / throttled: the network itself is likely the problem,
    // so point the user at alternative providers.
    if matches!(status, StatusCode::FORBIDDEN | StatusCode::TOO_MANY_REQUESTS) {
        return " DuckDuckGo may be blocking this network. Try [web_search].provider = \"brave\" with [web_search].brave_api_key, or set provider = \"firecrawl\".";
    }
    // Upstream outage class: transient, so suggest retrying.
    if matches!(
        status,
        StatusCode::SERVICE_UNAVAILABLE | StatusCode::BAD_GATEWAY | StatusCode::GATEWAY_TIMEOUT
    ) {
        return " DuckDuckGo may be temporarily unavailable. Retry later or switch providers.";
    }
    ""
}
pub fn new(
security: Arc<SecurityPolicy>,
provider: String,
@ -48,12 +61,18 @@ impl WebSearchTool {
.user_agent(self.user_agent.as_str())
.build()?;
let response = client.get(&search_url).send().await?;
let response = client.get(&search_url).send().await.map_err(|e| {
anyhow::anyhow!(
"DuckDuckGo search request failed: {e}. Check outbound network/proxy settings, or switch [web_search].provider to \"brave\"/\"firecrawl\"."
)
})?;
if !response.status().is_success() {
let status = response.status();
anyhow::bail!(
"DuckDuckGo search failed with status: {}",
response.status()
"DuckDuckGo search failed with status: {}.{}",
status,
Self::duckduckgo_status_hint(status)
);
}
@ -484,6 +503,20 @@ mod tests {
assert!(!result.contains("rut=test"));
}
#[test]
fn duckduckgo_status_hint_for_403_mentions_provider_switch() {
    // A 403 hint must steer the user toward an alternative search provider.
    let hint = WebSearchTool::duckduckgo_status_hint(StatusCode::FORBIDDEN);
    assert!(hint.contains("provider") && hint.contains("brave"));
}
#[test]
fn duckduckgo_status_hint_for_500_is_empty() {
    // Statuses without a known remediation should produce no hint text.
    let hint = WebSearchTool::duckduckgo_status_hint(StatusCode::INTERNAL_SERVER_ERROR);
    assert_eq!(hint, "");
}
#[test]
fn test_constructor_clamps_web_search_limits() {
let tool = WebSearchTool::new(