fix(web): improve web access guidance and search failure diagnostics

This commit is contained in:
argenis de la rosa 2026-02-25 18:40:44 -05:00 committed by Argenis
parent 6ce47af3d6
commit ffaf927690
4 changed files with 174 additions and 3 deletions

View File

@ -456,12 +456,52 @@ Notes:
| `allowed_domains` | `[]` | Allowed domains for HTTP requests (exact/subdomain match, or `"*"` for all public domains) |
| `max_response_size` | `1000000` | Maximum response size in bytes (default: 1 MB) |
| `timeout_secs` | `30` | Request timeout in seconds |
| `user_agent` | `ZeroClaw/1.0` | User-Agent header for outbound HTTP requests |
Notes:
- Deny-by-default: if `allowed_domains` is empty, all HTTP requests are rejected.
- Use exact domain or subdomain matching (e.g. `"api.example.com"`, `"example.com"`), or `"*"` to allow any public domain.
- Local/private targets are still blocked even when `"*"` is configured.
- Shell `curl`/`wget` are classified as high-risk and may be blocked by autonomy policy. Prefer `http_request` for direct HTTP calls.
## `[web_fetch]`
| Key | Default | Purpose |
|---|---|---|
| `enabled` | `false` | Enable `web_fetch` for page-to-text extraction |
| `provider` | `fast_html2md` | Fetch/render backend: `fast_html2md`, `nanohtml2text`, `firecrawl` |
| `api_key` | unset | API key for provider backends that require it (e.g. `firecrawl`) |
| `api_url` | unset | Optional API URL override (self-hosted/alternate endpoint) |
| `allowed_domains` | `["*"]` | Domain allowlist (`"*"` allows all public domains) |
| `blocked_domains` | `[]` | Denylist applied before allowlist |
| `max_response_size` | `500000` | Maximum returned payload size in bytes |
| `timeout_secs` | `30` | Request timeout in seconds |
| `user_agent` | `ZeroClaw/1.0` | User-Agent header for fetch requests |
Notes:
- `web_fetch` is optimized for summarization/data extraction from web pages.
- Redirect targets are revalidated against allow/deny domain policy.
- Local/private network targets remain blocked even when `allowed_domains = ["*"]`.
## `[web_search]`
| Key | Default | Purpose |
|---|---|---|
| `enabled` | `false` | Enable `web_search_tool` |
| `provider` | `duckduckgo` | Search backend: `duckduckgo`, `brave`, `firecrawl` |
| `api_key` | unset | Generic provider key (used by `firecrawl`, fallback for `brave`) |
| `api_url` | unset | Optional API URL override |
| `brave_api_key` | unset | Dedicated Brave key (required for `provider = "brave"` unless `api_key` is set) |
| `max_results` | `5` | Maximum search results returned (clamped to 1-10) |
| `timeout_secs` | `15` | Request timeout in seconds |
| `user_agent` | `ZeroClaw/1.0` | User-Agent header for search requests |
Notes:
- If DuckDuckGo returns `403`/`429` in your network, switch provider to `brave` or `firecrawl`.
- `web_search` finds candidate URLs; pair it with `web_fetch` for page content extraction.
## `[gateway]`

View File

@ -192,6 +192,97 @@ zeroclaw channel doctor
Then verify channel-specific credentials + allowlist fields in config.
## Web Access Issues
### `curl`/`wget` blocked in shell tool
Symptom:
- tool output includes `Command blocked: high-risk command is disallowed by policy`
- model says `curl`/`wget` is blocked
Why this happens:
- `curl`/`wget` are high-risk shell commands and may be blocked by autonomy policy.
Preferred fix:
- use purpose-built tools instead of fetching via shell commands:
- `http_request` for direct API/HTTP calls
- `web_fetch` for page content extraction/summarization
Minimal config:
```toml
[http_request]
enabled = true
allowed_domains = ["*"]
[web_fetch]
enabled = true
provider = "fast_html2md"
allowed_domains = ["*"]
```
### `web_search_tool` fails with `403`/`429`
Symptom:
- tool output includes `DuckDuckGo search failed with status: 403` (or `429`)
Why this happens:
- some networks, proxies, or rate limiters block traffic to DuckDuckGo's HTML search endpoint.
Fix options:
1. Switch provider to Brave (recommended when you have an API key):
```toml
[web_search]
enabled = true
provider = "brave"
brave_api_key = "<SECRET>"
```
2. Switch provider to Firecrawl (if enabled in your build):
```toml
[web_search]
enabled = true
provider = "firecrawl"
api_key = "<SECRET>"
```
3. Keep DuckDuckGo for search, but use `web_fetch` to read pages once you have URLs.
### `web_fetch`/`http_request` says host is not allowed
Symptom:
- errors like `Host '<domain>' is not in http_request.allowed_domains`
- or `web_fetch tool is enabled but no allowed_domains are configured`
Fix:
- include exact domains or `"*"` for public internet access:
```toml
[http_request]
enabled = true
allowed_domains = ["*"]
[web_fetch]
enabled = true
allowed_domains = ["*"]
blocked_domains = []
```
Security notes:
- local/private network targets are blocked even with `"*"`
- keep explicit domain allowlists in production environments when possible
## Service Mode
### Service installed but not running

View File

@ -700,6 +700,13 @@ impl SecurityPolicy {
if risk == CommandRiskLevel::High {
if self.block_high_risk_commands {
let lower = command.to_ascii_lowercase();
if lower.contains("curl") || lower.contains("wget") {
return Err(
"Command blocked: high-risk command is disallowed by policy. Shell curl/wget are blocked; use `http_request` or `web_fetch` with configured allowed_domains."
.into(),
);
}
return Err("Command blocked: high-risk command is disallowed by policy".into());
}
if self.autonomy == AutonomyLevel::Supervised && !approved {

View File

@ -2,6 +2,7 @@ use super::traits::{Tool, ToolResult};
use crate::security::SecurityPolicy;
use async_trait::async_trait;
use regex::Regex;
use reqwest::StatusCode;
use serde_json::json;
use std::sync::Arc;
use std::time::Duration;
@ -19,6 +20,18 @@ pub struct WebSearchTool {
}
impl WebSearchTool {
/// Maps a failed DuckDuckGo HTTP status to an actionable troubleshooting hint.
///
/// Non-empty hints begin with a leading space so the caller can append them
/// directly after the status code in an error message; statuses without a
/// known remediation yield an empty string.
fn duckduckgo_status_hint(status: StatusCode) -> &'static str {
    // Access denied / throttled: the network itself is likely the problem,
    // so point the user at alternative providers.
    if matches!(status, StatusCode::FORBIDDEN | StatusCode::TOO_MANY_REQUESTS) {
        return " DuckDuckGo may be blocking this network. Try [web_search].provider = \"brave\" with [web_search].brave_api_key, or set provider = \"firecrawl\".";
    }
    // Upstream outage class: transient, so suggest retrying.
    if matches!(
        status,
        StatusCode::SERVICE_UNAVAILABLE | StatusCode::BAD_GATEWAY | StatusCode::GATEWAY_TIMEOUT
    ) {
        return " DuckDuckGo may be temporarily unavailable. Retry later or switch providers.";
    }
    ""
}
pub fn new(
security: Arc<SecurityPolicy>,
provider: String,
@ -48,12 +61,18 @@ impl WebSearchTool {
.user_agent(self.user_agent.as_str())
.build()?;
let response = client.get(&search_url).send().await?;
let response = client.get(&search_url).send().await.map_err(|e| {
anyhow::anyhow!(
"DuckDuckGo search request failed: {e}. Check outbound network/proxy settings, or switch [web_search].provider to \"brave\"/\"firecrawl\"."
)
})?;
if !response.status().is_success() {
let status = response.status();
anyhow::bail!(
"DuckDuckGo search failed with status: {}",
response.status()
"DuckDuckGo search failed with status: {}.{}",
status,
Self::duckduckgo_status_hint(status)
);
}
@ -484,6 +503,20 @@ mod tests {
assert!(!result.contains("rut=test"));
}
#[test]
fn duckduckgo_status_hint_for_403_mentions_provider_switch() {
    // A 403 hint must steer the user toward an alternative search provider.
    let hint = WebSearchTool::duckduckgo_status_hint(StatusCode::FORBIDDEN);
    assert!(hint.contains("provider") && hint.contains("brave"));
}
#[test]
fn duckduckgo_status_hint_for_500_is_empty() {
    // Statuses without a known remediation should produce no hint text.
    let hint = WebSearchTool::duckduckgo_status_hint(StatusCode::INTERNAL_SERVER_ERROR);
    assert_eq!(hint, "");
}
#[test]
fn test_constructor_clamps_web_search_limits() {
let tool = WebSearchTool::new(