import axios, { type AxiosResponse } from 'axios' import { LRUCache } from 'lru-cache' import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from '../../services/analytics/index.js' import { queryHaiku } from '../../services/api/claude.js' import { AbortError } from '../../utils/errors.js' import { getWebFetchUserAgent } from '../../utils/http.js' import { logError } from '../../utils/log.js' import { isBinaryContentType, persistBinaryContent, } from '../../utils/mcpOutputStorage.js' import { getSettings_DEPRECATED } from '../../utils/settings/settings.js' import { asSystemPrompt } from '../../utils/systemPromptType.js' import { isPreapprovedHost } from './preapproved.js' import { makeSecondaryModelPrompt } from './prompt.js' // Custom error classes for domain blocking class DomainBlockedError extends Error { constructor(domain: string) { super(`Claude Code is unable to fetch from ${domain}`) this.name = 'DomainBlockedError' } } class DomainCheckFailedError extends Error { constructor(domain: string) { super( `Unable to verify if domain ${domain} is safe to fetch. This may be due to network restrictions or enterprise security policies blocking claude.ai.`, ) this.name = 'DomainCheckFailedError' } } class EgressBlockedError extends Error { constructor(public readonly domain: string) { super( JSON.stringify({ error_type: 'EGRESS_BLOCKED', domain, message: `Access to ${domain} is blocked by the network egress proxy.`, }), ) this.name = 'EgressBlockedError' } } // Cache for storing fetched URL content type CacheEntry = { bytes: number code: number codeText: string content: string contentType: string persistedPath?: string persistedSize?: number } // Cache with 15-minute TTL and 50MB size limit // LRUCache handles automatic expiration and eviction const CACHE_TTL_MS = 15 * 60 * 1000 // 15 minutes const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 // 50MB const URL_CACHE = new LRUCache({ maxSize: MAX_CACHE_SIZE_BYTES, ttl: CACHE_TTL_MS, }) // Separate cache for preflight domain checks. URL_CACHE is URL-keyed, so // fetching two paths on the same domain triggers two identical preflight // HTTP round-trips to api.anthropic.com. This hostname-keyed cache avoids // that. Only 'allowed' is cached — blocked/failed re-check on next attempt. const DOMAIN_CHECK_CACHE = new LRUCache({ max: 128, ttl: 5 * 60 * 1000, // 5 minutes — shorter than URL_CACHE TTL }) export function clearWebFetchCache(): void { URL_CACHE.clear() DOMAIN_CHECK_CACHE.clear() } // Lazy singleton — defers the turndown → @mixmark-io/domino import (~1.4MB // retained heap) until the first HTML fetch, and reuses one instance across // calls (construction builds 15 rule objects; .turndown() is stateless). // @types/turndown ships only `export =` (no .d.mts), so TS types the import // as the class itself while Bun wraps CJS in { default } — hence the cast. type TurndownCtor = typeof import('turndown') let turndownServicePromise: Promise> | undefined function getTurndownService(): Promise> { return (turndownServicePromise ??= import('turndown').then(m => { const Turndown = (m as unknown as { default: TurndownCtor }).default return new Turndown() })) } // PSR requested limiting the length of URLs to 250 to lower the potential // for a data exfiltration. However, this is too restrictive for some customers' // legitimate use cases, such as JWT-signed URLs (e.g., cloud service signed URLs) // that can be much longer. We already require user approval for each domain, // which provides a primary security boundary. In addition, Claude Code has // other data exfil channels, and this one does not seem relatively high risk, // so I'm removing that length restriction. -ab const MAX_URL_LENGTH = 2000 // Per PSR: // "Implement resource consumption controls because setting limits on CPU, // memory, and network usage for the Web Fetch tool can prevent a single // request or user from overwhelming the system." const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 * 1024 // Timeout for the main HTTP fetch request (60 seconds). // Prevents hanging indefinitely on slow/unresponsive servers. const FETCH_TIMEOUT_MS = 60_000 // Timeout for the domain blocklist preflight check (10 seconds). const DOMAIN_CHECK_TIMEOUT_MS = 10_000 // Cap same-host redirect hops. Without this a malicious server can return // a redirect loop (/a → /b → /a …) and the per-request FETCH_TIMEOUT_MS // resets on every hop, hanging the tool until user interrupt. 10 matches // common client defaults (axios=5, follow-redirects=21, Chrome=20). const MAX_REDIRECTS = 10 // Truncate to not spend too many tokens export const MAX_MARKDOWN_LENGTH = 100_000 export function isPreapprovedUrl(url: string): boolean { try { const parsedUrl = new URL(url) return isPreapprovedHost(parsedUrl.hostname, parsedUrl.pathname) } catch { return false } } export function validateURL(url: string): boolean { if (url.length > MAX_URL_LENGTH) { return false } let parsed try { parsed = new URL(url) } catch { return false } // We don't need to check protocol here, as we'll upgrade http to https when making the request // As long as we aren't supporting aiming to cookies or internal domains, // we should block URLs with usernames/passwords too, even though these // seem exceedingly unlikely. if (parsed.username || parsed.password) { return false } // Initial filter that this isn't a privileged, company-internal URL // by checking that the hostname is publicly resolvable const hostname = parsed.hostname const parts = hostname.split('.') if (parts.length < 2) { return false } return true } type DomainCheckResult = | { status: 'allowed' } | { status: 'blocked' } | { status: 'check_failed'; error: Error } export async function checkDomainBlocklist( domain: string, ): Promise { if (DOMAIN_CHECK_CACHE.has(domain)) { return { status: 'allowed' } } try { const response = await axios.get( `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`, { timeout: DOMAIN_CHECK_TIMEOUT_MS }, ) if (response.status === 200) { if (response.data.can_fetch === true) { DOMAIN_CHECK_CACHE.set(domain, true) return { status: 'allowed' } } return { status: 'blocked' } } // Non-200 status but didn't throw return { status: 'check_failed', error: new Error(`Domain check returned status ${response.status}`), } } catch (e) { logError(e) return { status: 'check_failed', error: e as Error } } } /** * Check if a redirect is safe to follow * Allows redirects that: * - Add or remove "www." in the hostname * - Keep the origin the same but change path/query params * - Or both of the above */ export function isPermittedRedirect( originalUrl: string, redirectUrl: string, ): boolean { try { const parsedOriginal = new URL(originalUrl) const parsedRedirect = new URL(redirectUrl) if (parsedRedirect.protocol !== parsedOriginal.protocol) { return false } if (parsedRedirect.port !== parsedOriginal.port) { return false } if (parsedRedirect.username || parsedRedirect.password) { return false } // Now check hostname conditions // 1. Adding www. is allowed: example.com -> www.example.com // 2. Removing www. is allowed: www.example.com -> example.com // 3. Same host (with or without www.) is allowed: paths can change const stripWww = (hostname: string) => hostname.replace(/^www\./, '') const originalHostWithoutWww = stripWww(parsedOriginal.hostname) const redirectHostWithoutWww = stripWww(parsedRedirect.hostname) return originalHostWithoutWww === redirectHostWithoutWww } catch (_error) { return false } } /** * Helper function to handle fetching URLs with custom redirect handling * Recursively follows redirects if they pass the redirectChecker function * * Per PSR: * "Do not automatically follow redirects because following redirects could * allow for an attacker to exploit an open redirect vulnerability in a * trusted domain to force a user to make a request to a malicious domain * unknowingly" */ type RedirectInfo = { type: 'redirect' originalUrl: string redirectUrl: string statusCode: number } export async function getWithPermittedRedirects( url: string, signal: AbortSignal, redirectChecker: (originalUrl: string, redirectUrl: string) => boolean, depth = 0, ): Promise | RedirectInfo> { if (depth > MAX_REDIRECTS) { throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`) } try { return await axios.get(url, { signal, timeout: FETCH_TIMEOUT_MS, maxRedirects: 0, responseType: 'arraybuffer', maxContentLength: MAX_HTTP_CONTENT_LENGTH, headers: { Accept: 'text/markdown, text/html, */*', 'User-Agent': getWebFetchUserAgent(), }, }) } catch (error) { if ( axios.isAxiosError(error) && error.response && [301, 302, 307, 308].includes(error.response.status) ) { const redirectLocation = error.response.headers.location if (!redirectLocation) { throw new Error('Redirect missing Location header') } // Resolve relative URLs against the original URL const redirectUrl = new URL(redirectLocation, url).toString() if (redirectChecker(url, redirectUrl)) { // Recursively follow the permitted redirect return getWithPermittedRedirects( redirectUrl, signal, redirectChecker, depth + 1, ) } else { // Return redirect information to the caller return { type: 'redirect', originalUrl: url, redirectUrl, statusCode: error.response.status, } } } // Detect egress proxy blocks: the proxy returns 403 with // X-Proxy-Error: blocked-by-allowlist when egress is restricted if ( axios.isAxiosError(error) && error.response?.status === 403 && error.response.headers['x-proxy-error'] === 'blocked-by-allowlist' ) { const hostname = new URL(url).hostname throw new EgressBlockedError(hostname) } throw error } } function isRedirectInfo( response: AxiosResponse | RedirectInfo, ): response is RedirectInfo { return 'type' in response && response.type === 'redirect' } export type FetchedContent = { content: string bytes: number code: number codeText: string contentType: string persistedPath?: string persistedSize?: number } export async function getURLMarkdownContent( url: string, abortController: AbortController, ): Promise { if (!validateURL(url)) { throw new Error('Invalid URL') } // Check cache (LRUCache handles TTL automatically) const cachedEntry = URL_CACHE.get(url) if (cachedEntry) { return { bytes: cachedEntry.bytes, code: cachedEntry.code, codeText: cachedEntry.codeText, content: cachedEntry.content, contentType: cachedEntry.contentType, persistedPath: cachedEntry.persistedPath, persistedSize: cachedEntry.persistedSize, } } let parsedUrl: URL let upgradedUrl = url try { parsedUrl = new URL(url) // Upgrade http to https if needed if (parsedUrl.protocol === 'http:') { parsedUrl.protocol = 'https:' upgradedUrl = parsedUrl.toString() } const hostname = parsedUrl.hostname // Check if the user has opted to skip the blocklist check // This is for enterprise customers with restrictive security policies // that prevent outbound connections to claude.ai const settings = getSettings_DEPRECATED() if (!settings.skipWebFetchPreflight) { const checkResult = await checkDomainBlocklist(hostname) switch (checkResult.status) { case 'allowed': // Continue with the fetch break case 'blocked': throw new DomainBlockedError(hostname) case 'check_failed': throw new DomainCheckFailedError(hostname) } } if (process.env.USER_TYPE === 'ant') { logEvent('tengu_web_fetch_host', { hostname: hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }) } } catch (e) { if ( e instanceof DomainBlockedError || e instanceof DomainCheckFailedError ) { // Expected user-facing failures - re-throw without logging as internal error throw e } logError(e) } const response = await getWithPermittedRedirects( upgradedUrl, abortController.signal, isPermittedRedirect, ) // Check if we got a redirect response if (isRedirectInfo(response)) { return response } const rawBuffer = Buffer.from(response.data) // Release the axios-held ArrayBuffer copy; rawBuffer owns the bytes now. // This lets GC reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown // builds its DOM tree (which can be 3-5x the HTML size). ;(response as { data: unknown }).data = null const contentType = response.headers['content-type'] ?? '' // Binary content: save raw bytes to disk with a proper extension so Claude // can inspect the file later. We still fall through to the utf-8 decode + // Haiku path below — for PDFs in particular the decoded string has enough // ASCII structure (/Title, text streams) that Haiku can summarize it, and // the saved file is a supplement rather than a replacement. let persistedPath: string | undefined let persistedSize: number | undefined if (isBinaryContentType(contentType)) { const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}` const result = await persistBinaryContent(rawBuffer, contentType, persistId) if (!('error' in result)) { persistedPath = result.filepath persistedSize = result.size } } const bytes = rawBuffer.length const htmlContent = rawBuffer.toString('utf-8') let markdownContent: string let contentBytes: number if (contentType.includes('text/html')) { markdownContent = (await getTurndownService()).turndown(htmlContent) contentBytes = Buffer.byteLength(markdownContent) } else { // It's not HTML - just use it raw. The decoded string's UTF-8 byte // length equals rawBuffer.length (modulo U+FFFD replacement on invalid // bytes — negligible for cache eviction accounting), so skip the O(n) // Buffer.byteLength scan. markdownContent = htmlContent contentBytes = bytes } // Store the fetched content in cache. Note that it's stored under // the original URL, not the upgraded or redirected URL. const entry: CacheEntry = { bytes, code: response.status, codeText: response.statusText, content: markdownContent, contentType, persistedPath, persistedSize, } // lru-cache requires positive integers; clamp to 1 for empty responses. URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) }) return entry } export async function applyPromptToMarkdown( prompt: string, markdownContent: string, signal: AbortSignal, isNonInteractiveSession: boolean, isPreapprovedDomain: boolean, ): Promise { // Truncate content to avoid "Prompt is too long" errors from the secondary model const truncatedContent = markdownContent.length > MAX_MARKDOWN_LENGTH ? markdownContent.slice(0, MAX_MARKDOWN_LENGTH) + '\n\n[Content truncated due to length...]' : markdownContent const modelPrompt = makeSecondaryModelPrompt( truncatedContent, prompt, isPreapprovedDomain, ) const assistantMessage = await queryHaiku({ systemPrompt: asSystemPrompt([]), userPrompt: modelPrompt, signal, options: { querySource: 'web_fetch_apply', agents: [], isNonInteractiveSession, hasAppendSystemPrompt: false, mcpTools: [], }, }) // We need to bubble this up, so that the tool call throws, causing us to return // an is_error tool_use block to the server, and render a red dot in the UI. if (signal.aborted) { throw new AbortError() } const { content } = assistantMessage.message if (content.length > 0) { const contentBlock = content[0] if ('text' in contentBlock!) { return contentBlock.text } } return 'No response from model' }