import { feature } from 'bun:bundle' import type Anthropic from '@anthropic-ai/sdk' import { APIConnectionError, APIError, APIUserAbortError, } from '@anthropic-ai/sdk' import type { QuerySource } from 'src/constants/querySource.js' import type { SystemAPIErrorMessage } from 'src/types/message.js' import { isAwsCredentialsProviderError } from 'src/utils/aws.js' import { logForDebugging } from 'src/utils/debug.js' import { logError } from 'src/utils/log.js' import { createSystemAPIErrorMessage } from 'src/utils/messages.js' import { getAPIProviderForStatsig } from 'src/utils/model/providers.js' import { clearApiKeyHelperCache, clearAwsCredentialsCache, clearGcpCredentialsCache, getClaudeAIOAuthTokens, handleOAuth401Error, isClaudeAISubscriber, isEnterpriseSubscriber, } from '../../utils/auth.js' import { isEnvTruthy } from '../../utils/envUtils.js' import { errorMessage } from '../../utils/errors.js' import { type CooldownReason, handleFastModeOverageRejection, handleFastModeRejectedByAPI, isFastModeCooldown, isFastModeEnabled, triggerFastModeCooldown, } from '../../utils/fastMode.js' import { isNonCustomOpusModel } from '../../utils/model/model.js' import { disableKeepAlive } from '../../utils/proxy.js' import { sleep } from '../../utils/sleep.js' import type { ThinkingConfig } from '../../utils/thinking.js' import { getFeatureValue_CACHED_MAY_BE_STALE } from '../analytics/growthbook.js' import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from '../analytics/index.js' import { checkMockRateLimitError, isMockRateLimitError, } from '../rateLimitMocking.js' import { REPEATED_529_ERROR_MESSAGE } from './errors.js' import { extractConnectionErrorDetails } from './errorUtils.js' const abortError = () => new APIUserAbortError() const DEFAULT_MAX_RETRIES = 10 const FLOOR_OUTPUT_TOKENS = 3000 const MAX_529_RETRIES = 3 export const BASE_DELAY_MS = 500 // Foreground query sources where the user IS blocking on the result — these // retry on 529. Everything else (summaries, titles, suggestions, classifiers) // bails immediately: during a capacity cascade each retry is 3-10× gateway // amplification, and the user never sees those fail anyway. New sources // default to no-retry — add here only if the user is waiting on the result. const FOREGROUND_529_RETRY_SOURCES = new Set([ 'repl_main_thread', 'repl_main_thread:outputStyle:custom', 'repl_main_thread:outputStyle:Explanatory', 'repl_main_thread:outputStyle:Learning', 'sdk', 'agent:custom', 'agent:default', 'agent:builtin', 'compact', 'hook_agent', 'hook_prompt', 'verification_agent', 'side_question', // Security classifiers — must complete for auto-mode correctness. // yoloClassifier.ts uses 'auto_mode' (not 'yolo_classifier' — that's // type-only). bash_classifier is ant-only; feature-gate so the string // tree-shakes out of external builds (excluded-strings.txt). 'auto_mode', ...(feature('BASH_CLASSIFIER') ? (['bash_classifier'] as const) : []), ]) function shouldRetry529(querySource: QuerySource | undefined): boolean { // undefined → retry (conservative for untagged call paths) return ( querySource === undefined || FOREGROUND_529_RETRY_SOURCES.has(querySource) ) } // CLAUDE_CODE_UNATTENDED_RETRY: for unattended sessions (ant-only). Retries 429/529 // indefinitely with higher backoff and periodic keep-alive yields so the host // environment does not mark the session idle mid-wait. // TODO(ANT-344): the keep-alive via SystemAPIErrorMessage yields is a stopgap // until there's a dedicated keep-alive channel. const PERSISTENT_MAX_BACKOFF_MS = 5 * 60 * 1000 const PERSISTENT_RESET_CAP_MS = 6 * 60 * 60 * 1000 const HEARTBEAT_INTERVAL_MS = 30_000 function isPersistentRetryEnabled(): boolean { return feature('UNATTENDED_RETRY') ? isEnvTruthy(process.env.CLAUDE_CODE_UNATTENDED_RETRY) : false } function isTransientCapacityError(error: unknown): boolean { return ( is529Error(error) || (error instanceof APIError && error.status === 429) ) } function isStaleConnectionError(error: unknown): boolean { if (!(error instanceof APIConnectionError)) { return false } const details = extractConnectionErrorDetails(error) return details?.code === 'ECONNRESET' || details?.code === 'EPIPE' } export interface RetryContext { maxTokensOverride?: number model: string thinkingConfig: ThinkingConfig fastMode?: boolean } interface RetryOptions { maxRetries?: number model: string fallbackModel?: string thinkingConfig: ThinkingConfig fastMode?: boolean signal?: AbortSignal querySource?: QuerySource /** * Pre-seed the consecutive 529 counter. Used when this retry loop is a * non-streaming fallback after a streaming 529 — the streaming 529 should * count toward MAX_529_RETRIES so total 529s-before-fallback is consistent * regardless of which request mode hit the overload. */ initialConsecutive529Errors?: number } export class CannotRetryError extends Error { constructor( public readonly originalError: unknown, public readonly retryContext: RetryContext, ) { const message = errorMessage(originalError) super(message) this.name = 'RetryError' // Preserve the original stack trace if available if (originalError instanceof Error && originalError.stack) { this.stack = originalError.stack } } } export class FallbackTriggeredError extends Error { constructor( public readonly originalModel: string, public readonly fallbackModel: string, ) { super(`Model fallback triggered: ${originalModel} -> ${fallbackModel}`) this.name = 'FallbackTriggeredError' } } export async function* withRetry( getClient: () => Promise, operation: ( client: Anthropic, attempt: number, context: RetryContext, ) => Promise, options: RetryOptions, ): AsyncGenerator { const maxRetries = getMaxRetries(options) const retryContext: RetryContext = { model: options.model, thinkingConfig: options.thinkingConfig, ...(isFastModeEnabled() && { fastMode: options.fastMode }), } let client: Anthropic | null = null let consecutive529Errors = options.initialConsecutive529Errors ?? 0 let lastError: unknown let persistentAttempt = 0 for (let attempt = 1; attempt <= maxRetries + 1; attempt++) { if (options.signal?.aborted) { throw new APIUserAbortError() } // Capture whether fast mode is active before this attempt // (fallback may change the state mid-loop) const wasFastModeActive = isFastModeEnabled() ? retryContext.fastMode && !isFastModeCooldown() : false try { // Check for mock rate limits (used by /mock-limits command for Ant employees) if (process.env.USER_TYPE === 'ant') { const mockError = checkMockRateLimitError( retryContext.model, wasFastModeActive, ) if (mockError) { throw mockError } } // Get a fresh client instance on first attempt or after authentication errors // - 401 for first-party API authentication failures // - 403 "OAuth token has been revoked" (another process refreshed the token) // - Bedrock-specific auth errors (403 or CredentialsProviderError) // - Vertex-specific auth errors (credential refresh failures, 401) // - ECONNRESET/EPIPE: stale keep-alive socket; disable pooling and reconnect const isStaleConnection = isStaleConnectionError(lastError) if ( isStaleConnection && getFeatureValue_CACHED_MAY_BE_STALE( 'tengu_disable_keepalive_on_econnreset', false, ) ) { logForDebugging( 'Stale connection (ECONNRESET/EPIPE) — disabling keep-alive for retry', ) disableKeepAlive() } if ( client === null || (lastError instanceof APIError && lastError.status === 401) || isOAuthTokenRevokedError(lastError) || isBedrockAuthError(lastError) || isVertexAuthError(lastError) || isStaleConnection ) { // On 401 "token expired" or 403 "token revoked", force a token refresh if ( (lastError instanceof APIError && lastError.status === 401) || isOAuthTokenRevokedError(lastError) ) { const failedAccessToken = getClaudeAIOAuthTokens()?.accessToken if (failedAccessToken) { await handleOAuth401Error(failedAccessToken) } } client = await getClient() } return await operation(client, attempt, retryContext) } catch (error) { lastError = error logForDebugging( `API error (attempt ${attempt}/${maxRetries + 1}): ${error instanceof APIError ? `${error.status} ${error.message}` : errorMessage(error)}`, { level: 'error' }, ) // Fast mode fallback: on 429/529, either wait and retry (short delays) // or fall back to standard speed (long delays) to avoid cache thrashing. // Skip in persistent mode: the short-retry path below loops with fast // mode still active, so its `continue` never reaches the attempt clamp // and the for-loop terminates. Persistent sessions want the chunked // keep-alive path instead of fast-mode cache-preservation anyway. if ( wasFastModeActive && !isPersistentRetryEnabled() && error instanceof APIError && (error.status === 429 || is529Error(error)) ) { // If the 429 is specifically because extra usage (overage) is not // available, permanently disable fast mode with a specific message. const overageReason = error.headers?.get( 'anthropic-ratelimit-unified-overage-disabled-reason', ) if (overageReason !== null && overageReason !== undefined) { handleFastModeOverageRejection(overageReason) retryContext.fastMode = false continue } const retryAfterMs = getRetryAfterMs(error) if (retryAfterMs !== null && retryAfterMs < SHORT_RETRY_THRESHOLD_MS) { // Short retry-after: wait and retry with fast mode still active // to preserve prompt cache (same model name on retry). await sleep(retryAfterMs, options.signal, { abortError }) continue } // Long or unknown retry-after: enter cooldown (switches to standard // speed model), with a minimum floor to avoid flip-flopping. const cooldownMs = Math.max( retryAfterMs ?? DEFAULT_FAST_MODE_FALLBACK_HOLD_MS, MIN_COOLDOWN_MS, ) const cooldownReason: CooldownReason = is529Error(error) ? 'overloaded' : 'rate_limit' triggerFastModeCooldown(Date.now() + cooldownMs, cooldownReason) if (isFastModeEnabled()) { retryContext.fastMode = false } continue } // Fast mode fallback: if the API rejects the fast mode parameter // (e.g., org doesn't have fast mode enabled), permanently disable fast // mode and retry at standard speed. if (wasFastModeActive && isFastModeNotEnabledError(error)) { handleFastModeRejectedByAPI() retryContext.fastMode = false continue } // Non-foreground sources bail immediately on 529 — no retry amplification // during capacity cascades. User never sees these fail. if (is529Error(error) && !shouldRetry529(options.querySource)) { logEvent('tengu_api_529_background_dropped', { query_source: options.querySource as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }) throw new CannotRetryError(error, retryContext) } // Track consecutive 529 errors if ( is529Error(error) && // If FALLBACK_FOR_ALL_PRIMARY_MODELS is not set, fall through only if the primary model is a non-custom Opus model. // TODO: Revisit if the isNonCustomOpusModel check should still exist, or if isNonCustomOpusModel is a stale artifact of when Claude Code was hardcoded on Opus. (process.env.FALLBACK_FOR_ALL_PRIMARY_MODELS || (!isClaudeAISubscriber() && isNonCustomOpusModel(options.model))) ) { consecutive529Errors++ if (consecutive529Errors >= MAX_529_RETRIES) { // Check if fallback model is specified if (options.fallbackModel) { logEvent('tengu_api_opus_fallback_triggered', { original_model: options.model as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, fallback_model: options.fallbackModel as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, provider: getAPIProviderForStatsig(), }) // Throw special error to indicate fallback was triggered throw new FallbackTriggeredError( options.model, options.fallbackModel, ) } if ( process.env.USER_TYPE === 'external' && !process.env.IS_SANDBOX && !isPersistentRetryEnabled() ) { logEvent('tengu_api_custom_529_overloaded_error', {}) throw new CannotRetryError( new Error(REPEATED_529_ERROR_MESSAGE), retryContext, ) } } } // Only retry if the error indicates we should const persistent = isPersistentRetryEnabled() && isTransientCapacityError(error) if (attempt > maxRetries && !persistent) { throw new CannotRetryError(error, retryContext) } // AWS/GCP errors aren't always APIError, but can be retried const handledCloudAuthError = handleAwsCredentialError(error) || handleGcpCredentialError(error) if ( !handledCloudAuthError && (!(error instanceof APIError) || !shouldRetry(error)) ) { throw new CannotRetryError(error, retryContext) } // Handle max tokens context overflow errors by adjusting max_tokens for the next attempt // NOTE: With extended-context-window beta, this 400 error should not occur. // The API now returns 'model_context_window_exceeded' stop_reason instead. // Keeping for backward compatibility. if (error instanceof APIError) { const overflowData = parseMaxTokensContextOverflowError(error) if (overflowData) { const { inputTokens, contextLimit } = overflowData const safetyBuffer = 1000 const availableContext = Math.max( 0, contextLimit - inputTokens - safetyBuffer, ) if (availableContext < FLOOR_OUTPUT_TOKENS) { logError( new Error( `availableContext ${availableContext} is less than FLOOR_OUTPUT_TOKENS ${FLOOR_OUTPUT_TOKENS}`, ), ) throw error } // Ensure we have enough tokens for thinking + at least 1 output token const minRequired = (retryContext.thinkingConfig.type === 'enabled' ? retryContext.thinkingConfig.budgetTokens : 0) + 1 const adjustedMaxTokens = Math.max( FLOOR_OUTPUT_TOKENS, availableContext, minRequired, ) retryContext.maxTokensOverride = adjustedMaxTokens logEvent('tengu_max_tokens_context_overflow_adjustment', { inputTokens, contextLimit, adjustedMaxTokens, attempt, }) continue } } // For other errors, proceed with normal retry logic // Get retry-after header if available const retryAfter = getRetryAfter(error) let delayMs: number if (persistent && error instanceof APIError && error.status === 429) { persistentAttempt++ // Window-based limits (e.g. 5hr Max/Pro) include a reset timestamp. // Wait until reset rather than polling every 5 min uselessly. const resetDelay = getRateLimitResetDelayMs(error) delayMs = resetDelay ?? Math.min( getRetryDelay( persistentAttempt, retryAfter, PERSISTENT_MAX_BACKOFF_MS, ), PERSISTENT_RESET_CAP_MS, ) } else if (persistent) { persistentAttempt++ // Retry-After is a server directive and bypasses maxDelayMs inside // getRetryDelay (intentional — honoring it is correct). Cap at the // 6hr reset-cap here so a pathological header can't wait unbounded. delayMs = Math.min( getRetryDelay( persistentAttempt, retryAfter, PERSISTENT_MAX_BACKOFF_MS, ), PERSISTENT_RESET_CAP_MS, ) } else { delayMs = getRetryDelay(attempt, retryAfter) } // In persistent mode the for-loop `attempt` is clamped at maxRetries+1; // use persistentAttempt for telemetry/yields so they show the true count. const reportedAttempt = persistent ? persistentAttempt : attempt logEvent('tengu_api_retry', { attempt: reportedAttempt, delayMs: delayMs, error: (error as APIError) .message as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, status: (error as APIError).status, provider: getAPIProviderForStatsig(), }) if (persistent) { if (delayMs > 60_000) { logEvent('tengu_api_persistent_retry_wait', { status: (error as APIError).status, delayMs, attempt: reportedAttempt, provider: getAPIProviderForStatsig(), }) } // Chunk long sleeps so the host sees periodic stdout activity and // does not mark the session idle. Each yield surfaces as // {type:'system', subtype:'api_retry'} on stdout via QueryEngine. let remaining = delayMs while (remaining > 0) { if (options.signal?.aborted) throw new APIUserAbortError() if (error instanceof APIError) { yield createSystemAPIErrorMessage( error, remaining, reportedAttempt, maxRetries, ) } const chunk = Math.min(remaining, HEARTBEAT_INTERVAL_MS) await sleep(chunk, options.signal, { abortError }) remaining -= chunk } // Clamp so the for-loop never terminates. Backoff uses the separate // persistentAttempt counter which keeps growing to the 5-min cap. if (attempt >= maxRetries) attempt = maxRetries } else { if (error instanceof APIError) { yield createSystemAPIErrorMessage(error, delayMs, attempt, maxRetries) } await sleep(delayMs, options.signal, { abortError }) } } } throw new CannotRetryError(lastError, retryContext) } function getRetryAfter(error: unknown): string | null { return ( ((error as { headers?: { 'retry-after'?: string } }).headers?.[ 'retry-after' ] || // eslint-disable-next-line eslint-plugin-n/no-unsupported-features/node-builtins ((error as APIError).headers as Headers)?.get?.('retry-after')) ?? null ) } export function getRetryDelay( attempt: number, retryAfterHeader?: string | null, maxDelayMs = 32000, ): number { if (retryAfterHeader) { const seconds = parseInt(retryAfterHeader, 10) if (!isNaN(seconds)) { return seconds * 1000 } } const baseDelay = Math.min( BASE_DELAY_MS * Math.pow(2, attempt - 1), maxDelayMs, ) const jitter = Math.random() * 0.25 * baseDelay return baseDelay + jitter } export function parseMaxTokensContextOverflowError(error: APIError): | { inputTokens: number maxTokens: number contextLimit: number } | undefined { if (error.status !== 400 || !error.message) { return undefined } if ( !error.message.includes( 'input length and `max_tokens` exceed context limit', ) ) { return undefined } // Example format: "input length and `max_tokens` exceed context limit: 188059 + 20000 > 200000" const regex = /input length and `max_tokens` exceed context limit: (\d+) \+ (\d+) > (\d+)/ const match = error.message.match(regex) if (!match || match.length !== 4) { return undefined } if (!match[1] || !match[2] || !match[3]) { logError( new Error( 'Unable to parse max_tokens from max_tokens exceed context limit error message', ), ) return undefined } const inputTokens = parseInt(match[1], 10) const maxTokens = parseInt(match[2], 10) const contextLimit = parseInt(match[3], 10) if (isNaN(inputTokens) || isNaN(maxTokens) || isNaN(contextLimit)) { return undefined } return { inputTokens, maxTokens, contextLimit } } // TODO: Replace with a response header check once the API adds a dedicated // header for fast-mode rejection (e.g., x-fast-mode-rejected). String-matching // the error message is fragile and will break if the API wording changes. function isFastModeNotEnabledError(error: unknown): boolean { if (!(error instanceof APIError)) { return false } return ( error.status === 400 && (error.message?.includes('Fast mode is not enabled') ?? false) ) } export function is529Error(error: unknown): boolean { if (!(error instanceof APIError)) { return false } // Check for 529 status code or overloaded error in message return ( error.status === 529 || // See below: the SDK sometimes fails to properly pass the 529 status code during streaming (error.message?.includes('"type":"overloaded_error"') ?? false) ) } function isOAuthTokenRevokedError(error: unknown): boolean { return ( error instanceof APIError && error.status === 403 && (error.message?.includes('OAuth token has been revoked') ?? false) ) } function isBedrockAuthError(error: unknown): boolean { if (isEnvTruthy(process.env.CLAUDE_CODE_USE_BEDROCK)) { // AWS libs reject without an API call if .aws holds a past Expiration value // otherwise, API calls that receive expired tokens give generic 403 // "The security token included in the request is invalid" if ( isAwsCredentialsProviderError(error) || (error instanceof APIError && error.status === 403) ) { return true } } return false } /** * Clear AWS auth caches if appropriate. * @returns true if action was taken. */ function handleAwsCredentialError(error: unknown): boolean { if (isBedrockAuthError(error)) { clearAwsCredentialsCache() return true } return false } // google-auth-library throws plain Error (no typed name like AWS's // CredentialsProviderError). Match common SDK-level credential-failure messages. function isGoogleAuthLibraryCredentialError(error: unknown): boolean { if (!(error instanceof Error)) return false const msg = error.message return ( msg.includes('Could not load the default credentials') || msg.includes('Could not refresh access token') || msg.includes('invalid_grant') ) } function isVertexAuthError(error: unknown): boolean { if (isEnvTruthy(process.env.CLAUDE_CODE_USE_VERTEX)) { // SDK-level: google-auth-library fails in prepareOptions() before the HTTP call if (isGoogleAuthLibraryCredentialError(error)) { return true } // Server-side: Vertex returns 401 for expired/invalid tokens if (error instanceof APIError && error.status === 401) { return true } } return false } /** * Clear GCP auth caches if appropriate. * @returns true if action was taken. */ function handleGcpCredentialError(error: unknown): boolean { if (isVertexAuthError(error)) { clearGcpCredentialsCache() return true } return false } function shouldRetry(error: APIError): boolean { // Never retry mock errors - they're from /mock-limits command for testing if (isMockRateLimitError(error)) { return false } // Persistent mode: 429/529 always retryable, bypass subscriber gates and // x-should-retry header. if (isPersistentRetryEnabled() && isTransientCapacityError(error)) { return true } // CCR mode: auth is via infrastructure-provided JWTs, so a 401/403 is a // transient blip (auth service flap, network hiccup) rather than bad // credentials. Bypass x-should-retry:false — the server assumes we'd retry // the same bad key, but our key is fine. if ( isEnvTruthy(process.env.CLAUDE_CODE_REMOTE) && (error.status === 401 || error.status === 403) ) { return true } // Check for overloaded errors first by examining the message content // The SDK sometimes fails to properly pass the 529 status code during streaming, // so we need to check the error message directly if (error.message?.includes('"type":"overloaded_error"')) { return true } // Check for max tokens context overflow errors that we can handle if (parseMaxTokensContextOverflowError(error)) { return true } // Note this is not a standard header. const shouldRetryHeader = error.headers?.get('x-should-retry') // If the server explicitly says whether or not to retry, obey. // For Max and Pro users, should-retry is true, but in several hours, so we shouldn't. // Enterprise users can retry because they typically use PAYG instead of rate limits. if ( shouldRetryHeader === 'true' && (!isClaudeAISubscriber() || isEnterpriseSubscriber()) ) { return true } // Ants can ignore x-should-retry: false for 5xx server errors only. // For other status codes (401, 403, 400, 429, etc.), respect the header. if (shouldRetryHeader === 'false') { const is5xxError = error.status !== undefined && error.status >= 500 if (!(process.env.USER_TYPE === 'ant' && is5xxError)) { return false } } if (error instanceof APIConnectionError) { return true } if (!error.status) return false // Retry on request timeouts. if (error.status === 408) return true // Retry on lock timeouts. if (error.status === 409) return true // Retry on rate limits, but not for ClaudeAI Subscription users // Enterprise users can retry because they typically use PAYG instead of rate limits if (error.status === 429) { return !isClaudeAISubscriber() || isEnterpriseSubscriber() } // Clear API key cache on 401 and allow retry. // OAuth token handling is done in the main retry loop via handleOAuth401Error. if (error.status === 401) { clearApiKeyHelperCache() return true } // Retry on 403 "token revoked" (same refresh logic as 401, see above) if (isOAuthTokenRevokedError(error)) { return true } // Retry internal errors. if (error.status && error.status >= 500) return true return false } export function getDefaultMaxRetries(): number { if (process.env.CLAUDE_CODE_MAX_RETRIES) { return parseInt(process.env.CLAUDE_CODE_MAX_RETRIES, 10) } return DEFAULT_MAX_RETRIES } function getMaxRetries(options: RetryOptions): number { return options.maxRetries ?? getDefaultMaxRetries() } const DEFAULT_FAST_MODE_FALLBACK_HOLD_MS = 30 * 60 * 1000 // 30 minutes const SHORT_RETRY_THRESHOLD_MS = 20 * 1000 // 20 seconds const MIN_COOLDOWN_MS = 10 * 60 * 1000 // 10 minutes function getRetryAfterMs(error: APIError): number | null { const retryAfter = getRetryAfter(error) if (retryAfter) { const seconds = parseInt(retryAfter, 10) if (!isNaN(seconds)) { return seconds * 1000 } } return null } function getRateLimitResetDelayMs(error: APIError): number | null { const resetHeader = error.headers?.get?.('anthropic-ratelimit-unified-reset') if (!resetHeader) return null const resetUnixSec = Number(resetHeader) if (!Number.isFinite(resetUnixSec)) return null const delayMs = resetUnixSec * 1000 - Date.now() if (delayMs <= 0) return null return Math.min(delayMs, PERSISTENT_RESET_CAP_MS) }