// biome-ignore-all assist/source/organizeImports: ANT-ONLY import markers must not be reordered
import { CONTEXT_1M_BETA_HEADER } from '../constants/betas.js'
import { getGlobalConfig } from './config.js'
import { isEnvTruthy } from './envUtils.js'
import { getCanonicalName } from './model/model.js'
import { getModelCapability } from './model/modelCapabilities.js'
// NOTE(review): `resolveAntModel` is called below but no import for it is
// visible in this file — presumably it is provided by an ANT-ONLY import that
// is not part of this view. Confirm before building this file standalone.

// Default model context window size (200k tokens). Used as the fallback when
// no env override, [1m] opt-in, capability entry, 1M beta header, or 1M
// experiment applies (see getContextWindowForModel).
export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000

// Maximum output tokens for compact operations
export const COMPACT_MAX_OUTPUT_TOKENS = 20_000

// Default max output tokens
const MAX_OUTPUT_TOKENS_DEFAULT = 32_000
const MAX_OUTPUT_TOKENS_UPPER_LIMIT = 64_000

// Capped default for slot-reservation optimization. BQ p99 output = 4,911
// tokens, so 32k/64k defaults over-reserve 8-16× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64k
// (see query.ts max_output_tokens_escalate). Cap is applied in
// claude.ts:getMaxOutputTokensForModel to avoid the growthbook→betas→context
// import cycle.
export const CAPPED_DEFAULT_MAX_TOKENS = 8_000
export const ESCALATED_MAX_TOKENS = 64_000

/**
 * Check if 1M context is disabled via environment variable.
 * Used by C4E admins to disable 1M context for HIPAA compliance.
 */
export function is1mContextDisabled(): boolean {
  return isEnvTruthy(process.env.CLAUDE_CODE_DISABLE_1M_CONTEXT)
}

/**
 * Whether the model string contains the explicit `[1m]` opt-in marker
 * (matched case-insensitively). Always false when 1M context is disabled.
 */
export function has1mContext(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  return /\[1m\]/i.test(model)
}

// @[MODEL LAUNCH]: Update this pattern if the new model supports 1M context
/**
 * Whether the model's canonical name matches one of the 1M-capable families
 * ('claude-sonnet-4' or 'opus-4-6'). Always false when 1M context is disabled.
 */
export function modelSupports1M(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  const canonical = getCanonicalName(model)
  return canonical.includes('claude-sonnet-4') || canonical.includes('opus-4-6')
}

/**
 * Resolve the context window size (in tokens) for a model.
 *
 * Resolution order, first match wins:
 *  1. CLAUDE_CODE_MAX_CONTEXT_TOKENS env override (only when USER_TYPE is 'ant')
 *  2. explicit `[1m]` marker in the model string
 *  3. `max_input_tokens` from the model capability entry (when >= 100k),
 *     clamped to the default when 1M context is disabled
 *  4. 1M beta header present and the model supports 1M
 *  5. sonnet 1M experiment treatment
 *  6. ant model's configured `contextWindow` (only when USER_TYPE is 'ant')
 *  7. MODEL_CONTEXT_WINDOW_DEFAULT
 *
 * @param model - Model identifier, optionally carrying a `[1m]` marker.
 * @param betas - Beta headers in effect for the request, if any.
 * @returns The context window size in tokens.
 */
export function getContextWindowForModel(
  model: string,
  betas?: string[],
): number {
  // Allow override via environment variable (ant-only)
  // This takes precedence over all other context window resolution, including 1M detection,
  // so users can cap the effective context window for local decisions (auto-compact, etc.)
  // while still using a 1M-capable endpoint.
  if (
    process.env.USER_TYPE === 'ant' &&
    process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
  ) {
    const override = Number.parseInt(
      process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS,
      10,
    )
    // Unparseable or non-positive values are ignored and resolution continues.
    if (!Number.isNaN(override) && override > 0) {
      return override
    }
  }

  // [1m] suffix — explicit client-side opt-in, respected over all detection
  if (has1mContext(model)) {
    return 1_000_000
  }

  const cap = getModelCapability(model)
  if (cap?.max_input_tokens && cap.max_input_tokens >= 100_000) {
    // A capability larger than the default is clamped back to the default
    // when 1M context has been disabled.
    if (
      cap.max_input_tokens > MODEL_CONTEXT_WINDOW_DEFAULT &&
      is1mContextDisabled()
    ) {
      return MODEL_CONTEXT_WINDOW_DEFAULT
    }
    return cap.max_input_tokens
  }

  if (betas?.includes(CONTEXT_1M_BETA_HEADER) && modelSupports1M(model)) {
    return 1_000_000
  }

  if (getSonnet1mExpTreatmentEnabled(model)) {
    return 1_000_000
  }

  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model)
    if (antModel?.contextWindow) {
      return antModel.contextWindow
    }
  }

  return MODEL_CONTEXT_WINDOW_DEFAULT
}

/**
 * Whether the sonnet 1M experiment treatment applies to this model.
 *
 * True only when 1M context is not disabled, the model has no explicit
 * `[1m]` marker, its canonical name contains 'sonnet-4-6', and the cached
 * client data flag 'coral_reef_sonnet' equals the string 'true'.
 */
export function getSonnet1mExpTreatmentEnabled(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  // Only applies to sonnet 4.6 without an explicit [1m] suffix
  if (has1mContext(model)) {
    return false
  }
  if (!getCanonicalName(model).includes('sonnet-4-6')) {
    return false
  }
  return getGlobalConfig().clientDataCache?.['coral_reef_sonnet'] === 'true'
}

/**
 * Calculate context window usage percentage from token usage data.
 * Returns used and remaining percentages (integers clamped to 0–100), or
 * null values if there is no usage data or the context window size is not a
 * positive number (a percentage of such a window is undefined and would
 * otherwise surface as NaN).
 */
export function calculateContextPercentages(
  currentUsage: {
    input_tokens: number
    cache_creation_input_tokens: number
    cache_read_input_tokens: number
  } | null,
  contextWindowSize: number,
): { used: number | null; remaining: number | null } {
  // `!(x > 0)` also rejects NaN, which `x <= 0` would let through.
  if (!currentUsage || !(contextWindowSize > 0)) {
    return { used: null, remaining: null }
  }

  const totalInputTokens =
    currentUsage.input_tokens +
    currentUsage.cache_creation_input_tokens +
    currentUsage.cache_read_input_tokens

  const usedPercentage = Math.round(
    (totalInputTokens / contextWindowSize) * 100,
  )
  const clampedUsed = Math.min(100, Math.max(0, usedPercentage))

  return {
    used: clampedUsed,
    remaining: 100 - clampedUsed,
  }
}

// Max output token limits keyed by substrings of the canonical model name.
// ORDER MATTERS: the first entry with a matching pattern wins, so specific
// names (e.g. 'opus-4-6', 'opus-4-5') must precede broader ones ('opus-4').
const MODEL_OUTPUT_TOKEN_LIMITS: ReadonlyArray<{
  patterns: readonly string[]
  default: number
  upperLimit: number
}> = [
  { patterns: ['opus-4-6'], default: 64_000, upperLimit: 128_000 },
  { patterns: ['sonnet-4-6'], default: 32_000, upperLimit: 128_000 },
  {
    patterns: ['opus-4-5', 'sonnet-4', 'haiku-4'],
    default: 32_000,
    upperLimit: 64_000,
  },
  // 'opus-4' also matches 'opus-4-1' names, so a single pattern suffices.
  { patterns: ['opus-4'], default: 32_000, upperLimit: 32_000 },
  { patterns: ['claude-3-opus'], default: 4_096, upperLimit: 4_096 },
  { patterns: ['claude-3-sonnet'], default: 8_192, upperLimit: 8_192 },
  { patterns: ['claude-3-haiku'], default: 4_096, upperLimit: 4_096 },
  { patterns: ['3-5-sonnet', '3-5-haiku'], default: 8_192, upperLimit: 8_192 },
  { patterns: ['3-7-sonnet'], default: 32_000, upperLimit: 64_000 },
]

/**
 * Returns the model's default and upper limit for max output tokens.
 *
 * When USER_TYPE is 'ant' and the model resolves to an ant model, that
 * model's configured limits are returned directly (falling back to the
 * module defaults for unset fields). Otherwise the limits come from the
 * canonical-name table above, and a capability entry with
 * `max_tokens >= 4096` replaces the upper limit and caps the default.
 */
export function getModelMaxOutputTokens(model: string): {
  default: number
  upperLimit: number
} {
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model.toLowerCase())
    if (antModel) {
      return {
        default: antModel.defaultMaxTokens ?? MAX_OUTPUT_TOKENS_DEFAULT,
        upperLimit:
          antModel.upperMaxTokensLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT,
      }
    }
  }

  const canonical = getCanonicalName(model)
  const known = MODEL_OUTPUT_TOKEN_LIMITS.find(entry =>
    entry.patterns.some(pattern => canonical.includes(pattern)),
  )
  let defaultTokens = known?.default ?? MAX_OUTPUT_TOKENS_DEFAULT
  let upperLimit = known?.upperLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT

  const cap = getModelCapability(model)
  if (cap?.max_tokens && cap.max_tokens >= 4_096) {
    upperLimit = cap.max_tokens
    // Keep the default from exceeding the capability-provided ceiling.
    defaultTokens = Math.min(defaultTokens, upperLimit)
  }

  return { default: defaultTokens, upperLimit }
}

/**
 * Returns the max thinking budget tokens for a given model. The max
 * thinking tokens should be strictly less than the max output tokens.
 *
 * Deprecated since newer models use adaptive thinking rather than a
 * strict thinking token budget.
 */
export function getMaxThinkingTokensForModel(model: string): number {
  return getModelMaxOutputTokens(model).upperLimit - 1
}