mono/packages/kbot/ref/utils/context.ts
2026-04-01 01:05:48 +02:00

222 lines
6.7 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// biome-ignore-all assist/source/organizeImports: ANT-ONLY import markers must not be reordered
import { CONTEXT_1M_BETA_HEADER } from '../constants/betas.js'
import { getGlobalConfig } from './config.js'
import { isEnvTruthy } from './envUtils.js'
import { getCanonicalName } from './model/model.js'
import { getModelCapability } from './model/modelCapabilities.js'
// Model context window size (200k tokens for all models right now)
export const MODEL_CONTEXT_WINDOW_DEFAULT = 200_000
// Maximum output tokens for compact operations
export const COMPACT_MAX_OUTPUT_TOKENS = 20_000
// Default max output tokens when the model family isn't recognized
// (see getModelMaxOutputTokens below)
const MAX_OUTPUT_TOKENS_DEFAULT = 32_000
// Fallback upper limit on max output tokens for unrecognized model families
const MAX_OUTPUT_TOKENS_UPPER_LIMIT = 64_000
// Capped default for slot-reservation optimization. BQ p99 output = 4,911
// tokens, so 32k/64k defaults over-reserve 8-16× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64k
// (see query.ts max_output_tokens_escalate). Cap is applied in
// claude.ts:getMaxOutputTokensForModel to avoid the growthbook→betas→context
// import cycle.
export const CAPPED_DEFAULT_MAX_TOKENS = 8_000
// Retry ceiling used when a capped request hits its output-token limit.
export const ESCALATED_MAX_TOKENS = 64_000
/**
 * Whether 1M-token context has been globally disabled via the
 * CLAUDE_CODE_DISABLE_1M_CONTEXT environment variable.
 * Used by C4E admins to disable 1M context for HIPAA compliance.
 */
export function is1mContextDisabled(): boolean {
  const killSwitch = process.env.CLAUDE_CODE_DISABLE_1M_CONTEXT
  return isEnvTruthy(killSwitch)
}
/**
 * Whether the model name carries an explicit [1m] context opt-in suffix.
 * The global kill switch overrides the suffix.
 */
export function has1mContext(model: string): boolean {
  // Kill switch wins even over an explicit client-side opt-in.
  return is1mContextDisabled() ? false : /\[1m\]/i.test(model)
}
// @[MODEL LAUNCH]: Update this pattern if the new model supports 1M context
export function modelSupports1M(model: string): boolean {
  if (is1mContextDisabled()) {
    return false
  }
  // Families known to support the 1M context window.
  const supportedFamilies = ['claude-sonnet-4', 'opus-4-6']
  const canonical = getCanonicalName(model)
  return supportedFamilies.some(family => canonical.includes(family))
}
/**
 * Resolve the effective context window (in tokens) for a model.
 * Resolution order matters — each check below short-circuits the rest.
 */
export function getContextWindowForModel(
  model: string,
  betas?: string[],
): number {
  // 1. Ant-only env override. Takes precedence over all other resolution,
  //    including 1M detection, so users can cap the effective window for
  //    local decisions (auto-compact, etc.) while still using a 1M-capable
  //    endpoint.
  if (
    process.env.USER_TYPE === 'ant' &&
    process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS
  ) {
    const parsed = Number.parseInt(
      process.env.CLAUDE_CODE_MAX_CONTEXT_TOKENS,
      10,
    )
    if (!Number.isNaN(parsed) && parsed > 0) {
      return parsed
    }
  }
  // 2. [1m] suffix — explicit client-side opt-in, respected over detection.
  if (has1mContext(model)) {
    return 1_000_000
  }
  // 3. Capability-reported input limit (trusted when >= 100k), clamped back
  //    to the default window when the 1M kill switch is on.
  const capability = getModelCapability(model)
  if (capability?.max_input_tokens && capability.max_input_tokens >= 100_000) {
    const exceedsDefault =
      capability.max_input_tokens > MODEL_CONTEXT_WINDOW_DEFAULT
    if (exceedsDefault && is1mContextDisabled()) {
      return MODEL_CONTEXT_WINDOW_DEFAULT
    }
    return capability.max_input_tokens
  }
  // 4. 1M beta header opt-in, for models that support 1M.
  if (betas?.includes(CONTEXT_1M_BETA_HEADER) && modelSupports1M(model)) {
    return 1_000_000
  }
  // 5. Sonnet 1M experiment treatment.
  if (getSonnet1mExpTreatmentEnabled(model)) {
    return 1_000_000
  }
  // 6. Ant-internal model registry.
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model)
    if (antModel?.contextWindow) {
      return antModel.contextWindow
    }
  }
  return MODEL_CONTEXT_WINDOW_DEFAULT
}
/**
 * Whether the sonnet 1M experiment treatment applies to this model.
 * Only applies to sonnet 4.6 without an explicit [1m] suffix, and never
 * when the 1M kill switch is on.
 */
export function getSonnet1mExpTreatmentEnabled(model: string): boolean {
  if (is1mContextDisabled() || has1mContext(model)) {
    return false
  }
  const canonical = getCanonicalName(model)
  if (!canonical.includes('sonnet-4-6')) {
    return false
  }
  const treatment = getGlobalConfig().clientDataCache?.['coral_reef_sonnet']
  return treatment === 'true'
}
/**
 * Calculate context window usage percentage from token usage data.
 *
 * @param currentUsage - Raw token counts (plain input tokens plus
 *   cache-creation and cache-read input tokens), or null when no usage data
 *   is available yet.
 * @param contextWindowSize - Total context window size in tokens.
 * @returns used/remaining as integer percentages clamped to [0, 100], or
 *   null values when there is no usage data or the window size is not a
 *   positive number.
 */
export function calculateContextPercentages(
  currentUsage: {
    input_tokens: number
    cache_creation_input_tokens: number
    cache_read_input_tokens: number
  } | null,
  contextWindowSize: number,
): { used: number | null; remaining: number | null } {
  // Guard the divisor too: a zero or negative window would otherwise
  // propagate NaN (0/0) or a meaningless percentage to callers.
  if (!currentUsage || !(contextWindowSize > 0)) {
    return { used: null, remaining: null }
  }
  // All three token categories count against the context window.
  const totalInputTokens =
    currentUsage.input_tokens +
    currentUsage.cache_creation_input_tokens +
    currentUsage.cache_read_input_tokens
  const usedPercentage = Math.round(
    (totalInputTokens / contextWindowSize) * 100,
  )
  // Clamp so overflow (usage beyond the window) reads as 100% used.
  const clampedUsed = Math.min(100, Math.max(0, usedPercentage))
  return {
    used: clampedUsed,
    remaining: 100 - clampedUsed,
  }
}
/**
 * Returns the model's default and upper limit for max output tokens.
 */
export function getModelMaxOutputTokens(model: string): {
  default: number
  upperLimit: number
} {
  // Ant-internal models carry their own limits in the registry.
  if (process.env.USER_TYPE === 'ant') {
    const antModel = resolveAntModel(model.toLowerCase())
    if (antModel) {
      return {
        default: antModel.defaultMaxTokens ?? MAX_OUTPUT_TOKENS_DEFAULT,
        upperLimit:
          antModel.upperMaxTokensLimit ?? MAX_OUTPUT_TOKENS_UPPER_LIMIT,
      }
    }
  }
  // Ordered family rules: the first matching substring wins, so more
  // specific names (e.g. opus-4-6) must precede their prefixes (opus-4).
  const familyLimits: ReadonlyArray<
    [patterns: string[], defaultTokens: number, upperLimit: number]
  > = [
    [['opus-4-6'], 64_000, 128_000],
    [['sonnet-4-6'], 32_000, 128_000],
    [['opus-4-5', 'sonnet-4', 'haiku-4'], 32_000, 64_000],
    [['opus-4-1', 'opus-4'], 32_000, 32_000],
    [['claude-3-opus'], 4_096, 4_096],
    [['claude-3-sonnet'], 8_192, 8_192],
    [['claude-3-haiku'], 4_096, 4_096],
    [['3-5-sonnet', '3-5-haiku'], 8_192, 8_192],
    [['3-7-sonnet'], 32_000, 64_000],
  ]
  const canonical = getCanonicalName(model)
  let defaultTokens = MAX_OUTPUT_TOKENS_DEFAULT
  let upperLimit = MAX_OUTPUT_TOKENS_UPPER_LIMIT
  for (const [patterns, familyDefault, familyUpper] of familyLimits) {
    if (patterns.some(p => canonical.includes(p))) {
      defaultTokens = familyDefault
      upperLimit = familyUpper
      break
    }
  }
  // A capability-reported hard max (when plausible, >= 4096) overrides the
  // family upper limit and pulls the default down under the new ceiling.
  const capability = getModelCapability(model)
  if (capability?.max_tokens && capability.max_tokens >= 4_096) {
    upperLimit = capability.max_tokens
    defaultTokens = Math.min(defaultTokens, upperLimit)
  }
  return { default: defaultTokens, upperLimit }
}
/**
 * Returns the max thinking budget tokens for a given model. The max
 * thinking tokens should be strictly less than the max output tokens.
 *
 * @deprecated Newer models use adaptive thinking rather than a strict
 * thinking token budget.
 */
export function getMaxThinkingTokensForModel(model: string): number {
  const { upperLimit } = getModelMaxOutputTokens(model)
  // Strictly below the output ceiling.
  return upperLimit - 1
}