Compare commits

...

2 Commits

Author SHA1 Message Date
argenis de la rosa
0982f74aee style: fix rustfmt alignment in test assertions 2026-03-14 07:07:02 -04:00
argenis de la rosa
db58d8f473 fix: proactively truncate channel history before sending to model
Channel sessions accumulate conversation history up to 50 turns with no
token-budget check before the LLM call.  On models with smaller context
windows (e.g. GLM 131K) this causes "context window exceeded" errors
after extended conversations.  The existing compaction logic only runs
*after* the error, wasting a round-trip.

Add `truncate_history_to_token_budget` which estimates token count using
a conservative 4-chars-per-token heuristic and drops the oldest
non-system turns until the history fits within a 100K token budget.  The
function is called right after building the outbound history, before any
provider call.

Closes #3460
2026-03-14 06:55:22 -04:00

View File

@ -189,6 +189,13 @@ const MEMORY_CONTEXT_ENTRY_MAX_CHARS: usize = 800;
/// Cap on the total characters of memory context injected into a prompt.
/// NOTE(review): inferred from the name — confirm against the code that builds
/// the memory context.
const MEMORY_CONTEXT_MAX_CHARS: usize = 4_000;
/// Number of most-recent messages kept when sender history is compacted —
/// presumably consumed by `compact_sender_history`; verify at the call site.
const CHANNEL_HISTORY_COMPACT_KEEP_MESSAGES: usize = 12;
/// Per-message character cap applied to compacted history content.
/// NOTE(review): inferred from the name; confirm against the compaction logic.
const CHANNEL_HISTORY_COMPACT_CONTENT_CHARS: usize = 600;
/// Conservative token budget for channel history. Before sending to the model,
/// the history is proactively truncated to fit within this limit using a
/// character-based estimate (~4 chars per token). This prevents context-window
/// overflow errors that would otherwise require a round-trip to the API.
const MAX_HISTORY_ESTIMATED_TOKENS: usize = 100_000;
/// Approximate characters per token for the conservative estimator.
const CHARS_PER_TOKEN_ESTIMATE: usize = 4;
/// Guardrail for hook-modified outbound channel content.
const CHANNEL_HOOK_MAX_OUTBOUND_CHARS: usize = 20_000;
@ -919,6 +926,52 @@ fn compact_sender_history(ctx: &ChannelRuntimeContext, sender_key: &str) -> bool
true
}
/// Estimate the total token count of a message list using a conservative
/// character-based heuristic (~4 characters per token).
fn estimate_history_tokens(messages: &[ChatMessage]) -> usize {
    let mut total = 0usize;
    for message in messages {
        // Round each message up independently so short messages still cost
        // at least one token in the estimate.
        total += message.content.len().div_ceil(CHARS_PER_TOKEN_ESTIMATE);
    }
    total
}
/// Proactively truncate conversation history so the estimated token count
/// stays within `MAX_HISTORY_ESTIMATED_TOKENS`. The system message (first
/// element) and the most recent user message (last element) are always
/// preserved; the oldest non-system turns are dropped first.
///
/// Returns `true` if any messages were removed.
fn truncate_history_to_token_budget(history: &mut Vec<ChatMessage>) -> bool {
    // Nothing droppable: only the system message and/or the latest turn.
    if history.len() <= 2 {
        return false;
    }
    let mut estimated = estimate_history_tokens(history);
    if estimated <= MAX_HISTORY_ESTIMATED_TOKENS {
        return false;
    }
    // Walk the droppable middle range (index 1 .. len-1) from oldest to
    // newest and find how many turns must go. Computing the split point first
    // lets us remove them with a single `drain`, avoiding the O(n^2) cost of
    // calling `Vec::remove(1)` once per dropped turn.
    let mut drop_end = 1usize;
    while estimated > MAX_HISTORY_ESTIMATED_TOKENS && drop_end < history.len() - 1 {
        let dropped_tokens = history[drop_end]
            .content
            .len()
            .div_ceil(CHARS_PER_TOKEN_ESTIMATE);
        estimated = estimated.saturating_sub(dropped_tokens);
        drop_end += 1;
    }
    let removed = drop_end - 1;
    if removed == 0 {
        // Unreachable given the guards above, but cheap insurance against
        // future changes to the budget checks.
        return false;
    }
    history.drain(1..drop_end);
    tracing::info!(
        removed_turns = removed,
        estimated_tokens = estimated,
        remaining_turns = history.len(),
        "Proactively truncated channel history to fit token budget"
    );
    true
}
fn append_sender_turn(ctx: &ChannelRuntimeContext, sender_key: &str, turn: ChatMessage) {
let mut histories = ctx
.conversation_histories
@ -1814,6 +1867,12 @@ async fn process_channel_message(
build_channel_system_prompt(ctx.system_prompt.as_str(), &msg.channel, &msg.reply_target);
let mut history = vec![ChatMessage::system(system_prompt)];
history.extend(prior_turns);
// Proactively truncate history to avoid context-window overflow errors.
// This drops the oldest non-system turns until the estimated token count
// is within the budget, preventing a wasted round-trip to the API.
truncate_history_to_token_budget(&mut history);
let use_streaming = target_channel
.as_ref()
.is_some_and(|ch| ch.supports_draft_updates());
@ -4020,6 +4079,59 @@ mod tests {
}));
}
#[test]
fn estimate_history_tokens_basic() {
    // 400 chars / 4 = 100 tokens; 800 chars / 4 = 200 tokens.
    let history = [
        ChatMessage::system("a".repeat(400)),
        ChatMessage::user("b".repeat(800)),
    ];
    let total = estimate_history_tokens(&history);
    assert_eq!(total, 300);
}
#[test]
fn truncate_history_within_budget_is_noop() {
    // A tiny two-message history is far under budget — nothing should change.
    let mut history = vec![ChatMessage::system("sys"), ChatMessage::user("short message")];
    let changed = truncate_history_to_token_budget(&mut history);
    assert!(!changed);
    assert_eq!(history.len(), 2);
}
#[test]
fn truncate_history_drops_oldest_non_system_turns() {
    // Four ~25K-token turns (100K chars each) push the estimate past the
    // 100K-token budget, forcing the oldest middle turns to be dropped.
    let filler = "x".repeat(100_000);
    let mut history = vec![ChatMessage::system("system prompt")];
    for i in 0..4 {
        let turn = if i % 2 == 0 {
            ChatMessage::user(filler.clone())
        } else {
            ChatMessage::assistant(filler.clone())
        };
        history.push(turn);
    }
    history.push(ChatMessage::user("latest question"));
    assert!(truncate_history_to_token_budget(&mut history));
    // The system prompt and the newest user turn must survive truncation.
    assert_eq!(history[0].role, "system");
    assert_eq!(history.last().unwrap().content, "latest question");
    // And what remains now fits inside the budget.
    assert!(estimate_history_tokens(&history) <= MAX_HISTORY_ESTIMATED_TOKENS);
}
#[test]
fn truncate_history_preserves_system_and_last_user() {
    // With only two messages there is no droppable middle, so even a
    // grossly over-budget history must be left untouched and the function
    // must report that nothing was removed.
    let huge = "y".repeat(500_000);
    let mut history = vec![ChatMessage::system(huge.clone()), ChatMessage::user(huge)];
    let truncated = truncate_history_to_token_budget(&mut history);
    assert!(!truncated);
    assert_eq!(history.len(), 2);
}
#[test]
fn append_sender_turn_stores_single_turn_per_call() {
let sender = "telegram_u2".to_string();