feat(security): add canary token exfiltration guard

2026-03-03 16:16:24 -05:00 · 2026-03-03 16:16:24 -05:00 · 3702d224e9
commit 3702d224e9
parent 429ea06d69
7 changed files with 498 additions and 100 deletions
--- a/docs/config-reference.md
+++ b/docs/config-reference.md
@ -377,6 +377,18 @@ Environment overrides:
 - `ZEROCLAW_URL_ACCESS_DOMAIN_BLOCKLIST` / `URL_ACCESS_DOMAIN_BLOCKLIST` (comma-separated)
 - `ZEROCLAW_URL_ACCESS_APPROVED_DOMAINS` / `URL_ACCESS_APPROVED_DOMAINS` (comma-separated)

+## `[security]`
+
+| Key | Default | Purpose |
+|---|---|---|
+| `canary_tokens` | `true` | Inject per-turn canary token into system prompt and block responses that echo it |
+
+Notes:
+
+- Canary tokens are generated per turn and are redacted from runtime traces.
+- This guard is additive to `security.outbound_leak_guard`: canary catches prompt-context leakage, while outbound leak guard catches credential-like material.
+- Set `canary_tokens = false` to disable this layer.
+
 ## `[security.syscall_anomaly]`

 | Key | Default | Purpose |
--- a/docs/i18n/vi/config-reference.md
+++ b/docs/i18n/vi/config-reference.md
@ -530,6 +530,7 @@ Lưu ý:
 - Allowlist kênh mặc định từ chối tất cả (`[]` nghĩa là từ chối tất cả)
 - Gateway mặc định yêu cầu ghép nối
 - Mặc định chặn public bind
+- `security.canary_tokens = true` bật canary token theo từng lượt để phát hiện rò rỉ ngữ cảnh hệ thống

 ## Lệnh kiểm tra

--- a/src/agent/loop_.rs
+++ b/src/agent/loop_.rs
@ -10,7 +10,7 @@ use crate::providers::{
    ToolCall,
 };
 use crate::runtime;
-use crate::security::SecurityPolicy;
+use crate::security::{CanaryGuard, SecurityPolicy};
 use crate::tools::{self, Tool};
 use crate::util::truncate_with_ellipsis;
 use anyhow::Result;
@ -72,6 +72,10 @@ const MAX_TOKENS_CONTINUATION_PROMPT: &str = "Previous response was truncated by
 const MAX_TOKENS_CONTINUATION_NOTICE: &str =
    "\n\n[Response may be truncated due to continuation limits. Reply \"continue\" to resume.]";

+/// Returned when canary token exfiltration is detected in model output.
+const CANARY_EXFILTRATION_BLOCK_MESSAGE: &str =
+    "I blocked that response because it attempted to reveal protected internal context.";
+
 /// Minimum user-message length (in chars) for auto-save to memory.
 /// Matches the channel-side constant in `channels/mod.rs`.
 const AUTOSAVE_MIN_MESSAGE_CHARS: usize = 20;
@ -280,6 +284,10 @@ tokio::task_local! {
    static TOOL_LOOP_REPLY_TARGET: Option<String>;
 }

+tokio::task_local! {
+    static TOOL_LOOP_CANARY_TOKENS_ENABLED: bool;
+}
+
 const AUTO_CRON_DELIVERY_CHANNELS: &[&str] = &[
    "telegram",
    "discord",
@ -895,25 +903,29 @@ pub(crate) async fn agent_turn(
    multimodal_config: &crate::config::MultimodalConfig,
    max_tool_iterations: usize,
 ) -> Result<String> {
-    run_tool_call_loop(
-        provider,
-        history,
-        tools_registry,
-        observer,
-        provider_name,
-        model,
-        temperature,
-        silent,
-        None,
-        "channel",
-        multimodal_config,
-        max_tool_iterations,
-        None,
-        None,
-        None,
-        &[],
-    )
-    .await
+    TOOL_LOOP_CANARY_TOKENS_ENABLED
+        .scope(
+            false,
+            run_tool_call_loop(
+                provider,
+                history,
+                tools_registry,
+                observer,
+                provider_name,
+                model,
+                temperature,
+                silent,
+                None,
+                "channel",
+                multimodal_config,
+                max_tool_iterations,
+                None,
+                None,
+                None,
+                &[],
+            ),
+        )
+        .await
 }

 /// Run the tool loop with channel reply_target context, used by channel runtimes
@ -942,25 +954,28 @@ pub(crate) async fn run_tool_call_loop_with_reply_target(
    TOOL_LOOP_PROGRESS_MODE
        .scope(
            progress_mode,
-            TOOL_LOOP_REPLY_TARGET.scope(
-                reply_target.map(str::to_string),
-                run_tool_call_loop(
-                    provider,
-                    history,
-                    tools_registry,
-                    observer,
-                    provider_name,
-                    model,
-                    temperature,
-                    silent,
-                    approval,
-                    channel_name,
-                    multimodal_config,
-                    max_tool_iterations,
-                    cancellation_token,
-                    on_delta,
-                    hooks,
-                    excluded_tools,
+            TOOL_LOOP_CANARY_TOKENS_ENABLED.scope(
+                false,
+                TOOL_LOOP_REPLY_TARGET.scope(
+                    reply_target.map(str::to_string),
+                    run_tool_call_loop(
+                        provider,
+                        history,
+                        tools_registry,
+                        observer,
+                        provider_name,
+                        model,
+                        temperature,
+                        silent,
+                        approval,
+                        channel_name,
+                        multimodal_config,
+                        max_tool_iterations,
+                        cancellation_token,
+                        on_delta,
+                        hooks,
+                        excluded_tools,
+                    ),
                ),
            ),
        )
@ -989,6 +1004,7 @@ pub(crate) async fn run_tool_call_loop_with_non_cli_approval_context(
    excluded_tools: &[String],
    progress_mode: ProgressMode,
    safety_heartbeat: Option<SafetyHeartbeatConfig>,
+    canary_tokens_enabled: bool,
 ) -> Result<String> {
    let reply_target = non_cli_approval_context
        .as_ref()
@ -999,27 +1015,30 @@ pub(crate) async fn run_tool_call_loop_with_non_cli_approval_context(
            progress_mode,
            SAFETY_HEARTBEAT_CONFIG.scope(
                safety_heartbeat,
-                TOOL_LOOP_NON_CLI_APPROVAL_CONTEXT.scope(
-                    non_cli_approval_context,
-                    TOOL_LOOP_REPLY_TARGET.scope(
-                        reply_target,
-                        run_tool_call_loop(
-                            provider,
-                            history,
-                            tools_registry,
-                            observer,
-                            provider_name,
-                            model,
-                            temperature,
-                            silent,
-                            approval,
-                            channel_name,
-                            multimodal_config,
-                            max_tool_iterations,
-                            cancellation_token,
-                            on_delta,
-                            hooks,
-                            excluded_tools,
+                TOOL_LOOP_CANARY_TOKENS_ENABLED.scope(
+                    canary_tokens_enabled,
+                    TOOL_LOOP_NON_CLI_APPROVAL_CONTEXT.scope(
+                        non_cli_approval_context,
+                        TOOL_LOOP_REPLY_TARGET.scope(
+                            reply_target,
+                            run_tool_call_loop(
+                                provider,
+                                history,
+                                tools_registry,
+                                observer,
+                                provider_name,
+                                model,
+                                temperature,
+                                silent,
+                                approval,
+                                channel_name,
+                                multimodal_config,
+                                max_tool_iterations,
+                                cancellation_token,
+                                on_delta,
+                                hooks,
+                                excluded_tools,
+                            ),
                        ),
                    ),
                ),
@ -1109,6 +1128,23 @@ pub async fn run_tool_call_loop(
        .flatten();
    let mut progress_tracker = ProgressTracker::default();
    let mut active_model = model.to_string();
+    let canary_guard = CanaryGuard::new(
+        TOOL_LOOP_CANARY_TOKENS_ENABLED
+            .try_with(|enabled| *enabled)
+            .unwrap_or(false),
+    );
+    let mut turn_canary_token: Option<String> = None;
+    if let Some(system_message) = history.first_mut() {
+        if system_message.role == "system" {
+            let (updated_prompt, token) = canary_guard.inject_turn_token(&system_message.content);
+            system_message.content = updated_prompt;
+            turn_canary_token = token;
+        }
+    }
+    let redact_trace_text = |text: &str| -> String {
+        let scrubbed = scrub_credentials(text);
+        canary_guard.redact_token_from_text(&scrubbed, turn_canary_token.as_deref())
+    };
    let bypass_non_cli_approval_for_turn =
        approval.is_some_and(|mgr| channel_name != "cli" && mgr.consume_non_cli_allow_all_once());
    if bypass_non_cli_approval_for_turn {
@ -1632,7 +1668,7 @@ pub async fn run_tool_call_loop(
                            "iteration": iteration + 1,
                            "invalid_native_tool_json_count": invalid_native_tool_json_count,
                            "response_excerpt": truncate_with_ellipsis(
-                                &scrub_credentials(&response_text),
+                                &redact_trace_text(&response_text),
                                600
                            ),
                        }),
@ -1652,7 +1688,7 @@ pub async fn run_tool_call_loop(
                        "duration_ms": llm_started_at.elapsed().as_millis(),
                        "input_tokens": resp_input_tokens,
                        "output_tokens": resp_output_tokens,
-                        "raw_response": scrub_credentials(&response_text),
+                        "raw_response": redact_trace_text(&response_text),
                        "native_tool_calls": native_calls.len(),
                        "parsed_tool_calls": calls.len(),
                        "continuation_attempts": continuation_attempts,
@ -1725,6 +1761,33 @@ pub async fn run_tool_call_loop(
            parsed_text
        };

+        let canary_exfiltration_detected = canary_guard
+            .response_contains_canary(&response_text, turn_canary_token.as_deref())
+            || canary_guard.response_contains_canary(&display_text, turn_canary_token.as_deref());
+        if canary_exfiltration_detected {
+            runtime_trace::record_event(
+                "security_canary_exfiltration_blocked",
+                Some(channel_name),
+                Some(provider_name),
+                Some(active_model.as_str()),
+                Some(&turn_id),
+                Some(false),
+                Some("llm output contained turn canary token"),
+                serde_json::json!({
+                    "iteration": iteration + 1,
+                    "response_excerpt": truncate_with_ellipsis(&redact_trace_text(&display_text), 600),
+                }),
+            );
+            if let Some(ref tx) = on_delta {
+                let _ = tx.send(DRAFT_CLEAR_SENTINEL.to_string()).await;
+                let _ = tx.send(CANARY_EXFILTRATION_BLOCK_MESSAGE.to_string()).await;
+            }
+            history.push(ChatMessage::assistant(
+                CANARY_EXFILTRATION_BLOCK_MESSAGE.to_string(),
+            ));
+            return Ok(CANARY_EXFILTRATION_BLOCK_MESSAGE.to_string());
+        }
+
        // ── Progress: LLM responded ─────────────────────────────
        if should_emit_verbose_progress(progress_mode) {
            if let Some(ref tx) = on_delta {
@ -1767,7 +1830,7 @@ pub async fn run_tool_call_loop(
                    serde_json::json!({
                        "iteration": iteration + 1,
                        "reason": retry_reason,
-                        "response_excerpt": truncate_with_ellipsis(&scrub_credentials(&display_text), 600),
+                        "response_excerpt": truncate_with_ellipsis(&redact_trace_text(&display_text), 600),
                    }),
                );

@ -1795,7 +1858,7 @@ pub async fn run_tool_call_loop(
                    Some("llm response still implied follow-up action but emitted no tool call after retry"),
                    serde_json::json!({
                        "iteration": iteration + 1,
-                        "response_excerpt": truncate_with_ellipsis(&scrub_credentials(&display_text), 600),
+                        "response_excerpt": truncate_with_ellipsis(&redact_trace_text(&display_text), 600),
                    }),
                );
                anyhow::bail!(
@ -1813,7 +1876,7 @@ pub async fn run_tool_call_loop(
                None,
                serde_json::json!({
                    "iteration": iteration + 1,
-                    "text": scrub_credentials(&display_text),
+                    "text": redact_trace_text(&display_text),
                }),
            );
            // No tool calls — this is the final response.
@ -2809,23 +2872,26 @@ pub async fn run(
                hb_cfg,
                LOOP_DETECTION_CONFIG.scope(
                    ld_cfg,
-                    run_tool_call_loop(
-                        provider.as_ref(),
-                        &mut history,
-                        &tools_registry,
-                        observer.as_ref(),
-                        provider_name,
-                        &model_name,
-                        temperature,
-                        false,
-                        approval_manager.as_ref(),
-                        channel_name,
-                        &config.multimodal,
-                        config.agent.max_tool_iterations,
-                        None,
-                        None,
-                        effective_hooks,
-                        &[],
+                    TOOL_LOOP_CANARY_TOKENS_ENABLED.scope(
+                        config.security.canary_tokens,
+                        run_tool_call_loop(
+                            provider.as_ref(),
+                            &mut history,
+                            &tools_registry,
+                            observer.as_ref(),
+                            provider_name,
+                            &model_name,
+                            temperature,
+                            false,
+                            approval_manager.as_ref(),
+                            channel_name,
+                            &config.multimodal,
+                            config.agent.max_tool_iterations,
+                            None,
+                            None,
+                            effective_hooks,
+                            &[],
+                        ),
                    ),
                ),
            ),
@ -2994,23 +3060,26 @@ pub async fn run(
                    hb_cfg,
                    LOOP_DETECTION_CONFIG.scope(
                        ld_cfg,
-                        run_tool_call_loop(
-                            provider.as_ref(),
-                            &mut history,
-                            &tools_registry,
-                            observer.as_ref(),
-                            provider_name,
-                            &model_name,
-                            temperature,
-                            false,
-                            approval_manager.as_ref(),
-                            channel_name,
-                            &config.multimodal,
-                            config.agent.max_tool_iterations,
-                            None,
-                            None,
-                            effective_hooks,
-                            &[],
+                        TOOL_LOOP_CANARY_TOKENS_ENABLED.scope(
+                            config.security.canary_tokens,
+                            run_tool_call_loop(
+                                provider.as_ref(),
+                                &mut history,
+                                &tools_registry,
+                                observer.as_ref(),
+                                provider_name,
+                                &model_name,
+                                temperature,
+                                false,
+                                approval_manager.as_ref(),
+                                channel_name,
+                                &config.multimodal,
+                                config.agent.max_tool_iterations,
+                                None,
+                                None,
+                                effective_hooks,
+                                &[],
+                            ),
                        ),
                    ),
                ),
@ -3598,6 +3667,54 @@ mod tests {
        }
    }

+    struct EchoCanaryProvider;
+
+    #[async_trait]
+    impl Provider for EchoCanaryProvider {
+        fn capabilities(&self) -> ProviderCapabilities {
+            ProviderCapabilities::default()
+        }
+
+        async fn chat_with_system(
+            &self,
+            _system_prompt: Option<&str>,
+            _message: &str,
+            _model: &str,
+            _temperature: f64,
+        ) -> anyhow::Result<String> {
+            anyhow::bail!("chat_with_system should not be used in canary provider tests");
+        }
+
+        async fn chat(
+            &self,
+            request: ChatRequest<'_>,
+            _model: &str,
+            _temperature: f64,
+        ) -> anyhow::Result<ChatResponse> {
+            let canary = request
+                .messages
+                .iter()
+                .find(|msg| msg.role == "system")
+                .and_then(|msg| {
+                    msg.content.lines().find_map(|line| {
+                        line.trim()
+                            .strip_prefix("Internal security canary token: ")
+                            .map(str::trim)
+                    })
+                })
+                .unwrap_or("NO_CANARY");
+            Ok(ChatResponse {
+                text: Some(format!("Leaking token for test: {canary}")),
+                tool_calls: Vec::new(),
+                usage: None,
+                reasoning_content: None,
+                quota_metadata: None,
+                stop_reason: None,
+                raw_stop_reason: None,
+            })
+        }
+    }
+
    struct CountingTool {
        name: String,
        invocations: Arc<AtomicUsize>,
@ -3890,6 +4007,87 @@ mod tests {
        assert_eq!(result, "vision-ok");
    }

+    #[tokio::test]
+    async fn run_tool_call_loop_blocks_when_canary_token_is_echoed() {
+        let provider = EchoCanaryProvider;
+        let mut history = vec![
+            ChatMessage::system("system prompt"),
+            ChatMessage::user("hello".to_string()),
+        ];
+        let tools_registry: Vec<Box<dyn Tool>> = Vec::new();
+        let observer = NoopObserver;
+
+        let result = TOOL_LOOP_CANARY_TOKENS_ENABLED
+            .scope(
+                true,
+                run_tool_call_loop(
+                    &provider,
+                    &mut history,
+                    &tools_registry,
+                    &observer,
+                    "mock-provider",
+                    "mock-model",
+                    0.0,
+                    true,
+                    None,
+                    "cli",
+                    &crate::config::MultimodalConfig::default(),
+                    3,
+                    None,
+                    None,
+                    None,
+                    &[],
+                ),
+            )
+            .await
+            .expect("canary leak should return a guarded message");
+
+        assert_eq!(result, CANARY_EXFILTRATION_BLOCK_MESSAGE);
+        assert_eq!(
+            history.last().map(|msg| msg.content.as_str()),
+            Some(result.as_str())
+        );
+        assert!(history[0].content.contains("ZC_CANARY_START"));
+    }
+
+    #[tokio::test]
+    async fn run_tool_call_loop_allows_echo_provider_when_canary_guard_disabled() {
+        let provider = EchoCanaryProvider;
+        let mut history = vec![
+            ChatMessage::system("system prompt"),
+            ChatMessage::user("hello".to_string()),
+        ];
+        let tools_registry: Vec<Box<dyn Tool>> = Vec::new();
+        let observer = NoopObserver;
+
+        let result = TOOL_LOOP_CANARY_TOKENS_ENABLED
+            .scope(
+                false,
+                run_tool_call_loop(
+                    &provider,
+                    &mut history,
+                    &tools_registry,
+                    &observer,
+                    "mock-provider",
+                    "mock-model",
+                    0.0,
+                    true,
+                    None,
+                    "cli",
+                    &crate::config::MultimodalConfig::default(),
+                    3,
+                    None,
+                    None,
+                    None,
+                    &[],
+                ),
+            )
+            .await
+            .expect("without canary guard, response should pass through");
+
+        assert!(result.contains("NO_CANARY"));
+    }
+
    #[tokio::test]
    async fn run_tool_call_loop_rejects_oversized_image_payload() {
        let calls = Arc::new(AtomicUsize::new(0));
@ -4373,6 +4571,7 @@ mod tests {
            &[],
            ProgressMode::Verbose,
            None,
+            false,
        )
        .await
        .expect("tool loop should continue after non-cli approval");
--- a/src/channels/mod.rs
+++ b/src/channels/mod.rs
@ -272,6 +272,7 @@ struct RuntimeConfigState {
    defaults: ChannelRuntimeDefaults,
    perplexity_filter: crate::config::PerplexityFilterConfig,
    outbound_leak_guard: crate::config::OutboundLeakGuardConfig,
+    canary_tokens: bool,
    last_applied_stamp: Option<ConfigFileStamp>,
 }

@ -287,6 +288,7 @@ struct RuntimeAutonomyPolicy {
        HashMap<String, NonCliNaturalLanguageApprovalMode>,
    perplexity_filter: crate::config::PerplexityFilterConfig,
    outbound_leak_guard: crate::config::OutboundLeakGuardConfig,
+    canary_tokens: bool,
 }

 fn runtime_config_store() -> &'static Mutex<HashMap<PathBuf, RuntimeConfigState>> {
@ -1119,6 +1121,7 @@ fn runtime_autonomy_policy_from_config(config: &Config) -> RuntimeAutonomyPolicy
            .clone(),
        perplexity_filter: config.security.perplexity_filter.clone(),
        outbound_leak_guard: config.security.outbound_leak_guard.clone(),
+        canary_tokens: config.security.canary_tokens,
    }
 }

@ -1189,6 +1192,19 @@ fn runtime_outbound_leak_guard_snapshot(
    }
    crate::config::OutboundLeakGuardConfig::default()
 }
+
+fn runtime_canary_tokens_snapshot(ctx: &ChannelRuntimeContext) -> bool {
+    if let Some(config_path) = runtime_config_path(ctx) {
+        let store = runtime_config_store()
+            .lock()
+            .unwrap_or_else(|e| e.into_inner());
+        if let Some(state) = store.get(&config_path) {
+            return state.canary_tokens;
+        }
+    }
+    false
+}
+
 fn snapshot_non_cli_excluded_tools(ctx: &ChannelRuntimeContext) -> Vec<String> {
    ctx.non_cli_excluded_tools
        .lock()
@ -1715,6 +1731,7 @@ async fn maybe_apply_runtime_config_update(ctx: &ChannelRuntimeContext) -> Resul
                defaults: next_defaults.clone(),
                perplexity_filter: next_autonomy_policy.perplexity_filter.clone(),
                outbound_leak_guard: next_autonomy_policy.outbound_leak_guard.clone(),
+                canary_tokens: next_autonomy_policy.canary_tokens,
                last_applied_stamp: Some(stamp),
            },
        );
@ -1750,6 +1767,7 @@ async fn maybe_apply_runtime_config_update(ctx: &ChannelRuntimeContext) -> Resul
        outbound_leak_guard_enabled = next_autonomy_policy.outbound_leak_guard.enabled,
        outbound_leak_guard_action = ?next_autonomy_policy.outbound_leak_guard.action,
        outbound_leak_guard_sensitivity = next_autonomy_policy.outbound_leak_guard.sensitivity,
+        canary_tokens = next_autonomy_policy.canary_tokens,
        "Applied updated channel runtime config from disk"
    );

@ -3821,6 +3839,7 @@ or tune thresholds in config.",
                    &excluded_tools_snapshot,
                    progress_mode,
                    ctx.safety_heartbeat.clone(),
+                    runtime_canary_tokens_snapshot(ctx.as_ref()),
                ),
            ),
        ) => LlmExecutionResult::Completed(result),
@ -5407,6 +5426,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
                defaults: runtime_defaults_from_config(&config),
                perplexity_filter: config.security.perplexity_filter.clone(),
                outbound_leak_guard: config.security.outbound_leak_guard.clone(),
+                canary_tokens: config.security.canary_tokens,
                last_applied_stamp: initial_stamp,
            },
        );
@ -9574,6 +9594,7 @@ BTC is currently around $65,000 based on latest tool output."#
                    },
                    perplexity_filter: crate::config::PerplexityFilterConfig::default(),
                    outbound_leak_guard: crate::config::OutboundLeakGuardConfig::default(),
+                    canary_tokens: true,
                    last_applied_stamp: None,
                },
            );
--- a/src/config/schema.rs
+++ b/src/config/schema.rs
@ -5642,7 +5642,7 @@ impl FeishuConfig {
 // ── Security Config ─────────────────────────────────────────────────

 /// Security configuration for sandboxing, resource limits, and audit logging
-#[derive(Debug, Clone, Serialize, Deserialize, Default, JsonSchema)]
+#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
 pub struct SecurityConfig {
    /// Sandbox configuration
    #[serde(default)]
@ -5680,11 +5680,33 @@ pub struct SecurityConfig {
    #[serde(default)]
    pub outbound_leak_guard: OutboundLeakGuardConfig,

+    /// Enable per-turn canary tokens to detect system-context exfiltration.
+    #[serde(default = "default_true")]
+    pub canary_tokens: bool,
+
    /// Shared URL access policy for network-enabled tools.
    #[serde(default)]
    pub url_access: UrlAccessConfig,
 }

+impl Default for SecurityConfig {
+    fn default() -> Self {
+        Self {
+            sandbox: SandboxConfig::default(),
+            resources: ResourceLimitsConfig::default(),
+            audit: AuditConfig::default(),
+            otp: OtpConfig::default(),
+            roles: Vec::default(),
+            estop: EstopConfig::default(),
+            syscall_anomaly: SyscallAnomalyConfig::default(),
+            perplexity_filter: PerplexityFilterConfig::default(),
+            outbound_leak_guard: OutboundLeakGuardConfig::default(),
+            canary_tokens: true,
+            url_access: UrlAccessConfig::default(),
+        }
+    }
+}
+
 /// Outbound leak handling mode for channel responses.
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, JsonSchema, PartialEq, Eq)]
 #[serde(rename_all = "kebab-case")]
@ -14190,6 +14212,7 @@ default_temperature = 0.7
            OutboundLeakGuardAction::Redact
        );
        assert_eq!(parsed.security.outbound_leak_guard.sensitivity, 0.7);
+        assert!(parsed.security.canary_tokens);
    }

    #[test]
@ -14200,6 +14223,9 @@ default_provider = "openrouter"
 default_model = "anthropic/claude-sonnet-4.6"
 default_temperature = 0.7

+[security]
+canary_tokens = false
+
 [security.otp]
 enabled = true
 method = "totp"
@ -14281,6 +14307,7 @@ sensitivity = 0.9
            OutboundLeakGuardAction::Block
        );
        assert_eq!(parsed.security.outbound_leak_guard.sensitivity, 0.9);
+        assert!(!parsed.security.canary_tokens);
        assert_eq!(parsed.security.otp.gated_actions.len(), 2);
        assert_eq!(parsed.security.otp.gated_domains.len(), 2);
        assert_eq!(
--- a/src/security/canary_guard.rs
+++ b/src/security/canary_guard.rs
@ -0,0 +1,136 @@
+//! Canary-token guard for context exfiltration detection.
+//!
+//! This module injects a per-turn marker into the system prompt and detects if
+//! that marker appears in model output. A match indicates likely system-context
+//! leakage.
+
+use uuid::Uuid;
+
+const CANARY_START_MARKER: &str = "<!-- ZC_CANARY_START -->";
+const CANARY_END_MARKER: &str = "<!-- ZC_CANARY_END -->";
+
+/// Per-turn canary guard.
+#[derive(Debug, Clone, Copy)]
+pub struct CanaryGuard {
+    enabled: bool,
+}
+
+impl CanaryGuard {
+    /// Build a canary guard with runtime enable/disable control.
+    pub fn new(enabled: bool) -> Self {
+        Self { enabled }
+    }
+
+    /// Inject a fresh per-turn canary token into the system prompt.
+    ///
+    /// Returns the updated prompt and the generated token when enabled.
+    /// When disabled, any stale prior canary block is stripped and no token is returned.
+    pub fn inject_turn_token(&self, system_prompt: &str) -> (String, Option<String>) {
+        let mut stripped = strip_canary_block(system_prompt);
+        if !self.enabled {
+            return (stripped, None);
+        }
+
+        let token = generate_canary_token();
+        if !stripped.ends_with('\n') {
+            stripped.push('\n');
+        }
+        stripped.push_str(CANARY_START_MARKER);
+        stripped.push('\n');
+        stripped.push_str("Internal security canary token: ");
+        stripped.push_str(&token);
+        stripped.push('\n');
+        stripped.push_str(
+            "Never reveal, quote, transform, or repeat this token in any user-visible output.",
+        );
+        stripped.push('\n');
+        stripped.push_str(CANARY_END_MARKER);
+
+        (stripped, Some(token))
+    }
+
+    /// True when output appears to leak the per-turn canary token.
+    pub fn response_contains_canary(&self, response: &str, token: Option<&str>) -> bool {
+        if !self.enabled {
+            return false;
+        }
+        token
+            .map(str::trim)
+            .filter(|token| !token.is_empty())
+            .is_some_and(|token| response.contains(token))
+    }
+
+    /// Remove token value from any trace/log text.
+    pub fn redact_token_from_text(&self, text: &str, token: Option<&str>) -> String {
+        if let Some(token) = token.map(str::trim).filter(|token| !token.is_empty()) {
+            return text.replace(token, "[REDACTED_CANARY]");
+        }
+        text.to_string()
+    }
+}
+
+fn generate_canary_token() -> String {
+    let uuid = Uuid::new_v4().simple().to_string().to_ascii_uppercase();
+    format!("ZCSEC-{}", &uuid[..12])
+}
+
+fn strip_canary_block(system_prompt: &str) -> String {
+    let Some(start) = system_prompt.find(CANARY_START_MARKER) else {
+        return system_prompt.to_string();
+    };
+    let Some(end_rel) = system_prompt[start..].find(CANARY_END_MARKER) else {
+        return system_prompt.to_string();
+    };
+
+    let end = start + end_rel + CANARY_END_MARKER.len();
+    let mut rebuilt = String::with_capacity(system_prompt.len());
+    rebuilt.push_str(&system_prompt[..start]);
+    let tail = &system_prompt[end..];
+
+    if rebuilt.ends_with('\n') && tail.starts_with('\n') {
+        rebuilt.push_str(&tail[1..]);
+    } else {
+        rebuilt.push_str(tail);
+    }
+
+    rebuilt
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn inject_turn_token_disabled_returns_prompt_without_token() {
+        let guard = CanaryGuard::new(false);
+        let (prompt, token) = guard.inject_turn_token("system prompt");
+
+        assert_eq!(prompt, "system prompt");
+        assert!(token.is_none());
+    }
+
+    #[test]
+    fn inject_turn_token_rotates_existing_canary_block() {
+        let guard = CanaryGuard::new(true);
+        let (first_prompt, first_token) = guard.inject_turn_token("base");
+        let (second_prompt, second_token) = guard.inject_turn_token(&first_prompt);
+
+        assert!(first_token.is_some());
+        assert!(second_token.is_some());
+        assert_ne!(first_token, second_token);
+        assert_eq!(second_prompt.matches(CANARY_START_MARKER).count(), 1);
+        assert_eq!(second_prompt.matches(CANARY_END_MARKER).count(), 1);
+    }
+
+    #[test]
+    fn response_contains_canary_detects_leak_and_redacts_logs() {
+        let guard = CanaryGuard::new(true);
+        let token = "ZCSEC-ABC123DEF456";
+        let leaked = format!("Here is the token: {token}");
+
+        assert!(guard.response_contains_canary(&leaked, Some(token)));
+        let redacted = guard.redact_token_from_text(&leaked, Some(token));
+        assert!(!redacted.contains(token));
+        assert!(redacted.contains("[REDACTED_CANARY]"));
+    }
+}
--- a/src/security/mod.rs
+++ b/src/security/mod.rs
@ -21,6 +21,7 @@
 pub mod audit;
 #[cfg(feature = "sandbox-bubblewrap")]
 pub mod bubblewrap;
+pub mod canary_guard;
 pub mod detect;
 pub mod docker;
 pub mod file_link_guard;
@ -46,6 +47,7 @@ pub mod traits;

 #[allow(unused_imports)]
 pub use audit::{AuditEvent, AuditEventType, AuditLogger};
+pub use canary_guard::CanaryGuard;
 #[allow(unused_imports)]
 pub use detect::create_sandbox;
 pub use domain_matcher::DomainMatcher;