diff --git a/src/observability/log.rs b/src/observability/log.rs index 0b7194fe2..e267f787b 100644 --- a/src/observability/log.rs +++ b/src/observability/log.rs @@ -92,6 +92,23 @@ impl Observer for LogObserver { "llm.response" ); } + ObserverEvent::HandStarted { hand_name } => { + info!(hand = %hand_name, "hand.started"); + } + ObserverEvent::HandCompleted { + hand_name, + duration_ms, + findings_count, + } => { + info!(hand = %hand_name, duration_ms = duration_ms, findings = findings_count, "hand.completed"); + } + ObserverEvent::HandFailed { + hand_name, + error, + duration_ms, + } => { + info!(hand = %hand_name, error = %error, duration_ms = duration_ms, "hand.failed"); + } } } @@ -110,6 +127,19 @@ impl Observer for LogObserver { ObserverMetric::QueueDepth(d) => { info!(depth = d, "metric.queue_depth"); } + ObserverMetric::HandRunDuration { + hand_name, + duration, + } => { + let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX); + info!(hand = %hand_name, duration_ms = ms, "metric.hand_run_duration"); + } + ObserverMetric::HandFindingsCount { hand_name, count } => { + info!(hand = %hand_name, count = count, "metric.hand_findings_count"); + } + ObserverMetric::HandSuccessRate { hand_name, success } => { + info!(hand = %hand_name, success = success, "metric.hand_success_rate"); + } } } @@ -196,4 +226,39 @@ mod tests { obs.record_metric(&ObserverMetric::ActiveSessions(1)); obs.record_metric(&ObserverMetric::QueueDepth(999)); } + + #[test] + fn log_observer_hand_events_no_panic() { + let obs = LogObserver::new(); + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } + + #[test] + fn log_observer_hand_metrics_no_panic() { + let obs = LogObserver::new(); + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + } } diff --git a/src/observability/noop.rs b/src/observability/noop.rs index 89419ca2f..9a23584de 100644 --- a/src/observability/noop.rs +++ b/src/observability/noop.rs @@ -80,4 +80,39 @@ mod tests { fn noop_flush_does_not_panic() { NoopObserver.flush(); } + + #[test] + fn noop_hand_events_do_not_panic() { + let obs = NoopObserver; + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } + + #[test] + fn noop_hand_metrics_do_not_panic() { + let obs = NoopObserver; + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + } } diff --git a/src/observability/otel.rs b/src/observability/otel.rs index 07faa977b..0dabc12d6 100644 --- a/src/observability/otel.rs +++ b/src/observability/otel.rs @@ -27,6 +27,9 @@ pub struct OtelObserver { tokens_used: Counter, active_sessions: Gauge, queue_depth: Gauge, + hand_runs: Counter, + hand_duration: Histogram, + hand_findings: Counter, } impl OtelObserver { @@ -152,6 +155,22 @@ impl OtelObserver { .with_description("Current message queue depth") .build(); + let hand_runs = meter + .u64_counter("zeroclaw.hand.runs") + .with_description("Total hand runs") + .build(); + + let hand_duration = meter + .f64_histogram("zeroclaw.hand.duration") + .with_description("Hand run duration in seconds") + .with_unit("s") + .build(); + + let hand_findings = meter + .u64_counter("zeroclaw.hand.findings") + .with_description("Total findings produced by hand runs") + .build(); + Ok(Self { tracer_provider, meter_provider: meter_provider_clone, @@ -168,6 +187,9 @@ impl OtelObserver { tokens_used, active_sessions, queue_depth, + hand_runs, + hand_duration, + hand_findings, }) } } @@ -335,6 +357,77 @@ impl Observer for OtelObserver { self.errors .add(1, &[KeyValue::new("component", component.clone())]); } + ObserverEvent::HandStarted { .. } => {} + ObserverEvent::HandCompleted { + hand_name, + duration_ms, + findings_count, + } => { + let secs = *duration_ms as f64 / 1000.0; + let duration = std::time::Duration::from_millis(*duration_ms); + let start_time = SystemTime::now() + .checked_sub(duration) + .unwrap_or(SystemTime::now()); + + let mut span = tracer.build( + opentelemetry::trace::SpanBuilder::from_name("hand.run") + .with_kind(SpanKind::Internal) + .with_start_time(start_time) + .with_attributes(vec![ + KeyValue::new("hand.name", hand_name.clone()), + KeyValue::new("hand.success", true), + KeyValue::new("hand.findings", *findings_count as i64), + KeyValue::new("duration_s", secs), + ]), + ); + span.set_status(Status::Ok); + span.end(); + + let attrs = [ + KeyValue::new("hand", hand_name.clone()), + KeyValue::new("success", "true"), + ]; + self.hand_runs.add(1, &attrs); + self.hand_duration + .record(secs, &[KeyValue::new("hand", hand_name.clone())]); + self.hand_findings.add( + *findings_count as u64, + &[KeyValue::new("hand", hand_name.clone())], + ); + } + ObserverEvent::HandFailed { + hand_name, + error, + duration_ms, + } => { + let secs = *duration_ms as f64 / 1000.0; + let duration = std::time::Duration::from_millis(*duration_ms); + let start_time = SystemTime::now() + .checked_sub(duration) + .unwrap_or(SystemTime::now()); + + let mut span = tracer.build( + opentelemetry::trace::SpanBuilder::from_name("hand.run") + .with_kind(SpanKind::Internal) + .with_start_time(start_time) + .with_attributes(vec![ + KeyValue::new("hand.name", hand_name.clone()), + KeyValue::new("hand.success", false), + KeyValue::new("error.message", error.clone()), + KeyValue::new("duration_s", secs), + ]), + ); + span.set_status(Status::error(error.clone())); + span.end(); + + let attrs = [ + KeyValue::new("hand", hand_name.clone()), + KeyValue::new("success", "false"), + ]; + self.hand_runs.add(1, &attrs); + self.hand_duration + .record(secs, &[KeyValue::new("hand", hand_name.clone())]); + } } } @@ -352,6 +445,29 @@ impl Observer for OtelObserver { ObserverMetric::QueueDepth(d) => { self.queue_depth.record(*d as u64, &[]); } + ObserverMetric::HandRunDuration { + hand_name, + duration, + } => { + self.hand_duration.record( + duration.as_secs_f64(), + &[KeyValue::new("hand", hand_name.clone())], + ); + } + ObserverMetric::HandFindingsCount { hand_name, count } => { + self.hand_findings + .add(*count, &[KeyValue::new("hand", hand_name.clone())]); + } + ObserverMetric::HandSuccessRate { hand_name, success } => { + let success_str = if *success { "true" } else { "false" }; + self.hand_runs.add( + 1, + &[ + KeyValue::new("hand", hand_name.clone()), + KeyValue::new("success", success_str), + ], + ); + } } } @@ -519,6 +635,41 @@ mod tests { obs.record_metric(&ObserverMetric::QueueDepth(0)); } + #[test] + fn otel_hand_events_do_not_panic() { + let obs = test_observer(); + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } + + #[test] + fn otel_hand_metrics_do_not_panic() { + let obs = test_observer(); + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + } + #[test] fn otel_observer_creation_with_valid_endpoint_succeeds() { // Even though endpoint is unreachable, creation should succeed diff --git a/src/observability/prometheus.rs b/src/observability/prometheus.rs index 444651913..c2076c451 100644 --- a/src/observability/prometheus.rs +++ b/src/observability/prometheus.rs @@ -29,6 +29,11 @@ pub struct PrometheusObserver { tokens_used: prometheus::IntGauge, active_sessions: GaugeVec, queue_depth: GaugeVec, + + // Hands + hand_runs: IntCounterVec, + hand_duration: HistogramVec, + hand_findings: IntCounterVec, } impl PrometheusObserver { @@ -152,6 +157,31 @@ impl PrometheusObserver { ) .expect("valid metric"); + let hand_runs = IntCounterVec::new( + prometheus::Opts::new("zeroclaw_hand_runs_total", "Total hand runs by outcome"), + &["hand", "success"], + ) + .expect("valid metric"); + + let hand_duration = HistogramVec::new( + HistogramOpts::new( + "zeroclaw_hand_duration_seconds", + "Hand run duration in seconds", + ) + .buckets(vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]), + &["hand"], + ) + .expect("valid metric"); + + let hand_findings = IntCounterVec::new( + prometheus::Opts::new( + "zeroclaw_hand_findings_total", + "Total findings produced by hand runs", + ), + &["hand"], + ) + .expect("valid metric"); + // Register all metrics registry.register(Box::new(agent_starts.clone())).ok(); registry.register(Box::new(llm_requests.clone())).ok(); @@ -172,6 +202,9 @@ impl PrometheusObserver { registry.register(Box::new(tokens_used.clone())).ok(); registry.register(Box::new(active_sessions.clone())).ok(); registry.register(Box::new(queue_depth.clone())).ok(); + registry.register(Box::new(hand_runs.clone())).ok(); + registry.register(Box::new(hand_duration.clone())).ok(); + registry.register(Box::new(hand_findings.clone())).ok(); Self { registry, @@ -192,6 +225,9 @@ impl PrometheusObserver { tokens_used, active_sessions, queue_depth, + hand_runs, + hand_duration, + hand_findings, } } @@ -293,6 +329,38 @@ impl Observer for PrometheusObserver { } => { self.errors.with_label_values(&[component]).inc(); } + ObserverEvent::HandStarted { hand_name } => { + self.hand_runs + .with_label_values(&[hand_name.as_str(), "true"]) + .inc_by(0); // touch the series so it appears in output + } + ObserverEvent::HandCompleted { + hand_name, + duration_ms, + findings_count, + } => { + self.hand_runs + .with_label_values(&[hand_name.as_str(), "true"]) + .inc(); + self.hand_duration + .with_label_values(&[hand_name.as_str()]) + .observe(*duration_ms as f64 / 1000.0); + self.hand_findings + .with_label_values(&[hand_name.as_str()]) + .inc_by(*findings_count as u64); + } + ObserverEvent::HandFailed { + hand_name, + duration_ms, + .. + } => { + self.hand_runs + .with_label_values(&[hand_name.as_str(), "false"]) + .inc(); + self.hand_duration + .with_label_values(&[hand_name.as_str()]) + .observe(*duration_ms as f64 / 1000.0); + } } } @@ -314,6 +382,25 @@ impl Observer for PrometheusObserver { .with_label_values(&[] as &[&str]) .set(*d as f64); } + ObserverMetric::HandRunDuration { + hand_name, + duration, + } => { + self.hand_duration + .with_label_values(&[hand_name.as_str()]) + .observe(duration.as_secs_f64()); + } + ObserverMetric::HandFindingsCount { hand_name, count } => { + self.hand_findings + .with_label_values(&[hand_name.as_str()]) + .inc_by(*count); + } + ObserverMetric::HandSuccessRate { hand_name, success } => { + let success_str = if *success { "true" } else { "false" }; + self.hand_runs + .with_label_values(&[hand_name.as_str(), success_str]) + .inc(); + } } } @@ -513,6 +600,61 @@ mod tests { )); } + #[test] + fn hand_events_track_runs_and_duration() { + let obs = PrometheusObserver::new(); + + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 2000, + findings_count: 1, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + + let output = obs.encode(); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="review",success="true"} 2"#)); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="review",success="false"} 1"#)); + assert!(output.contains(r#"zeroclaw_hand_findings_total{hand="review"} 4"#)); + assert!(output.contains("zeroclaw_hand_duration_seconds")); + } + + #[test] + fn hand_metrics_record_duration_and_findings() { + let obs = PrometheusObserver::new(); + + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "scan".into(), + duration: Duration::from_millis(800), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "scan".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "scan".into(), + success: true, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "scan".into(), + success: false, + }); + + let output = obs.encode(); + assert!(output.contains("zeroclaw_hand_duration_seconds")); + assert!(output.contains(r#"zeroclaw_hand_findings_total{hand="scan"} 5"#)); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="scan",success="true"} 1"#)); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="scan",success="false"} 1"#)); + } + #[test] fn llm_response_without_tokens_increments_request_only() { let obs = PrometheusObserver::new(); diff --git a/src/observability/traits.rs b/src/observability/traits.rs index 3b68feafc..7d2443535 100644 --- a/src/observability/traits.rs +++ b/src/observability/traits.rs @@ -80,6 +80,20 @@ pub enum ObserverEvent { /// Human-readable error description. Must not contain secrets or tokens. message: String, }, + /// A hand has started execution. + HandStarted { hand_name: String }, + /// A hand has completed execution successfully. + HandCompleted { + hand_name: String, + duration_ms: u64, + findings_count: usize, + }, + /// A hand has failed during execution. + HandFailed { + hand_name: String, + error: String, + duration_ms: u64, + }, } /// Numeric metrics emitted by the agent runtime. @@ -96,6 +110,15 @@ pub enum ObserverMetric { ActiveSessions(u64), /// Current depth of the inbound message queue. QueueDepth(u64), + /// Duration of a single hand run. + HandRunDuration { + hand_name: String, + duration: Duration, + }, + /// Number of findings produced by a hand run. + HandFindingsCount { hand_name: String, count: u64 }, + /// Records a hand run outcome for success-rate tracking. + HandSuccessRate { hand_name: String, success: bool }, } /// Core observability trait for recording agent runtime telemetry. @@ -212,4 +235,67 @@ mod tests { assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. })); assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_))); } + + #[test] + fn hand_events_recordable() { + let observer = DummyObserver::default(); + + observer.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + observer.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + observer.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + + assert_eq!(*observer.events.lock(), 3); + } + + #[test] + fn hand_metrics_recordable() { + let observer = DummyObserver::default(); + + observer.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + observer.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 3, + }); + observer.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + + assert_eq!(*observer.metrics.lock(), 3); + } + + #[test] + fn hand_event_and_metric_are_cloneable() { + let event = ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 500, + findings_count: 2, + }; + let metric = ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(500), + }; + + let cloned_event = event.clone(); + let cloned_metric = metric.clone(); + + assert!(matches!(cloned_event, ObserverEvent::HandCompleted { .. })); + assert!(matches!( + cloned_metric, + ObserverMetric::HandRunDuration { .. } + )); + } } diff --git a/src/observability/verbose.rs b/src/observability/verbose.rs index 12271c0f8..e15d2fd1b 100644 --- a/src/observability/verbose.rs +++ b/src/observability/verbose.rs @@ -101,4 +101,22 @@ mod tests { }); obs.record_event(&ObserverEvent::TurnComplete); } + + #[test] + fn verbose_hand_events_do_not_panic() { + let obs = VerboseObserver::new(); + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } }