From 2deb91455d9bb719c2da8622724e63fc716b8b52 Mon Sep 17 00:00:00 2001 From: Giulio V Date: Mon, 16 Mar 2026 23:24:47 +0100 Subject: [PATCH] feat(observability): add Hands dashboard metrics and events (#3595) Add HandStarted, HandCompleted, and HandFailed event variants to ObserverEvent, and HandRunDuration, HandFindingsCount, HandSuccessRate metric variants to ObserverMetric. Update all observer backends (log, noop, verbose, prometheus, otel) to handle the new variants with appropriate instrumentation. Prometheus backend registers hand_runs counter, hand_duration histogram, and hand_findings counter. OTel backend creates spans and records metrics for hand runs. --- src/observability/log.rs | 65 ++++++++++++++ src/observability/noop.rs | 35 ++++++++ src/observability/otel.rs | 151 ++++++++++++++++++++++++++++++++ src/observability/prometheus.rs | 142 ++++++++++++++++++++++++++++++ src/observability/traits.rs | 86 ++++++++++++++++++ src/observability/verbose.rs | 18 ++++ 6 files changed, 497 insertions(+) diff --git a/src/observability/log.rs b/src/observability/log.rs index 0b7194fe2..e267f787b 100644 --- a/src/observability/log.rs +++ b/src/observability/log.rs @@ -92,6 +92,23 @@ impl Observer for LogObserver { "llm.response" ); } + ObserverEvent::HandStarted { hand_name } => { + info!(hand = %hand_name, "hand.started"); + } + ObserverEvent::HandCompleted { + hand_name, + duration_ms, + findings_count, + } => { + info!(hand = %hand_name, duration_ms = duration_ms, findings = findings_count, "hand.completed"); + } + ObserverEvent::HandFailed { + hand_name, + error, + duration_ms, + } => { + info!(hand = %hand_name, error = %error, duration_ms = duration_ms, "hand.failed"); + } } } @@ -110,6 +127,19 @@ impl Observer for LogObserver { ObserverMetric::QueueDepth(d) => { info!(depth = d, "metric.queue_depth"); } + ObserverMetric::HandRunDuration { + hand_name, + duration, + } => { + let ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX); + info!(hand = %hand_name, duration_ms = ms, "metric.hand_run_duration"); + } + ObserverMetric::HandFindingsCount { hand_name, count } => { + info!(hand = %hand_name, count = count, "metric.hand_findings_count"); + } + ObserverMetric::HandSuccessRate { hand_name, success } => { + info!(hand = %hand_name, success = success, "metric.hand_success_rate"); + } } } @@ -196,4 +226,39 @@ mod tests { obs.record_metric(&ObserverMetric::ActiveSessions(1)); obs.record_metric(&ObserverMetric::QueueDepth(999)); } + + #[test] + fn log_observer_hand_events_no_panic() { + let obs = LogObserver::new(); + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } + + #[test] + fn log_observer_hand_metrics_no_panic() { + let obs = LogObserver::new(); + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + } } diff --git a/src/observability/noop.rs b/src/observability/noop.rs index 89419ca2f..9a23584de 100644 --- a/src/observability/noop.rs +++ b/src/observability/noop.rs @@ -80,4 +80,39 @@ mod tests { fn noop_flush_does_not_panic() { NoopObserver.flush(); } + + #[test] + fn noop_hand_events_do_not_panic() { + let obs = NoopObserver; + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } + + #[test] + fn noop_hand_metrics_do_not_panic() { + let obs = NoopObserver; + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + } } diff --git a/src/observability/otel.rs b/src/observability/otel.rs index 07faa977b..0dabc12d6 100644 --- a/src/observability/otel.rs +++ b/src/observability/otel.rs @@ -27,6 +27,9 @@ pub struct OtelObserver { tokens_used: Counter, active_sessions: Gauge, queue_depth: Gauge, + hand_runs: Counter, + hand_duration: Histogram, + hand_findings: Counter, } impl OtelObserver { @@ -152,6 +155,22 @@ impl OtelObserver { .with_description("Current message queue depth") .build(); + let hand_runs = meter + .u64_counter("zeroclaw.hand.runs") + .with_description("Total hand runs") + .build(); + + let hand_duration = meter + .f64_histogram("zeroclaw.hand.duration") + .with_description("Hand run duration in seconds") + .with_unit("s") + .build(); + + let hand_findings = meter + .u64_counter("zeroclaw.hand.findings") + .with_description("Total findings produced by hand runs") + .build(); + Ok(Self { tracer_provider, meter_provider: meter_provider_clone, @@ -168,6 +187,9 @@ impl OtelObserver { tokens_used, active_sessions, queue_depth, + hand_runs, + hand_duration, + hand_findings, }) } } @@ -335,6 +357,77 @@ impl Observer for OtelObserver { self.errors .add(1, &[KeyValue::new("component", component.clone())]); } + ObserverEvent::HandStarted { .. } => {} + ObserverEvent::HandCompleted { + hand_name, + duration_ms, + findings_count, + } => { + let secs = *duration_ms as f64 / 1000.0; + let duration = std::time::Duration::from_millis(*duration_ms); + let start_time = SystemTime::now() + .checked_sub(duration) + .unwrap_or(SystemTime::now()); + + let mut span = tracer.build( + opentelemetry::trace::SpanBuilder::from_name("hand.run") + .with_kind(SpanKind::Internal) + .with_start_time(start_time) + .with_attributes(vec![ + KeyValue::new("hand.name", hand_name.clone()), + KeyValue::new("hand.success", true), + KeyValue::new("hand.findings", *findings_count as i64), + KeyValue::new("duration_s", secs), + ]), + ); + span.set_status(Status::Ok); + span.end(); + + let attrs = [ + KeyValue::new("hand", hand_name.clone()), + KeyValue::new("success", "true"), + ]; + self.hand_runs.add(1, &attrs); + self.hand_duration + .record(secs, &[KeyValue::new("hand", hand_name.clone())]); + self.hand_findings.add( + *findings_count as u64, + &[KeyValue::new("hand", hand_name.clone())], + ); + } + ObserverEvent::HandFailed { + hand_name, + error, + duration_ms, + } => { + let secs = *duration_ms as f64 / 1000.0; + let duration = std::time::Duration::from_millis(*duration_ms); + let start_time = SystemTime::now() + .checked_sub(duration) + .unwrap_or(SystemTime::now()); + + let mut span = tracer.build( + opentelemetry::trace::SpanBuilder::from_name("hand.run") + .with_kind(SpanKind::Internal) + .with_start_time(start_time) + .with_attributes(vec![ + KeyValue::new("hand.name", hand_name.clone()), + KeyValue::new("hand.success", false), + KeyValue::new("error.message", error.clone()), + KeyValue::new("duration_s", secs), + ]), + ); + span.set_status(Status::error(error.clone())); + span.end(); + + let attrs = [ + KeyValue::new("hand", hand_name.clone()), + KeyValue::new("success", "false"), + ]; + self.hand_runs.add(1, &attrs); + self.hand_duration + .record(secs, &[KeyValue::new("hand", hand_name.clone())]); + } } } @@ -352,6 +445,29 @@ impl Observer for OtelObserver { ObserverMetric::QueueDepth(d) => { self.queue_depth.record(*d as u64, &[]); } + ObserverMetric::HandRunDuration { + hand_name, + duration, + } => { + self.hand_duration.record( + duration.as_secs_f64(), + &[KeyValue::new("hand", hand_name.clone())], + ); + } + ObserverMetric::HandFindingsCount { hand_name, count } => { + self.hand_findings + .add(*count, &[KeyValue::new("hand", hand_name.clone())]); + } + ObserverMetric::HandSuccessRate { hand_name, success } => { + let success_str = if *success { "true" } else { "false" }; + self.hand_runs.add( + 1, + &[ + KeyValue::new("hand", hand_name.clone()), + KeyValue::new("success", success_str), + ], + ); + } } } @@ -519,6 +635,41 @@ mod tests { obs.record_metric(&ObserverMetric::QueueDepth(0)); } + #[test] + fn otel_hand_events_do_not_panic() { + let obs = test_observer(); + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } + + #[test] + fn otel_hand_metrics_do_not_panic() { + let obs = test_observer(); + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + } + #[test] fn otel_observer_creation_with_valid_endpoint_succeeds() { // Even though endpoint is unreachable, creation should succeed diff --git a/src/observability/prometheus.rs b/src/observability/prometheus.rs index 444651913..c2076c451 100644 --- a/src/observability/prometheus.rs +++ b/src/observability/prometheus.rs @@ -29,6 +29,11 @@ pub struct PrometheusObserver { tokens_used: prometheus::IntGauge, active_sessions: GaugeVec, queue_depth: GaugeVec, + + // Hands + hand_runs: IntCounterVec, + hand_duration: HistogramVec, + hand_findings: IntCounterVec, } impl PrometheusObserver { @@ -152,6 +157,31 @@ impl PrometheusObserver { ) .expect("valid metric"); + let hand_runs = IntCounterVec::new( + prometheus::Opts::new("zeroclaw_hand_runs_total", "Total hand runs by outcome"), + &["hand", "success"], + ) + .expect("valid metric"); + + let hand_duration = HistogramVec::new( + HistogramOpts::new( + "zeroclaw_hand_duration_seconds", + "Hand run duration in seconds", + ) + .buckets(vec![0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0]), + &["hand"], + ) + .expect("valid metric"); + + let hand_findings = IntCounterVec::new( + prometheus::Opts::new( + "zeroclaw_hand_findings_total", + "Total findings produced by hand runs", + ), + &["hand"], + ) + .expect("valid metric"); + // Register all metrics registry.register(Box::new(agent_starts.clone())).ok(); registry.register(Box::new(llm_requests.clone())).ok(); @@ -172,6 +202,9 @@ impl PrometheusObserver { registry.register(Box::new(tokens_used.clone())).ok(); registry.register(Box::new(active_sessions.clone())).ok(); registry.register(Box::new(queue_depth.clone())).ok(); + registry.register(Box::new(hand_runs.clone())).ok(); + registry.register(Box::new(hand_duration.clone())).ok(); + registry.register(Box::new(hand_findings.clone())).ok(); Self { registry, @@ -192,6 +225,9 @@ impl PrometheusObserver { tokens_used, active_sessions, queue_depth, + hand_runs, + hand_duration, + hand_findings, } } @@ -293,6 +329,38 @@ impl Observer for PrometheusObserver { } => { self.errors.with_label_values(&[component]).inc(); } + ObserverEvent::HandStarted { hand_name } => { + self.hand_runs + .with_label_values(&[hand_name.as_str(), "true"]) + .inc_by(0); // touch the series so it appears in output + } + ObserverEvent::HandCompleted { + hand_name, + duration_ms, + findings_count, + } => { + self.hand_runs + .with_label_values(&[hand_name.as_str(), "true"]) + .inc(); + self.hand_duration + .with_label_values(&[hand_name.as_str()]) + .observe(*duration_ms as f64 / 1000.0); + self.hand_findings + .with_label_values(&[hand_name.as_str()]) + .inc_by(*findings_count as u64); + } + ObserverEvent::HandFailed { + hand_name, + duration_ms, + .. + } => { + self.hand_runs + .with_label_values(&[hand_name.as_str(), "false"]) + .inc(); + self.hand_duration + .with_label_values(&[hand_name.as_str()]) + .observe(*duration_ms as f64 / 1000.0); + } } } @@ -314,6 +382,25 @@ impl Observer for PrometheusObserver { .with_label_values(&[] as &[&str]) .set(*d as f64); } + ObserverMetric::HandRunDuration { + hand_name, + duration, + } => { + self.hand_duration + .with_label_values(&[hand_name.as_str()]) + .observe(duration.as_secs_f64()); + } + ObserverMetric::HandFindingsCount { hand_name, count } => { + self.hand_findings + .with_label_values(&[hand_name.as_str()]) + .inc_by(*count); + } + ObserverMetric::HandSuccessRate { hand_name, success } => { + let success_str = if *success { "true" } else { "false" }; + self.hand_runs + .with_label_values(&[hand_name.as_str(), success_str]) + .inc(); + } } } @@ -513,6 +600,61 @@ mod tests { )); } + #[test] + fn hand_events_track_runs_and_duration() { + let obs = PrometheusObserver::new(); + + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 2000, + findings_count: 1, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + + let output = obs.encode(); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="review",success="true"} 2"#)); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="review",success="false"} 1"#)); + assert!(output.contains(r#"zeroclaw_hand_findings_total{hand="review"} 4"#)); + assert!(output.contains("zeroclaw_hand_duration_seconds")); + } + + #[test] + fn hand_metrics_record_duration_and_findings() { + let obs = PrometheusObserver::new(); + + obs.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "scan".into(), + duration: Duration::from_millis(800), + }); + obs.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "scan".into(), + count: 5, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "scan".into(), + success: true, + }); + obs.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "scan".into(), + success: false, + }); + + let output = obs.encode(); + assert!(output.contains("zeroclaw_hand_duration_seconds")); + assert!(output.contains(r#"zeroclaw_hand_findings_total{hand="scan"} 5"#)); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="scan",success="true"} 1"#)); + assert!(output.contains(r#"zeroclaw_hand_runs_total{hand="scan",success="false"} 1"#)); + } + #[test] fn llm_response_without_tokens_increments_request_only() { let obs = PrometheusObserver::new(); diff --git a/src/observability/traits.rs b/src/observability/traits.rs index 3b68feafc..7d2443535 100644 --- a/src/observability/traits.rs +++ b/src/observability/traits.rs @@ -80,6 +80,20 @@ pub enum ObserverEvent { /// Human-readable error description. Must not contain secrets or tokens. message: String, }, + /// A hand has started execution. + HandStarted { hand_name: String }, + /// A hand has completed execution successfully. + HandCompleted { + hand_name: String, + duration_ms: u64, + findings_count: usize, + }, + /// A hand has failed during execution. + HandFailed { + hand_name: String, + error: String, + duration_ms: u64, + }, } /// Numeric metrics emitted by the agent runtime. @@ -96,6 +110,15 @@ pub enum ObserverMetric { ActiveSessions(u64), /// Current depth of the inbound message queue. QueueDepth(u64), + /// Duration of a single hand run. + HandRunDuration { + hand_name: String, + duration: Duration, + }, + /// Number of findings produced by a hand run. + HandFindingsCount { hand_name: String, count: u64 }, + /// Records a hand run outcome for success-rate tracking. + HandSuccessRate { hand_name: String, success: bool }, } /// Core observability trait for recording agent runtime telemetry. @@ -212,4 +235,67 @@ mod tests { assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. })); assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_))); } + + #[test] + fn hand_events_recordable() { + let observer = DummyObserver::default(); + + observer.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + observer.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + observer.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + + assert_eq!(*observer.events.lock(), 3); + } + + #[test] + fn hand_metrics_recordable() { + let observer = DummyObserver::default(); + + observer.record_metric(&ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(1500), + }); + observer.record_metric(&ObserverMetric::HandFindingsCount { + hand_name: "review".into(), + count: 3, + }); + observer.record_metric(&ObserverMetric::HandSuccessRate { + hand_name: "review".into(), + success: true, + }); + + assert_eq!(*observer.metrics.lock(), 3); + } + + #[test] + fn hand_event_and_metric_are_cloneable() { + let event = ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 500, + findings_count: 2, + }; + let metric = ObserverMetric::HandRunDuration { + hand_name: "review".into(), + duration: Duration::from_millis(500), + }; + + let cloned_event = event.clone(); + let cloned_metric = metric.clone(); + + assert!(matches!(cloned_event, ObserverEvent::HandCompleted { .. })); + assert!(matches!( + cloned_metric, + ObserverMetric::HandRunDuration { .. } + )); + } } diff --git a/src/observability/verbose.rs b/src/observability/verbose.rs index 12271c0f8..e15d2fd1b 100644 --- a/src/observability/verbose.rs +++ b/src/observability/verbose.rs @@ -101,4 +101,22 @@ mod tests { }); obs.record_event(&ObserverEvent::TurnComplete); } + + #[test] + fn verbose_hand_events_do_not_panic() { + let obs = VerboseObserver::new(); + obs.record_event(&ObserverEvent::HandStarted { + hand_name: "review".into(), + }); + obs.record_event(&ObserverEvent::HandCompleted { + hand_name: "review".into(), + duration_ms: 1500, + findings_count: 3, + }); + obs.record_event(&ObserverEvent::HandFailed { + hand_name: "review".into(), + error: "timeout".into(), + duration_ms: 5000, + }); + } }