bindy/
metrics.rs

1// Copyright (c) 2025 Erick Bourgeois, firestoned
2// SPDX-License-Identifier: MIT
3
4//! Prometheus metrics for the Bindy DNS operator.
5//!
6//! This module provides comprehensive metrics collection with the namespace prefix
7//! `bindy_firestoned_io_` (prometheus-safe version of "bindy.firestoned.io").
8//!
9//! # Metrics Categories
10//!
11//! - **Reconciliation Metrics** - Track reconciliation operations and their outcomes
12//! - **Resource Lifecycle Metrics** - Track resource creation, updates, and deletions
13//! - **Error Metrics** - Track error conditions and types
14//! - **Leader Election Metrics** - Track leadership state changes
15//! - **Performance Metrics** - Track duration and latency
16//!
17//! # Example
18//!
19//! ```rust,no_run
20//! use bindy::metrics::{METRICS_REGISTRY, record_reconciliation_success};
21//!
22//! // Record a successful reconciliation
23//! record_reconciliation_success("DNSZone", std::time::Duration::from_secs(1));
24//! ```
25
26use prometheus::{
27    CounterVec, Encoder, GaugeVec, HistogramOpts, HistogramVec, Opts, Registry, TextEncoder,
28};
29use std::sync::LazyLock;
30use std::time::Duration;
31
32// ============================================================================
33// Metric Name Constants
34// ============================================================================
35
/// Namespace prefix for all Bindy metrics (prometheus-safe)
///
/// Every metric defined in this module builds its full name as
/// `{METRICS_NAMESPACE}_<metric_name>`, so changing this value renames all of them.
const METRICS_NAMESPACE: &str = "bindy_firestoned_io";
38
39// ============================================================================
40// Global Metrics Registry
41// ============================================================================
42
/// Global Prometheus metrics registry
///
/// All metrics are registered in this registry and exposed via `/metrics` endpoint.
///
/// The registry itself is created lazily on first access. Each metric static in
/// this module registers itself here from inside its own lazy initializer, so a
/// metric is only present in the registry after that static has been accessed
/// at least once. [`gather_metrics`] encodes the registry's current contents.
pub static METRICS_REGISTRY: LazyLock<Registry> = LazyLock::new(Registry::new);
47
48// ============================================================================
49// Reconciliation Metrics
50// ============================================================================
51
52/// Total number of reconciliations by resource type and status
53///
54/// Labels:
55/// - `resource_type`: Kind of resource (e.g., `DNSZone`, `ARecord`)
56/// - `status`: Outcome (`success`, `error`, `requeue`)
57pub static RECONCILIATION_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
58    let opts = Opts::new(
59        format!("{METRICS_NAMESPACE}_reconciliations_total"),
60        "Total number of reconciliations by resource type and status",
61    );
62    let counter = CounterVec::new(opts, &["resource_type", "status"]).unwrap();
63    METRICS_REGISTRY
64        .register(Box::new(counter.clone()))
65        .unwrap();
66    counter
67});
68
69/// Duration of reconciliations in seconds
70///
71/// Labels:
72/// - `resource_type`: Kind of resource (e.g., `DNSZone`, `ARecord`)
73pub static RECONCILIATION_DURATION_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
74    let opts = HistogramOpts::new(
75        format!("{METRICS_NAMESPACE}_reconciliation_duration_seconds"),
76        "Duration of reconciliations in seconds by resource type",
77    )
78    .buckets(vec![0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0]);
79    let histogram = HistogramVec::new(opts, &["resource_type"]).unwrap();
80    METRICS_REGISTRY
81        .register(Box::new(histogram.clone()))
82        .unwrap();
83    histogram
84});
85
86/// Total number of requeue operations
87///
88/// Labels:
89/// - `resource_type`: Kind of resource
90/// - `reason`: Reason for requeue (`error`, `rate_limit`, `dependency_wait`)
91pub static REQUEUE_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
92    let opts = Opts::new(
93        format!("{METRICS_NAMESPACE}_requeues_total"),
94        "Total number of requeue operations by resource type and reason",
95    );
96    let counter = CounterVec::new(opts, &["resource_type", "reason"]).unwrap();
97    METRICS_REGISTRY
98        .register(Box::new(counter.clone()))
99        .unwrap();
100    counter
101});
102
103// ============================================================================
104// Resource Lifecycle Metrics
105// ============================================================================
106
107/// Total number of resources created
108///
109/// Labels:
110/// - `resource_type`: Kind of resource created
111pub static RESOURCES_CREATED_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
112    let opts = Opts::new(
113        format!("{METRICS_NAMESPACE}_resources_created_total"),
114        "Total number of resources created by type",
115    );
116    let counter = CounterVec::new(opts, &["resource_type"]).unwrap();
117    METRICS_REGISTRY
118        .register(Box::new(counter.clone()))
119        .unwrap();
120    counter
121});
122
123/// Total number of resources updated
124///
125/// Labels:
126/// - `resource_type`: Kind of resource updated
127pub static RESOURCES_UPDATED_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
128    let opts = Opts::new(
129        format!("{METRICS_NAMESPACE}_resources_updated_total"),
130        "Total number of resources updated by type",
131    );
132    let counter = CounterVec::new(opts, &["resource_type"]).unwrap();
133    METRICS_REGISTRY
134        .register(Box::new(counter.clone()))
135        .unwrap();
136    counter
137});
138
139/// Total number of resources deleted
140///
141/// Labels:
142/// - `resource_type`: Kind of resource deleted
143pub static RESOURCES_DELETED_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
144    let opts = Opts::new(
145        format!("{METRICS_NAMESPACE}_resources_deleted_total"),
146        "Total number of resources deleted by type",
147    );
148    let counter = CounterVec::new(opts, &["resource_type"]).unwrap();
149    METRICS_REGISTRY
150        .register(Box::new(counter.clone()))
151        .unwrap();
152    counter
153});
154
155/// Number of currently active resources being tracked
156///
157/// Labels:
158/// - `resource_type`: Kind of resource
159pub static RESOURCES_ACTIVE: LazyLock<GaugeVec> = LazyLock::new(|| {
160    let opts = Opts::new(
161        format!("{METRICS_NAMESPACE}_resources_active"),
162        "Number of currently active resources by type",
163    );
164    let gauge = GaugeVec::new(opts, &["resource_type"]).unwrap();
165    METRICS_REGISTRY.register(Box::new(gauge.clone())).unwrap();
166    gauge
167});
168
169// ============================================================================
170// Error Metrics
171// ============================================================================
172
173/// Total number of errors by resource type and error category
174///
175/// Labels:
176/// - `resource_type`: Kind of resource
177/// - `error_type`: Category of error (`api_error`, `validation_error`, `network_error`, `timeout`)
178pub static ERRORS_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
179    let opts = Opts::new(
180        format!("{METRICS_NAMESPACE}_errors_total"),
181        "Total number of errors by resource type and error category",
182    );
183    let counter = CounterVec::new(opts, &["resource_type", "error_type"]).unwrap();
184    METRICS_REGISTRY
185        .register(Box::new(counter.clone()))
186        .unwrap();
187    counter
188});
189
190// ============================================================================
191// Leader Election Metrics
192// ============================================================================
193
194/// Total number of leader election events
195///
196/// Labels:
197/// - `status`: Event type (`acquired`, `lost`, `renewed`)
198pub static LEADER_ELECTIONS_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
199    let opts = Opts::new(
200        format!("{METRICS_NAMESPACE}_leader_elections_total"),
201        "Total number of leader election events by status",
202    );
203    let counter = CounterVec::new(opts, &["status"]).unwrap();
204    METRICS_REGISTRY
205        .register(Box::new(counter.clone()))
206        .unwrap();
207    counter
208});
209
210/// Current leader election status
211///
212/// Labels:
213/// - `pod_name`: Name of the pod
214///
215/// Value: 1 if leader, 0 if follower
216pub static LEADER_STATUS: LazyLock<GaugeVec> = LazyLock::new(|| {
217    let opts = Opts::new(
218        format!("{METRICS_NAMESPACE}_leader_status"),
219        "Current leader election status (1 = leader, 0 = follower)",
220    );
221    let gauge = GaugeVec::new(opts, &["pod_name"]).unwrap();
222    METRICS_REGISTRY.register(Box::new(gauge.clone())).unwrap();
223    gauge
224});
225
226// ============================================================================
227// Performance Metrics
228// ============================================================================
229
230/// Lag between resource generation change and observation
231///
232/// Labels:
233/// - `resource_type`: Kind of resource
234pub static GENERATION_OBSERVATION_LAG_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
235    let opts = HistogramOpts::new(
236        format!("{METRICS_NAMESPACE}_generation_observation_lag_seconds"),
237        "Lag between spec generation change and controller observation",
238    )
239    .buckets(vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0]);
240    let histogram = HistogramVec::new(opts, &["resource_type"]).unwrap();
241    METRICS_REGISTRY
242        .register(Box::new(histogram.clone()))
243        .unwrap();
244    histogram
245});
246
247// ============================================================================
248// Helper Functions
249// ============================================================================
250
251/// Record a successful reconciliation
252///
253/// # Arguments
254/// * `resource_type` - The kind of resource reconciled (e.g., `DNSZone`)
255/// * `duration` - Duration of the reconciliation
256pub fn record_reconciliation_success(resource_type: &str, duration: Duration) {
257    RECONCILIATION_TOTAL
258        .with_label_values(&[resource_type, "success"])
259        .inc();
260    RECONCILIATION_DURATION_SECONDS
261        .with_label_values(&[resource_type])
262        .observe(duration.as_secs_f64());
263}
264
265/// Record a failed reconciliation
266///
267/// # Arguments
268/// * `resource_type` - The kind of resource reconciled
269/// * `duration` - Duration of the reconciliation before failure
270pub fn record_reconciliation_error(resource_type: &str, duration: Duration) {
271    RECONCILIATION_TOTAL
272        .with_label_values(&[resource_type, "error"])
273        .inc();
274    RECONCILIATION_DURATION_SECONDS
275        .with_label_values(&[resource_type])
276        .observe(duration.as_secs_f64());
277}
278
279/// Record a reconciliation requeue
280///
281/// # Arguments
282/// * `resource_type` - The kind of resource reconciled
283/// * `reason` - Reason for requeue (e.g., `error`, `rate_limit`)
284pub fn record_reconciliation_requeue(resource_type: &str, reason: &str) {
285    RECONCILIATION_TOTAL
286        .with_label_values(&[resource_type, "requeue"])
287        .inc();
288    REQUEUE_TOTAL
289        .with_label_values(&[resource_type, reason])
290        .inc();
291}
292
293/// Record resource creation
294///
295/// # Arguments
296/// * `resource_type` - The kind of resource created
297pub fn record_resource_created(resource_type: &str) {
298    RESOURCES_CREATED_TOTAL
299        .with_label_values(&[resource_type])
300        .inc();
301    RESOURCES_ACTIVE.with_label_values(&[resource_type]).inc();
302}
303
304/// Record resource update
305///
306/// # Arguments
307/// * `resource_type` - The kind of resource updated
308pub fn record_resource_updated(resource_type: &str) {
309    RESOURCES_UPDATED_TOTAL
310        .with_label_values(&[resource_type])
311        .inc();
312}
313
314/// Record resource deletion
315///
316/// # Arguments
317/// * `resource_type` - The kind of resource deleted
318pub fn record_resource_deleted(resource_type: &str) {
319    RESOURCES_DELETED_TOTAL
320        .with_label_values(&[resource_type])
321        .inc();
322    RESOURCES_ACTIVE.with_label_values(&[resource_type]).dec();
323}
324
325/// Record an error
326///
327/// # Arguments
328/// * `resource_type` - The kind of resource where error occurred
329/// * `error_type` - Category of error (e.g., `api_error`, `validation_error`)
330pub fn record_error(resource_type: &str, error_type: &str) {
331    ERRORS_TOTAL
332        .with_label_values(&[resource_type, error_type])
333        .inc();
334}
335
336/// Record leader election acquired
337///
338/// # Arguments
339/// * `pod_name` - Name of the pod that acquired leadership
340pub fn record_leader_elected(pod_name: &str) {
341    LEADER_ELECTIONS_TOTAL
342        .with_label_values(&["acquired"])
343        .inc();
344    LEADER_STATUS.with_label_values(&[pod_name]).set(1.0);
345}
346
347/// Record leader election lost
348///
349/// # Arguments
350/// * `pod_name` - Name of the pod that lost leadership
351pub fn record_leader_lost(pod_name: &str) {
352    LEADER_ELECTIONS_TOTAL.with_label_values(&["lost"]).inc();
353    LEADER_STATUS.with_label_values(&[pod_name]).set(0.0);
354}
355
356/// Record leader election renewed
357pub fn record_leader_renewed() {
358    LEADER_ELECTIONS_TOTAL.with_label_values(&["renewed"]).inc();
359}
360
361/// Record generation observation lag
362///
363/// # Arguments
364/// * `resource_type` - The kind of resource
365/// * `lag` - Duration between generation change and observation
366pub fn record_generation_lag(resource_type: &str, lag: Duration) {
367    GENERATION_OBSERVATION_LAG_SECONDS
368        .with_label_values(&[resource_type])
369        .observe(lag.as_secs_f64());
370}
371
372/// Gather and encode all metrics in Prometheus text format
373///
374/// # Returns
375/// Prometheus-formatted metrics as a String
376///
377/// # Errors
378/// Returns error if encoding fails
379pub fn gather_metrics() -> Result<String, prometheus::Error> {
380    let encoder = TextEncoder::new();
381    let metric_families = METRICS_REGISTRY.gather();
382    let mut buffer = Vec::new();
383    encoder.encode(&metric_families, &mut buffer)?;
384    String::from_utf8(buffer).map_err(|e| prometheus::Error::Msg(format!("UTF-8 error: {e}")))
385}
386
#[cfg(test)]
mod tests {
    use super::*;

    /// A recorded success must show up in both the `success` counter and the
    /// duration histogram for the same resource type.
    #[test]
    fn test_record_reconciliation_success() {
        const KIND: &str = "TestResource";

        record_reconciliation_success(KIND, Duration::from_millis(500));

        // Counter series for (KIND, "success") has been incremented.
        let successes = RECONCILIATION_TOTAL
            .with_label_values(&[KIND, "success"])
            .get();
        assert!(successes > 0.0);

        // Histogram series for KIND has at least one sample.
        let samples = RECONCILIATION_DURATION_SECONDS
            .with_label_values(&[KIND])
            .get_sample_count();
        assert!(samples > 0);
    }

    /// A recorded failure must show up in both the `error` counter and the
    /// duration histogram for the same resource type.
    #[test]
    fn test_record_reconciliation_error() {
        const KIND: &str = "TestResourceError";

        record_reconciliation_error(KIND, Duration::from_millis(250));

        // Counter series for (KIND, "error") has been incremented.
        let failures = RECONCILIATION_TOTAL
            .with_label_values(&[KIND, "error"])
            .get();
        assert!(failures > 0.0);

        // Histogram series for KIND has at least one sample.
        let samples = RECONCILIATION_DURATION_SECONDS
            .with_label_values(&[KIND])
            .get_sample_count();
        assert!(samples > 0);
    }

    /// The encoded output must include the namespace prefix and the
    /// reconciliation counter once at least one sample has been recorded.
    #[test]
    fn test_gather_metrics() {
        // Touch a metric first so it is initialized and registered before gathering.
        record_reconciliation_success("GatherTest", Duration::from_millis(100));

        let metrics_text = match gather_metrics() {
            Ok(text) => text,
            Err(_) => panic!("Gathering metrics should succeed"),
        };

        let expectations = [
            ("bindy_firestoned_io", "Metrics should contain namespace prefix"),
            (
                "reconciliations_total",
                "Metrics should contain reconciliation counter",
            ),
        ];
        for (needle, failure_message) in expectations {
            assert!(metrics_text.contains(needle), "{failure_message}");
        }
    }
}