1use prometheus::{
27 CounterVec, Encoder, GaugeVec, HistogramOpts, HistogramVec, Opts, Registry, TextEncoder,
28};
29use std::sync::LazyLock;
30use std::time::Duration;
31
32const METRICS_NAMESPACE: &str = "bindy_firestoned_io";
38
39pub static METRICS_REGISTRY: LazyLock<Registry> = LazyLock::new(Registry::new);
47
48pub static RECONCILIATION_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
58 let opts = Opts::new(
59 format!("{METRICS_NAMESPACE}_reconciliations_total"),
60 "Total number of reconciliations by resource type and status",
61 );
62 let counter = CounterVec::new(opts, &["resource_type", "status"]).unwrap();
63 METRICS_REGISTRY
64 .register(Box::new(counter.clone()))
65 .unwrap();
66 counter
67});
68
69pub static RECONCILIATION_DURATION_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
74 let opts = HistogramOpts::new(
75 format!("{METRICS_NAMESPACE}_reconciliation_duration_seconds"),
76 "Duration of reconciliations in seconds by resource type",
77 )
78 .buckets(vec![0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0]);
79 let histogram = HistogramVec::new(opts, &["resource_type"]).unwrap();
80 METRICS_REGISTRY
81 .register(Box::new(histogram.clone()))
82 .unwrap();
83 histogram
84});
85
86pub static REQUEUE_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
92 let opts = Opts::new(
93 format!("{METRICS_NAMESPACE}_requeues_total"),
94 "Total number of requeue operations by resource type and reason",
95 );
96 let counter = CounterVec::new(opts, &["resource_type", "reason"]).unwrap();
97 METRICS_REGISTRY
98 .register(Box::new(counter.clone()))
99 .unwrap();
100 counter
101});
102
103pub static RESOURCES_CREATED_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
112 let opts = Opts::new(
113 format!("{METRICS_NAMESPACE}_resources_created_total"),
114 "Total number of resources created by type",
115 );
116 let counter = CounterVec::new(opts, &["resource_type"]).unwrap();
117 METRICS_REGISTRY
118 .register(Box::new(counter.clone()))
119 .unwrap();
120 counter
121});
122
123pub static RESOURCES_UPDATED_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
128 let opts = Opts::new(
129 format!("{METRICS_NAMESPACE}_resources_updated_total"),
130 "Total number of resources updated by type",
131 );
132 let counter = CounterVec::new(opts, &["resource_type"]).unwrap();
133 METRICS_REGISTRY
134 .register(Box::new(counter.clone()))
135 .unwrap();
136 counter
137});
138
139pub static RESOURCES_DELETED_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
144 let opts = Opts::new(
145 format!("{METRICS_NAMESPACE}_resources_deleted_total"),
146 "Total number of resources deleted by type",
147 );
148 let counter = CounterVec::new(opts, &["resource_type"]).unwrap();
149 METRICS_REGISTRY
150 .register(Box::new(counter.clone()))
151 .unwrap();
152 counter
153});
154
155pub static RESOURCES_ACTIVE: LazyLock<GaugeVec> = LazyLock::new(|| {
160 let opts = Opts::new(
161 format!("{METRICS_NAMESPACE}_resources_active"),
162 "Number of currently active resources by type",
163 );
164 let gauge = GaugeVec::new(opts, &["resource_type"]).unwrap();
165 METRICS_REGISTRY.register(Box::new(gauge.clone())).unwrap();
166 gauge
167});
168
169pub static ERRORS_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
179 let opts = Opts::new(
180 format!("{METRICS_NAMESPACE}_errors_total"),
181 "Total number of errors by resource type and error category",
182 );
183 let counter = CounterVec::new(opts, &["resource_type", "error_type"]).unwrap();
184 METRICS_REGISTRY
185 .register(Box::new(counter.clone()))
186 .unwrap();
187 counter
188});
189
190pub static LEADER_ELECTIONS_TOTAL: LazyLock<CounterVec> = LazyLock::new(|| {
199 let opts = Opts::new(
200 format!("{METRICS_NAMESPACE}_leader_elections_total"),
201 "Total number of leader election events by status",
202 );
203 let counter = CounterVec::new(opts, &["status"]).unwrap();
204 METRICS_REGISTRY
205 .register(Box::new(counter.clone()))
206 .unwrap();
207 counter
208});
209
210pub static LEADER_STATUS: LazyLock<GaugeVec> = LazyLock::new(|| {
217 let opts = Opts::new(
218 format!("{METRICS_NAMESPACE}_leader_status"),
219 "Current leader election status (1 = leader, 0 = follower)",
220 );
221 let gauge = GaugeVec::new(opts, &["pod_name"]).unwrap();
222 METRICS_REGISTRY.register(Box::new(gauge.clone())).unwrap();
223 gauge
224});
225
226pub static GENERATION_OBSERVATION_LAG_SECONDS: LazyLock<HistogramVec> = LazyLock::new(|| {
235 let opts = HistogramOpts::new(
236 format!("{METRICS_NAMESPACE}_generation_observation_lag_seconds"),
237 "Lag between spec generation change and controller observation",
238 )
239 .buckets(vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0]);
240 let histogram = HistogramVec::new(opts, &["resource_type"]).unwrap();
241 METRICS_REGISTRY
242 .register(Box::new(histogram.clone()))
243 .unwrap();
244 histogram
245});
246
247pub fn record_reconciliation_success(resource_type: &str, duration: Duration) {
257 RECONCILIATION_TOTAL
258 .with_label_values(&[resource_type, "success"])
259 .inc();
260 RECONCILIATION_DURATION_SECONDS
261 .with_label_values(&[resource_type])
262 .observe(duration.as_secs_f64());
263}
264
265pub fn record_reconciliation_error(resource_type: &str, duration: Duration) {
271 RECONCILIATION_TOTAL
272 .with_label_values(&[resource_type, "error"])
273 .inc();
274 RECONCILIATION_DURATION_SECONDS
275 .with_label_values(&[resource_type])
276 .observe(duration.as_secs_f64());
277}
278
279pub fn record_reconciliation_requeue(resource_type: &str, reason: &str) {
285 RECONCILIATION_TOTAL
286 .with_label_values(&[resource_type, "requeue"])
287 .inc();
288 REQUEUE_TOTAL
289 .with_label_values(&[resource_type, reason])
290 .inc();
291}
292
293pub fn record_resource_created(resource_type: &str) {
298 RESOURCES_CREATED_TOTAL
299 .with_label_values(&[resource_type])
300 .inc();
301 RESOURCES_ACTIVE.with_label_values(&[resource_type]).inc();
302}
303
304pub fn record_resource_updated(resource_type: &str) {
309 RESOURCES_UPDATED_TOTAL
310 .with_label_values(&[resource_type])
311 .inc();
312}
313
314pub fn record_resource_deleted(resource_type: &str) {
319 RESOURCES_DELETED_TOTAL
320 .with_label_values(&[resource_type])
321 .inc();
322 RESOURCES_ACTIVE.with_label_values(&[resource_type]).dec();
323}
324
325pub fn record_error(resource_type: &str, error_type: &str) {
331 ERRORS_TOTAL
332 .with_label_values(&[resource_type, error_type])
333 .inc();
334}
335
336pub fn record_leader_elected(pod_name: &str) {
341 LEADER_ELECTIONS_TOTAL
342 .with_label_values(&["acquired"])
343 .inc();
344 LEADER_STATUS.with_label_values(&[pod_name]).set(1.0);
345}
346
347pub fn record_leader_lost(pod_name: &str) {
352 LEADER_ELECTIONS_TOTAL.with_label_values(&["lost"]).inc();
353 LEADER_STATUS.with_label_values(&[pod_name]).set(0.0);
354}
355
356pub fn record_leader_renewed() {
358 LEADER_ELECTIONS_TOTAL.with_label_values(&["renewed"]).inc();
359}
360
361pub fn record_generation_lag(resource_type: &str, lag: Duration) {
367 GENERATION_OBSERVATION_LAG_SECONDS
368 .with_label_values(&[resource_type])
369 .observe(lag.as_secs_f64());
370}
371
372pub fn gather_metrics() -> Result<String, prometheus::Error> {
380 let encoder = TextEncoder::new();
381 let metric_families = METRICS_REGISTRY.gather();
382 let mut buffer = Vec::new();
383 encoder.encode(&metric_families, &mut buffer)?;
384 String::from_utf8(buffer).map_err(|e| prometheus::Error::Msg(format!("UTF-8 error: {e}")))
385}
386
#[cfg(test)]
mod tests {
    use super::*;

    /// A recorded success must show up in the "success" counter and the duration histogram.
    #[test]
    fn test_record_reconciliation_success() {
        let kind = "TestResource";

        record_reconciliation_success(kind, Duration::from_millis(500));

        let successes = RECONCILIATION_TOTAL
            .with_label_values(&[kind, "success"])
            .get();
        assert!(successes > 0.0);

        let samples = RECONCILIATION_DURATION_SECONDS
            .with_label_values(&[kind])
            .get_sample_count();
        assert!(samples > 0);
    }

    /// A recorded error must show up in the "error" counter and the duration histogram.
    #[test]
    fn test_record_reconciliation_error() {
        let kind = "TestResourceError";

        record_reconciliation_error(kind, Duration::from_millis(250));

        let failures = RECONCILIATION_TOTAL
            .with_label_values(&[kind, "error"])
            .get();
        assert!(failures > 0.0);

        let samples = RECONCILIATION_DURATION_SECONDS
            .with_label_values(&[kind])
            .get_sample_count();
        assert!(samples > 0);
    }

    /// The gathered output must contain the namespace prefix and the reconciliation counter.
    #[test]
    fn test_gather_metrics() {
        // Touch a metric first so the output contains at least one sample.
        record_reconciliation_success("GatherTest", Duration::from_millis(100));

        let Ok(metrics_text) = gather_metrics() else {
            panic!("Gathering metrics should succeed");
        };

        let expectations = [
            (
                "bindy_firestoned_io",
                "Metrics should contain namespace prefix",
            ),
            (
                "reconciliations_total",
                "Metrics should contain reconciliation counter",
            ),
        ];
        for (needle, message) in expectations {
            assert!(metrics_text.contains(needle), "{message}");
        }
    }
}