bindy/reconcilers/
retry.rs

1// Copyright (c) 2025 Erick Bourgeois, firestoned
2// SPDX-License-Identifier: MIT
3
4//! Retry logic with exponential backoff for Kubernetes API calls.
5//!
6//! This module provides utilities for retrying transient API errors (429, 5xx)
7//! with exponential backoff, while failing fast on permanent errors (4xx client errors).
8
9use anyhow::Result;
10use rand::RngExt;
11use reqwest::StatusCode;
12use std::time::{Duration, Instant};
13use tracing::{debug, error, warn};
14
/// Total retry budget for Kubernetes API calls, in seconds (5 minutes).
const MAX_ELAPSED_TIME_SECS: u64 = 5 * 60;

/// First wait between Kubernetes API retries, in milliseconds (100ms).
const INITIAL_INTERVAL_MILLIS: u64 = 100;

/// Upper bound on a single un-jittered wait between Kubernetes API retries,
/// in seconds (30 seconds).
const MAX_INTERVAL_SECS: u64 = 30;

/// Growth factor applied to the wait after every retry (2.0 doubles it).
const BACKOFF_MULTIPLIER: f64 = 2.0;

/// Fraction of each wait used as random jitter (0.1 means ±10%), so that
/// many clients do not retry in lockstep.
const RANDOMIZATION_FACTOR: f64 = 0.1;

/// First wait between HTTP API retries, in milliseconds (50ms) — shorter
/// than the Kubernetes API value.
const HTTP_INITIAL_INTERVAL_MILLIS: u64 = 50;

/// Upper bound on a single un-jittered wait between HTTP API retries, in
/// seconds (10 seconds) — shorter than the Kubernetes API value.
const HTTP_MAX_INTERVAL_SECS: u64 = 10;

/// Total retry budget for HTTP API calls, in seconds (2 minutes) — shorter
/// than the Kubernetes API value.
const HTTP_MAX_ELAPSED_TIME_SECS: u64 = 2 * 60;
38
/// Simple exponential backoff state.
///
/// Holds the configuration (intervals, multiplier, jitter factor, optional
/// total time budget) together with the mutable state needed to hand out
/// successive retry delays. Randomization (jitter) is applied to each delay
/// to prevent a thundering herd of synchronized retries.
///
/// The type is `Debug` so it can be logged or embedded in other `Debug`
/// types, and `Clone` so a configured backoff can be snapshotted; a clone
/// keeps the original start time and therefore shares the same time budget.
#[derive(Debug, Clone)]
pub struct ExponentialBackoff {
    /// Un-jittered delay that the next call to `next_backoff` will be based on
    pub current_interval: Duration,
    /// Initial interval duration (stored for potential reset functionality)
    #[allow(dead_code)]
    pub initial_interval: Duration,
    /// Cap applied to `current_interval` each time it grows
    pub max_interval: Duration,
    /// Maximum total elapsed time since construction; `None` means no limit
    pub max_elapsed_time: Option<Duration>,
    /// Backoff multiplier (typically 2.0 for doubling)
    pub multiplier: f64,
    /// Randomization factor (e.g., 0.1 for ±10%)
    pub randomization_factor: f64,
    /// Instant at which this backoff was created; used to measure the total
    /// elapsed time against `max_elapsed_time`
    start_time: Instant,
}
59
60impl ExponentialBackoff {
61    /// Create a new exponential backoff with specified parameters.
62    fn new(
63        initial_interval: Duration,
64        max_interval: Duration,
65        max_elapsed_time: Option<Duration>,
66        multiplier: f64,
67        randomization_factor: f64,
68    ) -> Self {
69        Self {
70            current_interval: initial_interval,
71            initial_interval,
72            max_interval,
73            max_elapsed_time,
74            multiplier,
75            randomization_factor,
76            start_time: Instant::now(),
77        }
78    }
79
80    /// Get the next backoff interval, or None if max elapsed time exceeded.
81    pub fn next_backoff(&mut self) -> Option<Duration> {
82        // Check if we've exceeded max elapsed time
83        if let Some(max_elapsed) = self.max_elapsed_time {
84            if self.start_time.elapsed() >= max_elapsed {
85                return None;
86            }
87        }
88
89        // Get current interval with jitter
90        let interval = self.current_interval;
91        let jittered = self.apply_jitter(interval);
92
93        // Calculate next interval (exponential growth)
94        let next = interval.as_secs_f64() * self.multiplier;
95        self.current_interval = Duration::from_secs_f64(next).min(self.max_interval);
96
97        Some(jittered)
98    }
99
100    /// Apply randomization (jitter) to an interval.
101    fn apply_jitter(&self, interval: Duration) -> Duration {
102        if self.randomization_factor == 0.0 {
103            return interval;
104        }
105
106        let secs = interval.as_secs_f64();
107        let delta = secs * self.randomization_factor;
108        let min = secs - delta;
109        let max = secs + delta;
110
111        let mut rng = rand::rng();
112        let jittered = rng.random_range(min..=max);
113
114        Duration::from_secs_f64(jittered.max(0.0))
115    }
116}
117
118/// Create default exponential backoff configuration for Kubernetes API retries.
119///
120/// # Configuration
121///
122/// - **Initial interval**: 100ms
123/// - **Max interval**: 30 seconds
124/// - **Max elapsed time**: 5 minutes total
125/// - **Multiplier**: 2.0 (exponential growth)
126/// - **Randomization**: ±10% (prevents thundering herd)
127///
128/// # Retry Schedule
129///
130/// With these settings, retries occur at approximately:
131///
132/// 1. 100ms
133/// 2. 200ms
134/// 3. 400ms
135/// 4. 800ms
136/// 5. 1.6s
137/// 6. 3.2s
138/// 7. 6.4s
139/// 8. 12.8s
140/// 9. 25.6s
141/// 10. 30s (capped at max interval)
142///     11-30. 30s intervals until 5 minutes elapsed
143///
144/// # Returns
145///
146/// Configured `ExponentialBackoff` instance
147#[must_use]
148pub fn default_backoff() -> ExponentialBackoff {
149    ExponentialBackoff::new(
150        Duration::from_millis(INITIAL_INTERVAL_MILLIS),
151        Duration::from_secs(MAX_INTERVAL_SECS),
152        Some(Duration::from_secs(MAX_ELAPSED_TIME_SECS)),
153        BACKOFF_MULTIPLIER,
154        RANDOMIZATION_FACTOR,
155    )
156}
157
158/// Create exponential backoff configuration for HTTP API retries.
159///
160/// HTTP API calls (e.g., bindcar sidecar) use faster retry cycles than Kubernetes API
161/// since they target local/nearby services that should fail fast.
162///
163/// # Configuration
164///
165/// - **Initial interval**: 50ms
166/// - **Max interval**: 10 seconds
167/// - **Max elapsed time**: 2 minutes total
168/// - **Multiplier**: 2.0 (exponential growth)
169/// - **Randomization**: ±10% (prevents thundering herd)
170///
171/// # Retry Schedule
172///
173/// With these settings, retries occur at approximately:
174///
175/// 1. 50ms
176/// 2. 100ms
177/// 3. 200ms
178/// 4. 400ms
179/// 5. 800ms
180/// 6. 1.6s
181/// 7. 3.2s
182/// 8. 6.4s
183/// 9. 10s (capped at max interval)
184///    10-12. 10s intervals until 2 minutes elapsed
185///
186/// # Returns
187///
188/// Configured `ExponentialBackoff` instance
189#[must_use]
190pub fn http_backoff() -> ExponentialBackoff {
191    ExponentialBackoff::new(
192        Duration::from_millis(HTTP_INITIAL_INTERVAL_MILLIS),
193        Duration::from_secs(HTTP_MAX_INTERVAL_SECS),
194        Some(Duration::from_secs(HTTP_MAX_ELAPSED_TIME_SECS)),
195        BACKOFF_MULTIPLIER,
196        RANDOMIZATION_FACTOR,
197    )
198}
199
200/// Determine if an HTTP status code is retryable.
201///
202/// # Retryable Status Codes
203///
204/// - **429** (Too Many Requests) - Rate limiting
205/// - **500** (Internal Server Error) - Server error
206/// - **502** (Bad Gateway) - Proxy/gateway error
207/// - **503** (Service Unavailable) - Temporary unavailability
208/// - **504** (Gateway Timeout) - Gateway timeout
209///
210/// # Arguments
211///
212/// * `status` - The HTTP status code to check
213///
214/// # Returns
215///
216/// `true` if the status code indicates a transient error, `false` otherwise
217#[must_use]
218pub fn is_retryable_http_status(status: StatusCode) -> bool {
219    matches!(
220        status,
221        StatusCode::TOO_MANY_REQUESTS
222            | StatusCode::INTERNAL_SERVER_ERROR
223            | StatusCode::BAD_GATEWAY
224            | StatusCode::SERVICE_UNAVAILABLE
225            | StatusCode::GATEWAY_TIMEOUT
226    )
227}
228
229/// Retry a Kubernetes API call with exponential backoff.
230///
231/// Automatically retries on transient errors (HTTP 429, 5xx) and fails immediately
232/// on permanent errors (4xx client errors except 429).
233///
234/// # Arguments
235///
236/// * `operation` - Async function that performs the API call
237/// * `operation_name` - Human-readable name for logging (e.g., "get cluster")
238///
239/// # Returns
240///
241/// Result of the API call after retries
242///
243/// # Errors
244///
245/// Returns error if:
246/// - Non-retryable error encountered (4xx client error)
247/// - Max elapsed time exceeded (5 minutes)
248/// - All retries exhausted
249///
250/// # Example
251///
252/// ```no_run
253/// use kube::{Api, Client};
254/// use bindy::crd::Bind9Cluster;
255/// use bindy::reconcilers::retry::retry_api_call;
256///
257/// # async fn example() -> anyhow::Result<()> {
258/// let client = Client::try_default().await?;
259/// let api: Api<Bind9Cluster> = Api::namespaced(client, "default");
260///
261/// let cluster = retry_api_call(
262///     || async { api.get("my-cluster").await.map_err(Into::into) },
263///     "get cluster my-cluster"
264/// ).await?;
265/// # Ok(())
266/// # }
267/// ```
268pub async fn retry_api_call<T, F, Fut>(mut operation: F, operation_name: &str) -> Result<T>
269where
270    F: FnMut() -> Fut,
271    Fut: std::future::Future<Output = Result<T, kube::Error>>,
272{
273    let mut backoff = default_backoff();
274    let start_time = Instant::now();
275    let mut attempt = 0;
276
277    loop {
278        attempt += 1;
279
280        let result = operation().await;
281
282        match result {
283            Ok(value) => {
284                if attempt > 1 {
285                    debug!(
286                        operation = operation_name,
287                        attempt = attempt,
288                        elapsed = ?start_time.elapsed(),
289                        "Kubernetes API call succeeded after retries"
290                    );
291                } else {
292                    debug!(operation = operation_name, "Kubernetes API call succeeded");
293                }
294                return Ok(value);
295            }
296            Err(e) => {
297                // Check if error is retryable
298                if !is_retryable_error(&e) {
299                    error!(
300                        operation = operation_name,
301                        error = %e,
302                        "Non-retryable Kubernetes API error, failing immediately"
303                    );
304                    return Err(e.into());
305                }
306
307                // Check if we've exceeded max elapsed time
308                if let Some(max_elapsed) = backoff.max_elapsed_time {
309                    if start_time.elapsed() >= max_elapsed {
310                        error!(
311                            operation = operation_name,
312                            attempt = attempt,
313                            elapsed = ?start_time.elapsed(),
314                            error = %e,
315                            "Max retry time exceeded, giving up"
316                        );
317                        return Err(anyhow::anyhow!(
318                            "Max retry time exceeded after {attempt} attempts: {e}"
319                        ));
320                    }
321                }
322
323                // Calculate next backoff interval
324                if let Some(duration) = backoff.next_backoff() {
325                    warn!(
326                        operation = operation_name,
327                        attempt = attempt,
328                        retry_after = ?duration,
329                        error = %e,
330                        "Retryable Kubernetes API error, will retry"
331                    );
332                    tokio::time::sleep(duration).await;
333                } else {
334                    error!(
335                        operation = operation_name,
336                        attempt = attempt,
337                        elapsed = ?start_time.elapsed(),
338                        error = %e,
339                        "Backoff exhausted, giving up"
340                    );
341                    return Err(anyhow::anyhow!(
342                        "Backoff exhausted after {attempt} attempts: {e}"
343                    ));
344                }
345            }
346        }
347    }
348}
349
350/// Determine if a Kubernetes error is retryable.
351///
352/// # Retryable Errors
353///
354/// - **HTTP 429** (Too Many Requests) - Rate limiting
355/// - **HTTP 5xx** (Server Errors) - Temporary API server issues
356/// - **Service Errors** - Network/connection issues
357///
358/// # Non-Retryable Errors
359///
360/// - **HTTP 4xx** (Client Errors, except 429) - Invalid request, not found, unauthorized, etc.
361/// - **Invalid Request** - Malformed data, schema violations
362///
363/// # Arguments
364///
365/// * `err` - The Kubernetes API error to check
366///
367/// # Returns
368///
369/// `true` if the error is transient and should be retried, `false` otherwise
370fn is_retryable_error(err: &kube::Error) -> bool {
371    match err {
372        kube::Error::Api(api_err) => {
373            // Retry on rate limiting (429) and server errors (5xx)
374            api_err.code == 429 || (api_err.code >= 500 && api_err.code < 600)
375        }
376        kube::Error::Service(_) => {
377            // Network/connection errors are retryable
378            true
379        }
380        _ => {
381            // Client errors (invalid request, not found, etc.) are not retryable
382            false
383        }
384    }
385}
386
// Unit tests for this module are kept out of line, in the file named by the
// `#[path]` attribute below, and are compiled only for test builds.
#[cfg(test)]
#[path = "retry_tests.rs"]
mod retry_tests;