bindy/reconcilers/retry.rs
1// Copyright (c) 2025 Erick Bourgeois, firestoned
2// SPDX-License-Identifier: MIT
3
4//! Retry logic with exponential backoff for Kubernetes API calls.
5//!
6//! This module provides utilities for retrying transient API errors (429, 5xx)
7//! with exponential backoff, while failing fast on permanent errors (4xx client errors).
8
9use anyhow::Result;
10use rand::RngExt;
11use reqwest::StatusCode;
12use std::time::{Duration, Instant};
13use tracing::{debug, error, warn};
14
/// Upper bound on the total time spent retrying a Kubernetes API call (5 minutes).
const MAX_ELAPSED_TIME_SECS: u64 = 5 * 60;

/// Delay before the first Kubernetes API retry (100ms).
const INITIAL_INTERVAL_MILLIS: u64 = 100;

/// Ceiling for a single delay between Kubernetes API retries (30 seconds).
const MAX_INTERVAL_SECS: u64 = 30;

/// Growth factor applied to the delay after each retry (2.0 doubles it).
const BACKOFF_MULTIPLIER: f64 = 2.0;

/// Jitter applied to every delay (0.1 means ±10%) to prevent thundering herd.
const RANDOMIZATION_FACTOR: f64 = 0.1;

/// Delay before the first HTTP retry (50ms) - faster than Kubernetes API.
const HTTP_INITIAL_INTERVAL_MILLIS: u64 = 50;

/// Ceiling for a single delay between HTTP retries (10 seconds) - shorter than Kubernetes API.
const HTTP_MAX_INTERVAL_SECS: u64 = 10;

/// Upper bound on the total time spent retrying an HTTP call (2 minutes) - shorter than
/// Kubernetes API.
const HTTP_MAX_ELAPSED_TIME_SECS: u64 = 2 * 60;
38
/// Simple exponential backoff implementation.
///
/// Provides exponential backoff with randomization (jitter) to prevent thundering herd.
/// The type derives `Debug` and `Clone` so a schedule can be logged, embedded in other
/// `Debug` types, or copied; a clone keeps the original start time and therefore shares
/// the same elapsed-time budget.
#[derive(Debug, Clone)]
pub struct ExponentialBackoff {
    /// Current interval duration (the base delay the next retry will use, before jitter)
    pub current_interval: Duration,
    /// Initial interval duration (stored for potential reset functionality)
    #[allow(dead_code)]
    pub initial_interval: Duration,
    /// Maximum interval duration
    pub max_interval: Duration,
    /// Maximum total elapsed time; `None` disables the elapsed-time limit
    pub max_elapsed_time: Option<Duration>,
    /// Backoff multiplier (typically 2.0 for doubling)
    pub multiplier: f64,
    /// Randomization factor (e.g., 0.1 for ±10%)
    pub randomization_factor: f64,
    /// Start time for tracking total elapsed time
    start_time: Instant,
}
59
60impl ExponentialBackoff {
61 /// Create a new exponential backoff with specified parameters.
62 fn new(
63 initial_interval: Duration,
64 max_interval: Duration,
65 max_elapsed_time: Option<Duration>,
66 multiplier: f64,
67 randomization_factor: f64,
68 ) -> Self {
69 Self {
70 current_interval: initial_interval,
71 initial_interval,
72 max_interval,
73 max_elapsed_time,
74 multiplier,
75 randomization_factor,
76 start_time: Instant::now(),
77 }
78 }
79
80 /// Get the next backoff interval, or None if max elapsed time exceeded.
81 pub fn next_backoff(&mut self) -> Option<Duration> {
82 // Check if we've exceeded max elapsed time
83 if let Some(max_elapsed) = self.max_elapsed_time {
84 if self.start_time.elapsed() >= max_elapsed {
85 return None;
86 }
87 }
88
89 // Get current interval with jitter
90 let interval = self.current_interval;
91 let jittered = self.apply_jitter(interval);
92
93 // Calculate next interval (exponential growth)
94 let next = interval.as_secs_f64() * self.multiplier;
95 self.current_interval = Duration::from_secs_f64(next).min(self.max_interval);
96
97 Some(jittered)
98 }
99
100 /// Apply randomization (jitter) to an interval.
101 fn apply_jitter(&self, interval: Duration) -> Duration {
102 if self.randomization_factor == 0.0 {
103 return interval;
104 }
105
106 let secs = interval.as_secs_f64();
107 let delta = secs * self.randomization_factor;
108 let min = secs - delta;
109 let max = secs + delta;
110
111 let mut rng = rand::rng();
112 let jittered = rng.random_range(min..=max);
113
114 Duration::from_secs_f64(jittered.max(0.0))
115 }
116}
117
118/// Create default exponential backoff configuration for Kubernetes API retries.
119///
120/// # Configuration
121///
122/// - **Initial interval**: 100ms
123/// - **Max interval**: 30 seconds
124/// - **Max elapsed time**: 5 minutes total
125/// - **Multiplier**: 2.0 (exponential growth)
126/// - **Randomization**: ±10% (prevents thundering herd)
127///
128/// # Retry Schedule
129///
130/// With these settings, retries occur at approximately:
131///
132/// 1. 100ms
133/// 2. 200ms
134/// 3. 400ms
135/// 4. 800ms
136/// 5. 1.6s
137/// 6. 3.2s
138/// 7. 6.4s
139/// 8. 12.8s
140/// 9. 25.6s
141/// 10. 30s (capped at max interval)
142/// 11-30. 30s intervals until 5 minutes elapsed
143///
144/// # Returns
145///
146/// Configured `ExponentialBackoff` instance
147#[must_use]
148pub fn default_backoff() -> ExponentialBackoff {
149 ExponentialBackoff::new(
150 Duration::from_millis(INITIAL_INTERVAL_MILLIS),
151 Duration::from_secs(MAX_INTERVAL_SECS),
152 Some(Duration::from_secs(MAX_ELAPSED_TIME_SECS)),
153 BACKOFF_MULTIPLIER,
154 RANDOMIZATION_FACTOR,
155 )
156}
157
158/// Create exponential backoff configuration for HTTP API retries.
159///
160/// HTTP API calls (e.g., bindcar sidecar) use faster retry cycles than Kubernetes API
161/// since they target local/nearby services that should fail fast.
162///
163/// # Configuration
164///
165/// - **Initial interval**: 50ms
166/// - **Max interval**: 10 seconds
167/// - **Max elapsed time**: 2 minutes total
168/// - **Multiplier**: 2.0 (exponential growth)
169/// - **Randomization**: ±10% (prevents thundering herd)
170///
171/// # Retry Schedule
172///
173/// With these settings, retries occur at approximately:
174///
175/// 1. 50ms
176/// 2. 100ms
177/// 3. 200ms
178/// 4. 400ms
179/// 5. 800ms
180/// 6. 1.6s
181/// 7. 3.2s
182/// 8. 6.4s
183/// 9. 10s (capped at max interval)
184/// 10-12. 10s intervals until 2 minutes elapsed
185///
186/// # Returns
187///
188/// Configured `ExponentialBackoff` instance
189#[must_use]
190pub fn http_backoff() -> ExponentialBackoff {
191 ExponentialBackoff::new(
192 Duration::from_millis(HTTP_INITIAL_INTERVAL_MILLIS),
193 Duration::from_secs(HTTP_MAX_INTERVAL_SECS),
194 Some(Duration::from_secs(HTTP_MAX_ELAPSED_TIME_SECS)),
195 BACKOFF_MULTIPLIER,
196 RANDOMIZATION_FACTOR,
197 )
198}
199
200/// Determine if an HTTP status code is retryable.
201///
202/// # Retryable Status Codes
203///
204/// - **429** (Too Many Requests) - Rate limiting
205/// - **500** (Internal Server Error) - Server error
206/// - **502** (Bad Gateway) - Proxy/gateway error
207/// - **503** (Service Unavailable) - Temporary unavailability
208/// - **504** (Gateway Timeout) - Gateway timeout
209///
210/// # Arguments
211///
212/// * `status` - The HTTP status code to check
213///
214/// # Returns
215///
216/// `true` if the status code indicates a transient error, `false` otherwise
217#[must_use]
218pub fn is_retryable_http_status(status: StatusCode) -> bool {
219 matches!(
220 status,
221 StatusCode::TOO_MANY_REQUESTS
222 | StatusCode::INTERNAL_SERVER_ERROR
223 | StatusCode::BAD_GATEWAY
224 | StatusCode::SERVICE_UNAVAILABLE
225 | StatusCode::GATEWAY_TIMEOUT
226 )
227}
228
229/// Retry a Kubernetes API call with exponential backoff.
230///
231/// Automatically retries on transient errors (HTTP 429, 5xx) and fails immediately
232/// on permanent errors (4xx client errors except 429).
233///
234/// # Arguments
235///
236/// * `operation` - Async function that performs the API call
237/// * `operation_name` - Human-readable name for logging (e.g., "get cluster")
238///
239/// # Returns
240///
241/// Result of the API call after retries
242///
243/// # Errors
244///
245/// Returns error if:
246/// - Non-retryable error encountered (4xx client error)
247/// - Max elapsed time exceeded (5 minutes)
248/// - All retries exhausted
249///
250/// # Example
251///
252/// ```no_run
253/// use kube::{Api, Client};
254/// use bindy::crd::Bind9Cluster;
255/// use bindy::reconcilers::retry::retry_api_call;
256///
257/// # async fn example() -> anyhow::Result<()> {
258/// let client = Client::try_default().await?;
259/// let api: Api<Bind9Cluster> = Api::namespaced(client, "default");
260///
261/// let cluster = retry_api_call(
262/// || async { api.get("my-cluster").await.map_err(Into::into) },
263/// "get cluster my-cluster"
264/// ).await?;
265/// # Ok(())
266/// # }
267/// ```
268pub async fn retry_api_call<T, F, Fut>(mut operation: F, operation_name: &str) -> Result<T>
269where
270 F: FnMut() -> Fut,
271 Fut: std::future::Future<Output = Result<T, kube::Error>>,
272{
273 let mut backoff = default_backoff();
274 let start_time = Instant::now();
275 let mut attempt = 0;
276
277 loop {
278 attempt += 1;
279
280 let result = operation().await;
281
282 match result {
283 Ok(value) => {
284 if attempt > 1 {
285 debug!(
286 operation = operation_name,
287 attempt = attempt,
288 elapsed = ?start_time.elapsed(),
289 "Kubernetes API call succeeded after retries"
290 );
291 } else {
292 debug!(operation = operation_name, "Kubernetes API call succeeded");
293 }
294 return Ok(value);
295 }
296 Err(e) => {
297 // Check if error is retryable
298 if !is_retryable_error(&e) {
299 error!(
300 operation = operation_name,
301 error = %e,
302 "Non-retryable Kubernetes API error, failing immediately"
303 );
304 return Err(e.into());
305 }
306
307 // Check if we've exceeded max elapsed time
308 if let Some(max_elapsed) = backoff.max_elapsed_time {
309 if start_time.elapsed() >= max_elapsed {
310 error!(
311 operation = operation_name,
312 attempt = attempt,
313 elapsed = ?start_time.elapsed(),
314 error = %e,
315 "Max retry time exceeded, giving up"
316 );
317 return Err(anyhow::anyhow!(
318 "Max retry time exceeded after {attempt} attempts: {e}"
319 ));
320 }
321 }
322
323 // Calculate next backoff interval
324 if let Some(duration) = backoff.next_backoff() {
325 warn!(
326 operation = operation_name,
327 attempt = attempt,
328 retry_after = ?duration,
329 error = %e,
330 "Retryable Kubernetes API error, will retry"
331 );
332 tokio::time::sleep(duration).await;
333 } else {
334 error!(
335 operation = operation_name,
336 attempt = attempt,
337 elapsed = ?start_time.elapsed(),
338 error = %e,
339 "Backoff exhausted, giving up"
340 );
341 return Err(anyhow::anyhow!(
342 "Backoff exhausted after {attempt} attempts: {e}"
343 ));
344 }
345 }
346 }
347 }
348}
349
350/// Determine if a Kubernetes error is retryable.
351///
352/// # Retryable Errors
353///
354/// - **HTTP 429** (Too Many Requests) - Rate limiting
355/// - **HTTP 5xx** (Server Errors) - Temporary API server issues
356/// - **Service Errors** - Network/connection issues
357///
358/// # Non-Retryable Errors
359///
360/// - **HTTP 4xx** (Client Errors, except 429) - Invalid request, not found, unauthorized, etc.
361/// - **Invalid Request** - Malformed data, schema violations
362///
363/// # Arguments
364///
365/// * `err` - The Kubernetes API error to check
366///
367/// # Returns
368///
369/// `true` if the error is transient and should be retried, `false` otherwise
370fn is_retryable_error(err: &kube::Error) -> bool {
371 match err {
372 kube::Error::Api(api_err) => {
373 // Retry on rate limiting (429) and server errors (5xx)
374 api_err.code == 429 || (api_err.code >= 500 && api_err.code < 600)
375 }
376 kube::Error::Service(_) => {
377 // Network/connection errors are retryable
378 true
379 }
380 _ => {
381 // Client errors (invalid request, not found, etc.) are not retryable
382 false
383 }
384 }
385}
386
387#[cfg(test)]
388#[path = "retry_tests.rs"]
389mod retry_tests;