bindy/reconcilers/
dnszone.rs

1// Copyright (c) 2025 Erick Bourgeois, firestoned
2#![allow(clippy::uninlined_format_args)]
3#![allow(clippy::doc_markdown)]
4// SPDX-License-Identifier: MIT
5
6//! DNS zone reconciliation logic.
7//!
8//! This module handles the creation and management of DNS zones on BIND9 servers.
9//! It supports both primary and secondary zone configurations.
10
11// Module imports
12pub mod bind9_config;
13pub mod cleanup;
14pub mod constants;
15pub mod discovery;
16pub mod helpers;
17pub mod primary;
18pub mod secondary;
19pub mod status_helpers;
20pub mod types;
21pub mod validation;
22
23#[cfg(test)]
24#[path = "dnszone/helpers_tests.rs"]
25mod helpers_tests;
26
27// Bind9Instance and InstanceReferenceWithStatus are used by dead_code marked functions (Phase 2 cleanup)
28use self::types::DuplicateZoneInfo;
29#[allow(unused_imports)]
30use crate::crd::{Condition, DNSZone, DNSZoneStatus};
31use anyhow::{anyhow, Result};
32use bindcar::{ZONE_TYPE_PRIMARY, ZONE_TYPE_SECONDARY};
33use futures::stream::{self, StreamExt};
34use k8s_openapi::api::core::v1::{Pod, Service};
35use kube::{api::ListParams, client::Client, Api, ResourceExt};
36use std::collections::HashMap;
37use std::sync::Arc;
38use tokio::sync::Mutex;
39use tracing::{debug, error, info, warn};
40
41/// Creates a map of nameserver hostnames to IP addresses by:
42/// 1. Checking for Service external IPs first (`LoadBalancer` or `NodePort`)
43/// 2. Falling back to pod IPs if no external IPs are available
44///
45/// Nameservers are named: `ns1.{zone_name}.`, `ns2.{zone_name}.`, etc.
46/// Order: Primary instances first, then secondary instances.
47///
48/// # Arguments
49///
50/// * `client` - Kubernetes API client
51/// * `zone_name` - DNS zone name (e.g., "example.com")
52/// * `instance_refs` - All instance references (primaries and secondaries)
53///
54/// # Returns
55///
56/// `HashMap` of nameserver hostnames to IP addresses, or None if no IPs found
57///
58/// # Errors
59///
60/// Returns an error if Kubernetes API calls fail.
61pub async fn generate_nameserver_ips(
62    client: &Client,
63    zone_name: &str,
64    instance_refs: &[crate::crd::InstanceReference],
65) -> Result<Option<HashMap<String, String>>> {
66    if instance_refs.is_empty() {
67        return Ok(None);
68    }
69
70    let mut nameserver_ips = HashMap::new();
71    let mut ns_index = 1;
72
73    // Process primaries first, then secondaries
74    for instance_ref in instance_refs {
75        // Try to get Service external IP first
76        let service_api: Api<Service> = Api::namespaced(client.clone(), &instance_ref.namespace);
77
78        let ip = match service_api.get(&instance_ref.name).await {
79            Ok(service) => {
80                // Check for LoadBalancer external IP
81                if let Some(status) = &service.status {
82                    if let Some(load_balancer) = &status.load_balancer {
83                        if let Some(ingress_list) = &load_balancer.ingress {
84                            if let Some(ingress) = ingress_list.first() {
85                                if let Some(lb_ip) = &ingress.ip {
86                                    debug!(
87                                        "Using LoadBalancer IP {} for instance {}/{}",
88                                        lb_ip, instance_ref.namespace, instance_ref.name
89                                    );
90                                    Some(lb_ip.clone())
91                                } else {
92                                    None
93                                }
94                            } else {
95                                None
96                            }
97                        } else {
98                            None
99                        }
100                    } else {
101                        None
102                    }
103                } else {
104                    None
105                }
106            }
107            Err(e) => {
108                debug!(
109                    "Failed to get service for instance {}/{}: {}. Will try pod IP.",
110                    instance_ref.namespace, instance_ref.name, e
111                );
112                None
113            }
114        };
115
116        // If no service external IP, fallback to pod IP
117        let ip = if ip.is_none() {
118            // Get pod IP
119            let pod_api: Api<Pod> = Api::namespaced(client.clone(), &instance_ref.namespace);
120            let label_selector = format!("app=bind9,instance={}", instance_ref.name);
121            let lp = ListParams::default().labels(&label_selector);
122
123            match pod_api.list(&lp).await {
124                Ok(pods) => {
125                    // Find first running pod
126                    pods.items
127                        .iter()
128                        .find(|pod| {
129                            let phase = pod
130                                .status
131                                .as_ref()
132                                .and_then(|s| s.phase.as_ref())
133                                .map_or("Unknown", std::string::String::as_str);
134                            phase == "Running"
135                        })
136                        .and_then(|pod| {
137                            pod.status
138                                .as_ref()
139                                .and_then(|s| s.pod_ip.as_ref())
140                                .map(|ip| {
141                                    debug!(
142                                        "Using pod IP {} for instance {}/{}",
143                                        ip, instance_ref.namespace, instance_ref.name
144                                    );
145                                    ip.clone()
146                                })
147                        })
148                }
149                Err(e) => {
150                    warn!(
151                        "Failed to list pods for instance {}/{}: {}. Skipping.",
152                        instance_ref.namespace, instance_ref.name, e
153                    );
154                    None
155                }
156            }
157        } else {
158            ip
159        };
160
161        // Add to nameserver map if we found an IP
162        if let Some(ip) = ip {
163            let ns_hostname = format!("ns{ns_index}.{zone_name}.");
164            nameserver_ips.insert(ns_hostname, ip);
165            ns_index += 1;
166        }
167    }
168
169    if nameserver_ips.is_empty() {
170        Ok(None)
171    } else {
172        Ok(Some(nameserver_ips))
173    }
174}
175
176/// Get the effective nameservers list for a DNSZone, handling both new and deprecated fields.
177///
178/// This function provides backward compatibility by:
179/// 1. Preferring the new `name_servers` field if present
180/// 2. Falling back to the deprecated `name_server_ips` field with automatic migration
181/// 3. Logging deprecation warnings when the old field is used
182///
183/// # Arguments
184/// * `spec` - The DNSZone spec containing nameserver configuration
185///
186/// # Returns
187/// `Option<Vec<NameServer>>` - The effective list of nameservers, or `None` if neither field is set
188///
189/// # Examples
190///
191/// ```
192/// # #[allow(deprecated)]
193/// # use bindy::crd::{DNSZoneSpec, NameServer, SOARecord};
194/// # use std::collections::HashMap;
195/// // New field takes precedence
196/// let spec = DNSZoneSpec {
197///     zone_name: "example.com".into(),
198///     soa_record: SOARecord {
199///         primary_ns: "ns1.example.com.".into(),
200///         admin_email: "admin.example.com.".into(),
201///         serial: 1,
202///         refresh: 3600,
203///         retry: 600,
204///         expire: 604800,
205///         negative_ttl: 86400,
206///     },
207///     ttl: None,
208///     cluster_ref: None,
209///     name_servers: Some(vec![NameServer {
210///         hostname: "ns2.example.com.".into(),
211///         ipv4_address: None,
212///         ipv6_address: None,
213///     }]),
214///     name_server_ips: Some(HashMap::from([("ns3.example.com.".into(), "192.0.2.3".into())])),
215///     records_from: None,
216///     bind9_instances_from: None,
217///     dnssec_policy: None,
218/// };
219/// // Returns name_servers (new field), ignoring name_server_ips
220/// ```
221fn get_effective_name_servers(
222    spec: &crate::crd::DNSZoneSpec,
223) -> Option<Vec<crate::crd::NameServer>> {
224    use crate::crd::NameServer;
225
226    // New field takes precedence
227    if let Some(ref new_servers) = spec.name_servers {
228        debug!(
229            "Using new `nameServers` field with {} server(s)",
230            new_servers.len()
231        );
232        return Some(new_servers.clone());
233    }
234
235    // Fallback to deprecated field with migration warning
236    #[allow(deprecated)]
237    if let Some(ref old_ips) = spec.name_server_ips {
238        warn!(
239            "DNSZone uses deprecated `nameServerIps` field. \
240             Migrate to `nameServers` for better functionality and IPv6 support. \
241             See migration guide at docs/src/operations/migration-guide.md"
242        );
243
244        // Convert HashMap<String, String> to Vec<NameServer>
245        // Old format: {"ns2.example.com.": "192.0.2.2"}
246        // New format: vec![NameServer { hostname: "ns2.example.com.", ipv4_address: Some("192.0.2.2"), .. }]
247        let servers: Vec<NameServer> = old_ips
248            .iter()
249            .map(|(hostname, ip)| NameServer {
250                hostname: hostname.clone(),
251                ipv4_address: Some(ip.clone()),
252                ipv6_address: None, // Old field doesn't support IPv6
253            })
254            .collect();
255
256        debug!(
257            "Migrated {} server(s) from deprecated `nameServerIps` to new format",
258            servers.len()
259        );
260
261        return Some(servers);
262    }
263
264    // Neither field set
265    None
266}
267
268/// Re-fetch a DNSZone to get the latest status.
269///
270/// The `dnszone` parameter from the watch event might have stale status from the cache.
271/// We need the latest `status.bind9Instances` which may have been updated by the
272/// Bind9Instance reconciler.
273///
274/// # Arguments
275/// * `client` - Kubernetes client
276/// * `namespace` - Namespace of the DNSZone
277/// * `name` - Name of the DNSZone
278///
279/// # Returns
280/// The freshly fetched DNSZone with current status
281///
282/// # Errors
283/// Returns an error if the Kubernetes API call fails
284async fn refetch_zone(client: &kube::Client, namespace: &str, name: &str) -> Result<DNSZone> {
285    let zones_api: Api<DNSZone> = Api::namespaced(client.clone(), namespace);
286    let zone = zones_api.get(name).await?;
287    Ok(zone)
288}
289
290/// Handle duplicate zone conflicts by setting Ready=False and stopping reconciliation.
291///
292/// When a duplicate zone is detected, this function:
293/// 1. Logs a warning with details about the conflict
294/// 2. Updates the status with Ready=False and DuplicateZone condition
295/// 3. Applies the status to the API server
296///
297/// # Arguments
298/// * `client` - Kubernetes client
299/// * `namespace` - Namespace of the conflicting DNSZone
300/// * `name` - Name of the conflicting DNSZone
301/// * `duplicate_info` - Information about the duplicate zone conflict
302/// * `status_updater` - Status updater to apply the condition
303///
304/// # Errors
305/// Returns an error if the status update fails
306async fn handle_duplicate_zone(
307    client: &kube::Client,
308    namespace: &str,
309    name: &str,
310    duplicate_info: &DuplicateZoneInfo,
311    status_updater: &mut crate::reconcilers::status::DNSZoneStatusUpdater,
312) -> Result<()> {
313    warn!(
314        "Duplicate zone detected: {}/{} cannot claim '{}' because it is already configured by: {:?}",
315        namespace, name, duplicate_info.zone_name, duplicate_info.conflicting_zones
316    );
317
318    // Build list of conflicting zones in namespace/name format
319    let conflicting_zone_refs: Vec<String> = duplicate_info
320        .conflicting_zones
321        .iter()
322        .map(|z| format!("{}/{}", z.namespace, z.name))
323        .collect();
324
325    // Set Ready=False with DuplicateZone reason
326    status_updater.set_duplicate_zone_condition(&duplicate_info.zone_name, &conflicting_zone_refs);
327
328    // Apply status and stop processing
329    status_updater.apply(client).await?;
330
331    Ok(())
332}
333
334/// Detect if the zone spec has changed since last reconciliation.
335///
336/// Compares current generation with observed generation to determine
337/// if this is first reconciliation or if spec changed.
338///
339/// # Arguments
340///
341/// * `zone` - The DNSZone resource
342///
343/// # Returns
344///
345/// Tuple of (first_reconciliation, spec_changed)
346fn detect_spec_changes(zone: &DNSZone) -> (bool, bool) {
347    let current_generation = zone.metadata.generation;
348    let observed_generation = zone.status.as_ref().and_then(|s| s.observed_generation);
349
350    let first_reconciliation = observed_generation.is_none();
351    let spec_changed =
352        crate::reconcilers::should_reconcile(current_generation, observed_generation);
353
354    (first_reconciliation, spec_changed)
355}
356
357/// Detect if the instance list changed between watch event and re-fetch.
358///
359/// This is critical for detecting when:
360/// 1. New instances are added to `status.bind9Instances` (via `bind9InstancesFrom` selectors)
361/// 2. Instance `lastReconciledAt` timestamps are cleared (e.g., instance deleted, needs reconfiguration)
362///
363/// NOTE: `InstanceReference` `PartialEq` ignores `lastReconciledAt`, so we must check timestamps separately!
364///
365/// # Arguments
366///
367/// * `namespace` - Namespace for logging
368/// * `name` - Zone name for logging
369/// * `watch_instances` - Instances from the watch event that triggered reconciliation
370/// * `current_instances` - Instances after re-fetching (current state)
371///
372/// # Returns
373///
374/// `true` if instances changed (list or timestamps), `false` otherwise
375fn detect_instance_changes(
376    namespace: &str,
377    name: &str,
378    watch_instances: Option<&Vec<crate::crd::InstanceReference>>,
379    current_instances: &[crate::crd::InstanceReference],
380) -> bool {
381    let Some(watch_instances) = watch_instances else {
382        // No instances in watch event, first reconciliation or error
383        return true;
384    };
385
386    // Get the instance names from the watch event (what triggered us)
387    let watch_instance_names: std::collections::HashSet<_> =
388        watch_instances.iter().map(|r| &r.name).collect();
389
390    // Get the instance names after re-fetching (current state)
391    let current_instance_names: std::collections::HashSet<_> =
392        current_instances.iter().map(|r| &r.name).collect();
393
394    // Check if instance list changed (added/removed instances)
395    let list_changed = watch_instance_names != current_instance_names;
396
397    if list_changed {
398        info!(
399            "Instance list changed during reconciliation for zone {}/{}: watch_event={:?}, current={:?}",
400            namespace, name, watch_instance_names, current_instance_names
401        );
402        return true;
403    }
404
405    // List is the same, but check if any lastReconciledAt timestamps changed
406    // Use InstanceReference as HashMap key (uses its Hash impl which hashes identity fields)
407    let watch_timestamps: std::collections::HashMap<&crate::crd::InstanceReference, Option<&str>> =
408        watch_instances
409            .iter()
410            .map(|inst| (inst, inst.last_reconciled_at.as_deref()))
411            .collect();
412
413    let current_timestamps: std::collections::HashMap<
414        &crate::crd::InstanceReference,
415        Option<&str>,
416    > = current_instances
417        .iter()
418        .map(|inst| (inst, inst.last_reconciled_at.as_deref()))
419        .collect();
420
421    let timestamps_changed = watch_timestamps.iter().any(|(inst_ref, watch_ts)| {
422        current_timestamps
423            .get(inst_ref)
424            .is_some_and(|current_ts| current_ts != watch_ts)
425    });
426
427    if timestamps_changed {
428        info!(
429            "Instance lastReconciledAt timestamps changed for zone {}/{}",
430            namespace, name
431        );
432    }
433
434    timestamps_changed
435}
436
437/// Reconciles a `DNSZone` resource.
438///
439/// Creates or updates DNS zone files on BIND9 instances that match the zone's
440/// instance selector. Supports both primary and secondary zone types.
441///
442/// # Zone Types
443///
444/// - **Primary**: Authoritative zone with SOA record and local zone file
445/// - **Secondary**: Replica zone that transfers from primary servers
446///
447/// # Arguments
448///
449/// * `client` - Kubernetes API client for finding matching `Bind9Instances`
450/// * `dnszone` - The `DNSZone` resource to reconcile
451/// * `zone_manager` - BIND9 manager for creating zone files
452///
453/// # Returns
454///
455/// * `Ok(())` - If zone was created/updated successfully
456/// * `Err(_)` - If zone creation failed or configuration is invalid
457///
458/// # Example
459///
460/// ```rust,no_run,ignore
461/// use bindy::reconcilers::reconcile_dnszone;
462/// use bindy::crd::DNSZone;
463/// use bindy::bind9::Bind9Manager;
464/// use bindy::context::Context;
465/// use std::sync::Arc;
466///
467/// async fn handle_zone(ctx: Arc<Context>, zone: DNSZone) -> anyhow::Result<()> {
468///     let manager = Bind9Manager::new();
469///     reconcile_dnszone(ctx, zone, &manager).await?;
470///     Ok(())
471/// }
472/// ```
473///
474/// # Errors
475///
476/// Returns an error if Kubernetes API operations fail or BIND9 zone operations fail.
#[allow(clippy::too_many_lines)]
pub async fn reconcile_dnszone(
    ctx: Arc<crate::context::Context>,
    dnszone: DNSZone,
    zone_manager: &crate::bind9::Bind9Manager,
) -> Result<()> {
    let client = ctx.client.clone();
    let bind9_instances_store = &ctx.stores.bind9_instances;

    let namespace = dnszone.namespace().unwrap_or_default();
    let name = dnszone.name_any();

    info!("Reconciling DNSZone: {}/{}", namespace, name);
    debug!(
        namespace = %namespace,
        name = %name,
        generation = ?dnszone.metadata.generation,
        "Starting DNSZone reconciliation"
    );

    // Save the instance list from the watch event (before re-fetching)
    // This represents the instances that triggered this reconciliation
    let watch_event_instances =
        validation::get_instances_from_zone(&dnszone, bind9_instances_store).ok();

    // CRITICAL: Re-fetch the zone to get the latest status
    // (shadows the watch-event copy, which may carry stale status from the cache)
    let dnszone = refetch_zone(&client, &namespace, &name).await?;

    // Create centralized status updater to batch all status changes
    // (changes are accumulated and applied once, rather than patched piecemeal)
    let mut status_updater = crate::reconcilers::status::DNSZoneStatusUpdater::new(&dnszone);

    // Extract spec
    let spec = &dnszone.spec;

    // Validate that zone has instances assigned (via spec.bind9Instances or status.bind9Instances)
    // This will fail early if zone is not selected by any instance
    let instance_refs = validation::get_instances_from_zone(&dnszone, bind9_instances_store)?;

    info!(
        "DNSZone {}/{} is assigned to {} instance(s): {:?}",
        namespace,
        name,
        instance_refs.len(),
        instance_refs.iter().map(|r| &r.name).collect::<Vec<_>>()
    );

    // CRITICAL: Check for duplicate zones BEFORE any configuration
    // If another zone already claims this zone name, set Ready=False with DuplicateZone reason
    // and stop processing to prevent conflicting DNS configurations
    let zones_store = &ctx.stores.dnszones;
    if let Some(duplicate_info) = validation::check_for_duplicate_zones(&dnszone, zones_store) {
        handle_duplicate_zone(
            &client,
            &namespace,
            &name,
            &duplicate_info,
            &mut status_updater,
        )
        .await?;
        // Duplicate detected: status was set above, so bail out without error.
        return Ok(());
    }

    // Determine if this is the first reconciliation or if spec has changed
    let (first_reconciliation, spec_changed) = detect_spec_changes(&dnszone);

    // Check if the instance list or lastReconciledAt timestamps changed between watch event and re-fetch
    let instances_changed = detect_instance_changes(
        &namespace,
        &name,
        watch_event_instances.as_ref(),
        &instance_refs,
    );

    // Check if any instances need reconciliation (never reconciled or reconciliation failed)
    let unreconciled_instances =
        validation::filter_instances_needing_reconciliation(&instance_refs);
    let has_unreconciled_instances = !unreconciled_instances.is_empty();

    if has_unreconciled_instances {
        info!(
            "Found {} unreconciled instance(s) for zone {}/{}: {:?}",
            unreconciled_instances.len(),
            namespace,
            name,
            unreconciled_instances
                .iter()
                .map(|i| format!("{}/{}", i.namespace, i.name))
                .collect::<Vec<_>>()
        );
    } else {
        debug!(
            "No unreconciled instances for zone {}/{} - all {} instance(s) already configured (lastReconciledAt set)",
            namespace,
            name,
            instance_refs.len()
        );
    }

    // CRITICAL: Cleanup deleted instances BEFORE early return check
    // If we skip reconciliation due to no changes, we still need to remove deleted instances from status
    match cleanup::cleanup_deleted_instances(&client, &dnszone, &mut status_updater).await {
        Ok(deleted_count) if deleted_count > 0 => {
            info!(
                "Cleaned up {} deleted instance(s) from zone {}/{} status",
                deleted_count, namespace, name
            );
        }
        Ok(_) => {
            debug!(
                "No deleted instances found for zone {}/{} status",
                namespace, name
            );
        }
        Err(e) => {
            warn!(
                "Failed to cleanup deleted instances for zone {}/{}: {} (continuing with reconciliation)",
                namespace, name, e
            );
            // Don't fail reconciliation for cleanup errors
        }
    }

    // CRITICAL: We CANNOT skip reconciliation entirely, even if spec and instances haven't changed.
    // Reconciliation may be triggered by ARecord/AAAA/TXT/etc changes via watches, and we MUST
    // run record discovery to tag newly created records with status.zoneRef.
    //
    // However, we CAN skip BIND9 configuration if nothing changed (handled later in the flow).
    // This ensures record discovery ALWAYS runs while still optimizing BIND9 API calls.

    if instances_changed {
        info!(
            "Instances changed for zone {}/{} - reconciling to configure new instances",
            namespace, name
        );
    }

    info!(
        "Reconciling zone {} (first_reconciliation={}, spec_changed={})",
        spec.zone_name, first_reconciliation, spec_changed
    );

    // Cleanup stale records from status.records[] before main reconciliation
    // This ensures status stays in sync with actual Kubernetes resources
    match cleanup::cleanup_stale_records(
        &client,
        &dnszone,
        &mut status_updater,
        bind9_instances_store,
    )
    .await
    {
        Ok(stale_count) if stale_count > 0 => {
            info!(
                "Cleaned up {} stale record(s) from zone {}/{} status",
                stale_count, namespace, name
            );
        }
        Ok(_) => {
            debug!(
                "No stale records found in zone {}/{} status",
                namespace, name
            );
        }
        Err(e) => {
            warn!(
                "Failed to cleanup stale records for zone {}/{}: {} (continuing with reconciliation)",
                namespace, name, e
            );
            // Don't fail reconciliation for cleanup errors
        }
    }

    // BIND9 configuration: Always ensure zones exist on all instances
    // This implements true declarative reconciliation - if a pod restarts without
    // persistent storage, the reconciler will detect the missing zone and recreate it.
    // The add_zones() function is idempotent, so this is safe to call every reconciliation.
    //
    // NOTE: We ALWAYS configure zones, not just when spec changes. This ensures:
    // - Zones are recreated if pods restart without persistent volumes
    // - New instances added to the cluster get zones automatically
    // - Drift detection: if someone manually deletes a zone, it's recreated
    // - True Kubernetes declarative reconciliation: actual state continuously matches desired state
    let (primary_count, secondary_count) = bind9_config::configure_zone_on_instances(
        ctx.clone(),
        &dnszone,
        zone_manager,
        &mut status_updater,
        &instance_refs,
        &unreconciled_instances,
    )
    .await?;

    // Discover DNS records and update status
    let (record_refs, records_count) =
        discovery::discover_and_update_records(&client, &dnszone, &mut status_updater).await?;

    // Check if all discovered records are ready and trigger zone transfers if needed
    if records_count > 0 {
        let all_records_ready =
            discovery::check_all_records_ready(&client, &namespace, &record_refs).await?;

        if all_records_ready {
            info!(
                "All {} record(s) for zone {} are ready, triggering zone transfers to secondaries",
                records_count, spec.zone_name
            );

            // Trigger zone transfers to all secondaries
            // Zone transfers are triggered automatically by BIND9 via NOTIFY messages
            // No manual trigger needed in the new architecture
            info!(
                "Zone {} configured on instances - BIND9 will handle zone transfers via NOTIFY",
                spec.zone_name
            );
        } else {
            info!("Not all records for zone {} are ready yet", spec.zone_name);
        }
    }
    // Calculate expected counts and finalize status
    // (expected counts come from the full instance list, not just the unreconciled subset)
    let (expected_primary_count, expected_secondary_count) =
        status_helpers::calculate_expected_instance_counts(&client, &instance_refs).await?;

    status_helpers::finalize_zone_status(
        &mut status_updater,
        &client,
        &spec.zone_name,
        &namespace,
        &name,
        primary_count,
        secondary_count,
        expected_primary_count,
        expected_secondary_count,
        records_count,
        dnszone.metadata.generation,
    )
    .await?;

    // Trigger record reconciliation: Update all matching records with a "zone-reconciled" annotation
    // This ensures records are re-added to BIND9 after pod restarts or zone recreation
    // Skipped when the zone is degraded, so records aren't pushed at a broken zone.
    if !status_updater.has_degraded_condition() {
        if let Err(e) =
            discovery::trigger_record_reconciliation(&client, &namespace, &spec.zone_name).await
        {
            warn!(
                "Failed to trigger record reconciliation for zone {}: {}",
                spec.zone_name, e
            );
            // Don't fail the entire reconciliation for this - records will eventually reconcile
        }
    }

    Ok(())
}
730
731/// Adds a DNS zone to all primary instances.
732///
733/// # Arguments
734///
735/// * `client` - Kubernetes API client
736/// * `dnszone` - The `DNSZone` resource
737/// * `zone_manager` - BIND9 manager for adding zone
738///
739/// # Returns
740///
741/// * `Ok(usize)` - Number of primary endpoints successfully configured
742/// * `Err(_)` - If zone addition failed
743///
744/// # Errors
745///
746/// Returns an error if BIND9 zone addition fails or if no instances are assigned.
747///
748/// # Panics
749///
750/// Panics if the RNDC key is not loaded by the helper function (should never happen in practice).
751#[allow(clippy::too_many_lines)]
752pub async fn add_dnszone(
753    ctx: Arc<crate::context::Context>,
754    dnszone: DNSZone,
755    zone_manager: &crate::bind9::Bind9Manager,
756    status_updater: &mut crate::reconcilers::status::DNSZoneStatusUpdater,
757    instance_refs: &[crate::crd::InstanceReference],
758) -> Result<usize> {
759    let client = ctx.client.clone();
760    let namespace = dnszone.namespace().unwrap_or_default();
761    let name = dnszone.name_any();
762    let spec = &dnszone.spec;
763
764    info!("Adding DNSZone {}/{}", namespace, name);
765
766    // PHASE 2 OPTIMIZATION: Use the filtered instance list passed by the caller
767    // This ensures we only process instances that need reconciliation (lastReconciledAt == None)
768
769    info!(
770        "DNSZone {}/{} will be added to {} instance(s): {:?}",
771        namespace,
772        name,
773        instance_refs.len(),
774        instance_refs
775            .iter()
776            .map(|i| format!("{}/{}", i.namespace, i.name))
777            .collect::<Vec<_>>()
778    );
779
780    // Filter to only PRIMARY instances
781    let primary_instance_refs = primary::filter_primary_instances(&client, instance_refs).await?;
782
783    if primary_instance_refs.is_empty() {
784        return Err(anyhow!(
785            "DNSZone {}/{} has no PRIMARY instances assigned. Instances: {:?}",
786            namespace,
787            name,
788            instance_refs
789                .iter()
790                .map(|i| format!("{}/{}", i.namespace, i.name))
791                .collect::<Vec<_>>()
792        ));
793    }
794
795    info!(
796        "Found {} PRIMARY instance(s) for DNSZone {}/{}",
797        primary_instance_refs.len(),
798        namespace,
799        name
800    );
801
802    // Find all secondary instances for zone transfer configuration
803    let secondary_instance_refs =
804        secondary::filter_secondary_instances(&client, instance_refs).await?;
805    let secondary_ips =
806        secondary::find_secondary_pod_ips_from_instances(&client, &secondary_instance_refs).await?;
807
808    if secondary_ips.is_empty() {
809        warn!(
810            "No secondary servers found for DNSZone {}/{} - zone transfers will not be configured",
811            namespace, name
812        );
813    } else {
814        info!(
815            "Found {} secondary server(s) for DNSZone {}/{} - zone transfers will be configured: {:?}",
816            secondary_ips.len(),
817            namespace,
818            name,
819            secondary_ips
820        );
821    }
822
823    // Get effective nameservers (supports both new `nameServers` and deprecated `nameServerIps`)
824    let effective_name_servers = get_effective_name_servers(spec);
825
826    // Generate legacy nameserver IPs format for backward compatibility with bindcar API
827    // If user didn't provide either field, auto-generate from instance IPs
828    let name_server_ips = if effective_name_servers.is_none() {
829        info!(
830            "DNSZone {}/{} has no explicit nameServers - auto-generating from {} instance(s)",
831            namespace,
832            name,
833            instance_refs.len()
834        );
835
836        // Build ordered list: primaries first, then secondaries
837        let mut ordered_instances = primary_instance_refs.clone();
838        ordered_instances.extend(secondary_instance_refs.clone());
839
840        match generate_nameserver_ips(&client, &spec.zone_name, &ordered_instances).await {
841            Ok(Some(generated_ips)) => {
842                info!(
843                    "Auto-generated {} nameserver(s) for DNSZone {}/{}: {:?}",
844                    generated_ips.len(),
845                    namespace,
846                    name,
847                    generated_ips
848                );
849                Some(generated_ips)
850            }
851            Ok(None) => {
852                warn!(
853                    "Failed to auto-generate nameserver IPs for DNSZone {}/{} - no IPs available",
854                    namespace, name
855                );
856                None
857            }
858            Err(e) => {
859                warn!(
860                    "Error auto-generating nameserver IPs for DNSZone {}/{}: {}",
861                    namespace, name, e
862                );
863                None
864            }
865        }
866    } else {
867        // Convert effective_name_servers to HashMap<String, String> for bindcar API compatibility
868        // Only include IPv4 addresses (bindcar doesn't support IPv6 glue records in this field)
869        // SAFETY: We know effective_name_servers is Some because we're in the else block
870        let name_server_map: HashMap<String, String> =
871            if let Some(ref ns_list) = effective_name_servers {
872                ns_list
873                    .iter()
874                    .filter_map(|ns| {
875                        ns.ipv4_address
876                            .as_ref()
877                            .map(|ip| (ns.hostname.clone(), ip.clone()))
878                    })
879                    .collect()
880            } else {
881                HashMap::new()
882            };
883
884        info!(
885            "Using explicit nameServers for DNSZone {}/{} ({} with IPv4 glue records)",
886            namespace,
887            name,
888            name_server_map.len()
889        );
890
891        if name_server_map.is_empty() {
892            None
893        } else {
894            Some(name_server_map)
895        }
896    };
897
898    // Extract list of ALL nameserver hostnames (primary from SOA + all from nameServers field)
899    // This is used by bindcar to generate NS records in the zone file
900    let all_nameserver_hostnames: Vec<String> = {
901        let mut hostnames = vec![spec.soa_record.primary_ns.clone()];
902
903        if let Some(ref ns_list) = effective_name_servers {
904            for ns in ns_list {
905                // Avoid duplicates - don't add primary NS again if it's in the list
906                if ns.hostname != spec.soa_record.primary_ns {
907                    hostnames.push(ns.hostname.clone());
908                }
909            }
910        }
911
912        hostnames
913    };
914
915    info!(
916        "Zone {}/{} will be configured with {} nameserver(s): {:?}",
917        namespace,
918        name,
919        all_nameserver_hostnames.len(),
920        all_nameserver_hostnames
921    );
922
923    // Extract DNSSEC policy if configured
924    let dnssec_policy = spec.dnssec_policy.as_deref();
925    if let Some(policy) = dnssec_policy {
926        info!(
927            "DNSSEC policy '{}' will be applied to zone {}/{}",
928            policy, namespace, name
929        );
930    }
931
932    // Process all primary instances concurrently using async streams
933    // Mark each instance as reconciled immediately after first successful endpoint configuration
934    let first_endpoint = Arc::new(Mutex::new(None::<String>));
935    let total_endpoints = Arc::new(Mutex::new(0_usize));
936    let errors = Arc::new(Mutex::new(Vec::<String>::new()));
937    let status_updater_shared = Arc::new(Mutex::new(status_updater));
938
939    // Create a stream of futures for all instances
940    let _instance_results = stream::iter(primary_instance_refs.iter())
941        .then(|instance_ref| {
942            let client = client.clone();
943            let zone_manager = zone_manager.clone();
944            let zone_name = spec.zone_name.clone();
945            let soa_record = spec.soa_record.clone();
946            let all_nameserver_hostnames = all_nameserver_hostnames.clone();
947            let name_server_ips = name_server_ips.clone();
948            let secondary_ips = secondary_ips.clone();
949            let first_endpoint = Arc::clone(&first_endpoint);
950            let total_endpoints = Arc::clone(&total_endpoints);
951            let errors = Arc::clone(&errors);
952            let status_updater_shared = Arc::clone(&status_updater_shared);
953            let instance_ref = instance_ref.clone();
954            let _zone_namespace = namespace.clone();
955            let _zone_name_ref = name.clone();
956
957            async move {
958                info!(
959                    "Processing endpoints for primary instance {}/{}",
960                    instance_ref.namespace, instance_ref.name
961                );
962
963                // Load RNDC key for this specific instance
964                let key_data = match helpers::load_rndc_key(&client, &instance_ref.namespace, &instance_ref.name).await {
965                    Ok(key) => key,
966                    Err(e) => {
967                        let err_msg = format!("instance {}/{}: failed to load RNDC key: {e}", instance_ref.namespace, instance_ref.name);
968                        errors.lock().await.push(err_msg);
969                        return;
970                    }
971                };
972
973                // Get all endpoints for this instance
974                let endpoints = match helpers::get_endpoint(&client, &instance_ref.namespace, &instance_ref.name, "http").await {
975                    Ok(eps) => eps,
976                    Err(e) => {
977                        let err_msg = format!("instance {}/{}: failed to get endpoints: {e}", instance_ref.namespace, instance_ref.name);
978                        errors.lock().await.push(err_msg);
979                        return;
980                    }
981                };
982
983                info!(
984                    "Found {} endpoint(s) for primary instance {}/{}",
985                    endpoints.len(),
986                    instance_ref.namespace,
987                    instance_ref.name
988                );
989
990                // Process endpoints concurrently for this instance
991                let endpoint_results = stream::iter(endpoints.iter())
992                    .then(|endpoint| {
993                        let zone_manager = zone_manager.clone();
994                        let zone_name = zone_name.clone();
995                        let key_data = key_data.clone();
996                        let soa_record = soa_record.clone();
997                        let all_nameserver_hostnames = all_nameserver_hostnames.clone();
998                        let name_server_ips = name_server_ips.clone();
999                        let secondary_ips = secondary_ips.clone();
1000                        let first_endpoint = Arc::clone(&first_endpoint);
1001                        let total_endpoints = Arc::clone(&total_endpoints);
1002                        let errors = Arc::clone(&errors);
1003                        let instance_ref = instance_ref.clone();
1004                        let endpoint = endpoint.clone();
1005
1006                        async move {
1007                            let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1008
1009                            // Save the first endpoint (globally)
1010                            {
1011                                let mut first = first_endpoint.lock().await;
1012                                if first.is_none() {
1013                                    *first = Some(pod_endpoint.clone());
1014                                }
1015                            }
1016
1017                            // Check if zone already exists before attempting creation
1018                            let zone_exists = match zone_manager.zone_exists(&zone_name, &pod_endpoint).await {
1019                                Ok(exists) => exists,
1020                                Err(e) => {
1021                                    error!(
1022                                        "Failed to check if zone {} exists on endpoint {} (instance {}/{}): {}",
1023                                        zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1024                                    );
1025                                    // Treat errors as "zone might not exist" - proceed with add_zones
1026                                    false
1027                                }
1028                            };
1029
1030                            if zone_exists {
1031                                debug!(
1032                                    "Zone {} already exists on endpoint {} (instance {}/{}), skipping creation",
1033                                    zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1034                                );
1035                                *total_endpoints.lock().await += 1;
1036                                // Return false to indicate zone was not newly added
1037                                return Ok(false);
1038                            }
1039
1040                            // Pass secondary IPs for zone transfer configuration
1041                            let secondary_ips_ref = if secondary_ips.is_empty() {
1042                                None
1043                            } else {
1044                                Some(secondary_ips.as_slice())
1045                            };
1046
1047                            match zone_manager
1048                                .add_zones(
1049                                    &zone_name,
1050                                    ZONE_TYPE_PRIMARY,
1051                                    &pod_endpoint,
1052                                    &key_data,
1053                                    Some(&soa_record),
1054                                    Some(&all_nameserver_hostnames),
1055                                    name_server_ips.as_ref(),
1056                                    secondary_ips_ref,
1057                                    None, // primary_ips only for secondary zones
1058                                    dnssec_policy,
1059                                )
1060                                .await
1061                            {
1062                                Ok(was_added) => {
1063                                    if was_added {
1064                                        info!(
1065                                            "Successfully added zone {} to endpoint {} (instance: {}/{})",
1066                                            zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1067                                        );
1068                                    }
1069                                    *total_endpoints.lock().await += 1;
1070                                    // Return was_added so we can check if zone was actually configured
1071                                    Ok(was_added)
1072                                }
1073                                Err(e) => {
1074                                    error!(
1075                                        "Failed to add zone {} to endpoint {} (instance {}/{}): {}",
1076                                        zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1077                                    );
1078                                    errors.lock().await.push(format!(
1079                                        "endpoint {pod_endpoint} (instance {}/{}): {e}",
1080                                        instance_ref.namespace, instance_ref.name
1081                                    ));
1082                                    Err(())
1083                                }
1084                            }
1085                        }
1086                    })
1087                    .collect::<Vec<Result<bool, ()>>>()
1088                    .await;
1089
1090                // Mark this instance as configured ONLY if at least one endpoint actually added the zone
1091                // This prevents updating lastReconciledAt when zone already exists (avoids tight loop)
1092                let zone_was_configured = endpoint_results.iter().any(|r| r.is_ok() && *r.as_ref().unwrap());
1093                if zone_was_configured {
1094                    status_updater_shared
1095                        .lock()
1096                        .await
1097                        .update_instance_status(
1098                            &instance_ref.name,
1099                            &instance_ref.namespace,
1100                            crate::crd::InstanceStatus::Configured,
1101                            Some("Zone successfully configured on primary instance".to_string()),
1102                        );
1103                    info!(
1104                        "Marked primary instance {}/{} as configured for zone {}",
1105                        instance_ref.namespace, instance_ref.name, zone_name
1106                    );
1107
1108                    // PHASE 2 COMPLETION: Update Bind9Instance.status.selectedZones[].lastReconciledAt
1109                    // This signals successful zone configuration and prevents infinite reconciliation loops
1110                    // STUB: No longer needed - function is a no-op
1111                    // update_zone_reconciled_timestamp(
1112                    //     &client,
1113                    //     &instance_ref.name,
1114                    //     &instance_ref.namespace,
1115                    //     &zone_name_ref,
1116                    //     &zone_namespace,
1117                    // );
1118                }
1119            }
1120        })
1121        .collect::<Vec<()>>()
1122        .await;
1123
1124    let first_endpoint = Arc::try_unwrap(first_endpoint)
1125        .expect("Failed to unwrap first_endpoint Arc")
1126        .into_inner();
1127    let total_endpoints = Arc::try_unwrap(total_endpoints)
1128        .expect("Failed to unwrap total_endpoints Arc")
1129        .into_inner();
1130    let errors = Arc::try_unwrap(errors)
1131        .expect("Failed to unwrap errors Arc")
1132        .into_inner();
1133    let _status_updater = Arc::try_unwrap(status_updater_shared)
1134        .map_err(|_| anyhow!("Failed to unwrap status_updater - multiple references remain"))?
1135        .into_inner();
1136
1137    // If ALL operations failed, return an error
1138    if total_endpoints == 0 && !errors.is_empty() {
1139        return Err(anyhow!(
1140            "Failed to add zone {} to all primary instances. Errors: {}",
1141            spec.zone_name,
1142            errors.join("; ")
1143        ));
1144    }
1145
1146    info!(
1147        "Successfully added zone {} to {} endpoint(s) across {} primary instance(s)",
1148        spec.zone_name,
1149        total_endpoints,
1150        primary_instance_refs.len()
1151    );
1152
1153    // Auto-generate NS records and glue records from nameServers field
1154    if let Some(ref name_servers) = effective_name_servers {
1155        if !name_servers.is_empty() {
1156            info!(
1157                "Auto-generating NS records for {} nameserver(s) in zone {}",
1158                name_servers.len(),
1159                spec.zone_name
1160            );
1161
1162            if let Err(e) = auto_generate_ns_records(
1163                &client,
1164                name_servers,
1165                &spec.zone_name,
1166                spec.ttl,
1167                &primary_instance_refs,
1168            )
1169            .await
1170            {
1171                warn!(
1172                    "Failed to auto-generate some NS records for zone {}: {}. \
1173                     Zone is functional but may have incomplete NS records.",
1174                    spec.zone_name, e
1175                );
1176                // Don't fail reconciliation - zone is functional even without all NS records
1177            }
1178        }
1179    }
1180
1181    // Note: We don't need to reload after addzone because:
1182    // 1. rndc addzone immediately adds the zone to BIND9's running config
1183    // 2. The zone file will be created automatically when records are added via dynamic updates
1184    // 3. Reloading would fail if the zone file doesn't exist yet
1185
1186    // Notify secondaries about the new zone via the first endpoint
1187    // This triggers zone transfer (AXFR) from primary to secondaries
1188    if let Some(first_pod_endpoint) = first_endpoint {
1189        info!("Notifying secondaries about new zone {}", spec.zone_name);
1190        if let Err(e) = zone_manager
1191            .notify_zone(&spec.zone_name, &first_pod_endpoint)
1192            .await
1193        {
1194            // Don't fail if NOTIFY fails - the zone was successfully created
1195            // Secondaries will sync via SOA refresh timer
1196            warn!(
1197                "Failed to notify secondaries for zone {}: {}. Secondaries will sync via SOA refresh timer.",
1198                spec.zone_name, e
1199            );
1200        }
1201    } else {
1202        warn!(
1203            "No endpoints found for zone {}, cannot notify secondaries",
1204            spec.zone_name
1205        );
1206    }
1207
1208    Ok(total_endpoints)
1209}
1210
/// Adds a DNS zone to all secondary instances, configured to transfer from the given primaries.
///
/// Creates secondary zones on all secondary instances, configuring them to transfer
/// from the provided primary server IPs. If a zone already exists on a secondary,
/// creation is skipped; an immediate zone transfer is still triggered to keep it current.
1216///
1217/// # Arguments
1218///
/// * `ctx` - Shared reconciler context (provides the Kubernetes API client)
/// * `dnszone` - The `DNSZone` resource
/// * `zone_manager` - BIND9 manager for adding zone
/// * `primary_ips` - List of primary server IPs to configure in the primaries field
/// * `status_updater` - Status updater used to record per-instance configuration state
/// * `instance_refs` - Instance references to process (filtered here to SECONDARY instances)
1223///
1224/// # Returns
1225///
1226/// * `Ok(usize)` - Number of secondary endpoints successfully configured
1227/// * `Err(_)` - If zone addition failed
1228///
1229/// # Errors
1230///
1231/// Returns an error if BIND9 zone addition fails on any secondary instance.
1232///
1233/// # Panics
1234///
1235/// Panics if internal Arc unwrapping fails (should not happen in normal operation).
1236#[allow(clippy::too_many_lines)]
1237pub async fn add_dnszone_to_secondaries(
1238    ctx: Arc<crate::context::Context>,
1239    dnszone: DNSZone,
1240    zone_manager: &crate::bind9::Bind9Manager,
1241    primary_ips: &[String],
1242    status_updater: &mut crate::reconcilers::status::DNSZoneStatusUpdater,
1243    instance_refs: &[crate::crd::InstanceReference],
1244) -> Result<usize> {
1245    let client = ctx.client.clone();
1246    let namespace = dnszone.namespace().unwrap_or_default();
1247    let name = dnszone.name_any();
1248    let spec = &dnszone.spec;
1249
1250    if primary_ips.is_empty() {
1251        warn!(
1252            "No primary IPs provided for secondary zone {}/{} - skipping secondary configuration",
1253            namespace, spec.zone_name
1254        );
1255        return Ok(0);
1256    }
1257
1258    info!(
1259        "Adding DNSZone {}/{} to secondary instances with primaries: {:?}",
1260        namespace, name, primary_ips
1261    );
1262
1263    // PHASE 2 OPTIMIZATION: Use the filtered instance list passed by the caller
1264    // This ensures we only process instances that need reconciliation (lastReconciledAt == None)
1265
1266    // Filter to only SECONDARY instances
1267    let secondary_instance_refs =
1268        secondary::filter_secondary_instances(&client, instance_refs).await?;
1269
1270    if secondary_instance_refs.is_empty() {
1271        info!(
1272            "No secondary instances found for DNSZone {}/{} - skipping secondary zone configuration",
1273            namespace, name
1274        );
1275        return Ok(0);
1276    }
1277
1278    info!(
1279        "Found {} secondary instance(s) for DNSZone {}/{}",
1280        secondary_instance_refs.len(),
1281        namespace,
1282        name
1283    );
1284
1285    // Process all secondary instances concurrently using async streams
1286    // Mark each instance as reconciled immediately after first successful endpoint configuration
1287    let total_endpoints = Arc::new(Mutex::new(0_usize));
1288    let errors = Arc::new(Mutex::new(Vec::<String>::new()));
1289    let status_updater_shared = Arc::new(Mutex::new(status_updater));
1290
1291    // Create a stream of futures for all secondary instances
1292    let _instance_results = stream::iter(secondary_instance_refs.iter())
1293        .then(|instance_ref| {
1294            let client = client.clone();
1295            let zone_manager = zone_manager.clone();
1296            let zone_name = spec.zone_name.clone();
1297            let primary_ips = primary_ips.to_vec();
1298            let total_endpoints = Arc::clone(&total_endpoints);
1299            let errors = Arc::clone(&errors);
1300            let status_updater_shared = Arc::clone(&status_updater_shared);
1301            let instance_ref = instance_ref.clone();
1302            let _zone_namespace = namespace.clone();
1303            let _zone_name_ref = name.clone();
1304
1305            async move {
1306                info!(
1307                    "Processing secondary instance {}/{} for zone {}",
1308                    instance_ref.namespace, instance_ref.name, zone_name
1309                );
1310
1311                // Load RNDC key for this specific instance
1312                // Each instance has its own RNDC secret for security isolation
1313                let key_data = match helpers::load_rndc_key(&client, &instance_ref.namespace, &instance_ref.name).await {
1314                    Ok(key) => key,
1315                    Err(e) => {
1316                        let err_msg = format!("instance {}/{}: failed to load RNDC key: {e}", instance_ref.namespace, instance_ref.name);
1317                        errors.lock().await.push(err_msg);
1318                        return;
1319                    }
1320                };
1321
1322                // Get all endpoints for this secondary instance
1323                let endpoints = match helpers::get_endpoint(&client, &instance_ref.namespace, &instance_ref.name, "http").await {
1324                    Ok(eps) => eps,
1325                    Err(e) => {
1326                        let err_msg = format!("instance {}/{}: failed to get endpoints: {e}", instance_ref.namespace, instance_ref.name);
1327                        errors.lock().await.push(err_msg);
1328                        return;
1329                    }
1330                };
1331
1332                info!(
1333                    "Found {} endpoint(s) for secondary instance {}/{}",
1334                    endpoints.len(),
1335                    instance_ref.namespace,
1336                    instance_ref.name
1337                );
1338
1339                // Process endpoints concurrently for this instance
1340                let endpoint_results = stream::iter(endpoints.iter())
1341                    .then(|endpoint| {
1342                        let zone_manager = zone_manager.clone();
1343                        let zone_name = zone_name.clone();
1344                        let key_data = key_data.clone();
1345                        let primary_ips = primary_ips.clone();
1346                        let total_endpoints = Arc::clone(&total_endpoints);
1347                        let errors = Arc::clone(&errors);
1348                        let instance_ref = instance_ref.clone();
1349                        let endpoint = endpoint.clone();
1350
1351                        async move {
1352                            let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1353
1354                            // Check if zone already exists before attempting creation
1355                            let zone_exists = match zone_manager.zone_exists(&zone_name, &pod_endpoint).await {
1356                                Ok(exists) => exists,
1357                                Err(e) => {
1358                                    error!(
1359                                        "Failed to check if zone {} exists on endpoint {} (instance {}/{}): {}",
1360                                        zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1361                                    );
1362                                    // Treat errors as "zone might not exist" - proceed with add_zones
1363                                    false
1364                                }
1365                            };
1366
1367                            // Variable to track if zone was added
1368                            let was_added = if zone_exists {
1369                                debug!(
1370                                    "Secondary zone {} already exists on endpoint {} (instance {}/{}), skipping creation",
1371                                    zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1372                                );
1373                                *total_endpoints.lock().await += 1;
1374                                false // Zone not newly added
1375                            } else {
1376                                info!(
1377                                    "Adding secondary zone {} to endpoint {} (instance: {}/{}) with primaries: {:?}",
1378                                    zone_name,
1379                                    pod_endpoint,
1380                                    instance_ref.namespace,
1381                                    instance_ref.name,
1382                                    primary_ips
1383                                );
1384
1385                                match zone_manager
1386                                    .add_zones(
1387                                        &zone_name,
1388                                        ZONE_TYPE_SECONDARY,
1389                                        &pod_endpoint,
1390                                        &key_data,
1391                                        None, // No SOA record for secondary zones
1392                                        None, // No name_servers for secondary zones
1393                                        None, // No name_server_ips for secondary zones
1394                                        None, // No secondary_ips for secondary zones
1395                                        Some(&primary_ips),
1396                                        None, // No DNSSEC policy for secondary zones
1397                                    )
1398                                    .await
1399                                {
1400                                    Ok(added) => {
1401                                        if added {
1402                                            info!(
1403                                                "Successfully added secondary zone {} to endpoint {} (instance: {}/{})",
1404                                                zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1405                                            );
1406                                        } else {
1407                                            info!(
1408                                                "Secondary zone {} already exists on endpoint {} (instance: {}/{})",
1409                                                zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1410                                            );
1411                                        }
1412                                        *total_endpoints.lock().await += 1;
1413                                        added
1414                                    }
1415                                    Err(e) => {
1416                                        error!(
1417                                            "Failed to add secondary zone {} to endpoint {} (instance {}/{}): {}",
1418                                            zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1419                                        );
1420                                        errors.lock().await.push(format!(
1421                                            "endpoint {pod_endpoint} (instance {}/{}): {e}",
1422                                            instance_ref.namespace, instance_ref.name
1423                                        ));
1424                                        return Err(());
1425                                    }
1426                                }
1427                            };
1428
1429                            // CRITICAL: Immediately trigger zone transfer to load the zone data
1430                            // This is necessary because:
1431                            // 1. `rndc addzone` only adds the zone to BIND9's config (in-memory)
1432                            // 2. The zone file doesn't exist yet on the secondary
1433                            // 3. Queries will return SERVFAIL until data is transferred from primary
1434                            // 4. `rndc retransfer` forces an immediate AXFR from primary to secondary
1435                            //
1436                            // This ensures the zone is LOADED and SERVING queries immediately after
1437                            // secondary pod restart or zone creation.
1438                            // NOTE: We trigger transfer even if zone already existed to ensure it's up to date
1439                            info!(
1440                                "Triggering immediate zone transfer for {} on secondary {} to load zone data",
1441                                zone_name, pod_endpoint
1442                            );
1443                            if let Err(e) = zone_manager
1444                                .retransfer_zone(&zone_name, &pod_endpoint)
1445                                .await
1446                            {
1447                                // Don't fail reconciliation if retransfer fails - zone will sync via SOA refresh
1448                                warn!(
1449                                    "Failed to trigger immediate zone transfer for {} on {}: {}. Zone will sync via SOA refresh timer.",
1450                                    zone_name, pod_endpoint, e
1451                                );
1452                            } else {
1453                                info!(
1454                                    "Successfully triggered zone transfer for {} on {}",
1455                                    zone_name, pod_endpoint
1456                                );
1457                            }
1458
1459                            // Return was_added so we can check if zone was actually configured
1460                            Ok(was_added)
1461                        }
1462                    })
1463                    .collect::<Vec<Result<bool, ()>>>()
1464                    .await;
1465
1466                // Mark this instance as configured ONLY if at least one endpoint actually added the zone
1467                // This prevents updating lastReconciledAt when zone already exists (avoids tight loop)
1468                let zone_was_configured = endpoint_results.iter().any(|r| r.is_ok() && *r.as_ref().unwrap());
1469                if zone_was_configured {
1470                    status_updater_shared
1471                        .lock()
1472                        .await
1473                        .update_instance_status(
1474                            &instance_ref.name,
1475                            &instance_ref.namespace,
1476                            crate::crd::InstanceStatus::Configured,
1477                            Some("Zone successfully configured on secondary instance".to_string()),
1478                        );
1479                    info!(
1480                        "Marked secondary instance {}/{} as configured for zone {}",
1481                        instance_ref.namespace, instance_ref.name, zone_name
1482                    );
1483
1484                    // PHASE 2 COMPLETION: Update Bind9Instance.status.selectedZones[].lastReconciledAt
1485                    // This signals successful zone configuration and prevents infinite reconciliation loops
1486                    // STUB: No longer needed - function is a no-op
1487                    // update_zone_reconciled_timestamp(
1488                    //     &client,
1489                    //     &instance_ref.name,
1490                    //     &instance_ref.namespace,
1491                    //     &zone_name_ref,
1492                    //     &zone_namespace,
1493                    // );
1494                }
1495            }
1496        })
1497        .collect::<Vec<()>>()
1498        .await;
1499
1500    let total_endpoints = Arc::try_unwrap(total_endpoints).unwrap().into_inner();
1501    let errors = Arc::try_unwrap(errors).unwrap().into_inner();
1502
1503    // If ALL operations failed, return an error
1504    if total_endpoints == 0 && !errors.is_empty() {
1505        return Err(anyhow!(
1506            "Failed to add zone {} to all secondary instances. Errors: {}",
1507            spec.zone_name,
1508            errors.join("; ")
1509        ));
1510    }
1511
1512    info!(
1513        "Successfully configured secondary zone {} on {} endpoint(s) across {} secondary instance(s)",
1514        spec.zone_name,
1515        total_endpoints,
1516        secondary_instance_refs.len()
1517    );
1518
1519    Ok(total_endpoints)
1520}
1521
/// Deletes a DNS zone from every primary and secondary BIND9 endpoint it is
/// assigned to.
///
/// Per-endpoint deletion is best-effort: a failure on one endpoint (zone not
/// found, endpoint unreachable, etc.) is logged and skipped so the `DNSZone`
/// resource can always be removed, even when BIND9 instances are unavailable
/// or the zone was already deleted. An orphaned zone with no assigned
/// instances is also allowed to delete immediately.
///
/// # Arguments
///
/// * `ctx` - Reconciler context providing the Kubernetes client and resource stores
/// * `dnszone` - The `DNSZone` resource to delete
/// * `zone_manager` - BIND9 manager used to issue zone-deletion requests
///
/// # Returns
///
/// * `Ok(())` - If deletion completed (including the best-effort and no-instances paths)
/// * `Err(_)` - If instance filtering or endpoint discovery fails
///
/// # Errors
///
/// Returns an error if filtering primary/secondary instances or resolving
/// their endpoints fails; individual per-endpoint zone-deletion failures are
/// only logged, never propagated.
pub async fn delete_dnszone(
    ctx: Arc<crate::context::Context>,
    dnszone: DNSZone,
    zone_manager: &crate::bind9::Bind9Manager,
) -> Result<()> {
    let client = ctx.client.clone();
    let bind9_instances_store = &ctx.stores.bind9_instances;
    let namespace = dnszone.namespace().unwrap_or_default();
    let name = dnszone.name_any();
    let spec = &dnszone.spec;

    info!("Deleting DNSZone {}/{}", namespace, name);

    // Get instances from new architecture (spec.bind9Instances or status.bind9Instances)
    // If zone has no instances assigned (e.g., orphaned zone), still allow deletion
    let instance_refs = match validation::get_instances_from_zone(&dnszone, bind9_instances_store) {
        Ok(refs) => refs,
        Err(e) => {
            warn!(
                "DNSZone {}/{} has no instances assigned: {}. Allowing deletion anyway.",
                namespace, name, e
            );
            return Ok(());
        }
    };

    // Filter to primary and secondary instances
    let primary_instance_refs = primary::filter_primary_instances(&client, &instance_refs).await?;
    let secondary_instance_refs =
        secondary::filter_secondary_instances(&client, &instance_refs).await?;

    // Delete from all primary instances
    if !primary_instance_refs.is_empty() {
        let (_first_endpoint, total_endpoints) = helpers::for_each_instance_endpoint(
            &client,
            &primary_instance_refs,
            false, // with_rndc_key = false for zone deletion
            "http", // Use HTTP API port for zone deletion via bindcar API
            |pod_endpoint, instance_name, _rndc_key| {
                let zone_name = spec.zone_name.clone();
                let zone_manager = zone_manager.clone();

                async move {
                    info!(
                        "Deleting zone {} from endpoint {} (instance: {})",
                        zone_name, pod_endpoint, instance_name
                    );

                    // Attempt to delete zone - if it fails (zone not found, endpoint unreachable, etc.),
                    // log a warning but don't fail the deletion. This ensures DNSZones can be deleted
                    // even if BIND9 instances are unavailable or the zone was already removed.
                    // Pass freeze_before_delete=true for primary zones to prevent updates during deletion
                    if let Err(e) = zone_manager.delete_zone(&zone_name, &pod_endpoint, true).await {
                        warn!(
                            "Failed to delete zone {} from endpoint {} (instance: {}): {}. Continuing with deletion anyway.",
                            zone_name, pod_endpoint, instance_name, e
                        );
                    } else {
                        debug!(
                            "Successfully deleted zone {} from endpoint {} (instance: {})",
                            zone_name, pod_endpoint, instance_name
                        );
                    }

                    Ok(())
                }
            },
        )
        .await?;

        // NOTE(review): total_endpoints counts endpoints visited, not successful
        // deletions (the closure always returns Ok); the log below reflects that.
        info!(
            "Successfully deleted zone {} from {} primary endpoint(s)",
            spec.zone_name, total_endpoints
        );
    }

    // Delete from all secondary instances
    if !secondary_instance_refs.is_empty() {
        // Unlike the primary path, this counter only tracks successful deletions.
        let mut secondary_endpoints_deleted = 0;

        for instance_ref in &secondary_instance_refs {
            let endpoints =
                helpers::get_endpoint(&client, &instance_ref.namespace, &instance_ref.name, "http")
                    .await?;

            for endpoint in &endpoints {
                let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);

                info!(
                    "Deleting zone {} from secondary endpoint {} (instance: {}/{})",
                    spec.zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
                );

                // Attempt to delete zone - if it fails, log a warning but don't fail the deletion
                // Pass freeze_before_delete=false for secondary zones (they are read-only, no need to freeze)
                if let Err(e) = zone_manager
                    .delete_zone(&spec.zone_name, &pod_endpoint, false)
                    .await
                {
                    warn!(
                        "Failed to delete zone {} from secondary endpoint {} (instance: {}/{}): {}. Continuing with deletion anyway.",
                        spec.zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
                    );
                } else {
                    debug!(
                        "Successfully deleted zone {} from secondary endpoint {} (instance: {}/{})",
                        spec.zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
                    );
                    secondary_endpoints_deleted += 1;
                }
            }
        }

        info!(
            "Successfully deleted zone {} from {} secondary endpoint(s)",
            spec.zone_name, secondary_endpoints_deleted
        );
    }

    // Note: We don't need to reload after delzone because:
    // 1. rndc delzone immediately removes the zone from BIND9's running config
    // 2. BIND9 will clean up the zone file and journal files automatically

    Ok(())
}
1663
/// Auto-generates NS records for all nameservers in the zone.
///
/// This function is called after zone creation to add NS records for secondary nameservers
/// specified in the `nameServers` field. The primary nameserver NS record is already created
/// by bindcar during zone initialization (from SOA).
///
/// NS record additions are best-effort: failures to load an RNDC key, discover
/// endpoints, or add the record on a given endpoint are logged and skipped.
/// Glue record additions (via `add_glue_record`) propagate errors with `?` and
/// fail only when no instance could be updated at all.
///
/// # Arguments
/// * `client` - Kubernetes client for loading RNDC keys and getting endpoints
/// * `effective_name_servers` - List of nameservers from `nameServers` field
/// * `zone_name` - The DNS zone name
/// * `ttl` - TTL for the NS and glue records
/// * `primary_instance_refs` - List of primary instances to update
///
/// # Returns
/// Result indicating success or failure
///
/// # Errors
/// Returns an error only when a glue record (A/AAAA for an in-zone nameserver
/// with an IP configured) cannot be added to any primary instance; plain NS
/// record failures are logged and tolerated (partial success is acceptable).
#[allow(clippy::too_many_lines)]
async fn auto_generate_ns_records(
    client: &kube::Client,
    effective_name_servers: &[crate::crd::NameServer],
    zone_name: &str,
    ttl: Option<i32>,
    primary_instance_refs: &[crate::crd::InstanceReference],
) -> Result<()> {
    // Nothing to do when the zone declares no additional nameservers.
    if effective_name_servers.is_empty() {
        return Ok(());
    }

    info!(
        "Auto-generating {} NS record(s) for zone {}",
        effective_name_servers.len(),
        zone_name
    );

    for nameserver in effective_name_servers {
        // Add NS record at zone apex (@)
        info!(
            "Adding NS record: {} IN NS {}",
            zone_name, nameserver.hostname
        );

        for instance_ref in primary_instance_refs {
            // Load RNDC key for this instance
            let key_data = match helpers::load_rndc_key(
                client,
                &instance_ref.namespace,
                &instance_ref.name,
            )
            .await
            {
                Ok(key) => key,
                Err(e) => {
                    warn!(
                        "Failed to load RNDC key for instance {}/{}: {}. Skipping NS record addition.",
                        instance_ref.namespace, instance_ref.name, e
                    );
                    continue;
                }
            };

            // Get endpoints for this instance
            let endpoints = match helpers::get_endpoint(
                client,
                &instance_ref.namespace,
                &instance_ref.name,
                "dns-tcp",
            )
            .await
            {
                Ok(eps) => eps,
                Err(e) => {
                    warn!(
                        "Failed to get endpoints for instance {}/{}: {}. Skipping NS record addition.",
                        instance_ref.namespace, instance_ref.name, e
                    );
                    continue;
                }
            };

            // Add NS record to all endpoints of this instance
            for endpoint in &endpoints {
                let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);

                if let Err(e) = crate::bind9::records::ns::add_ns_record(
                    zone_name,
                    "@", // Zone apex
                    &nameserver.hostname,
                    ttl,
                    &pod_endpoint,
                    &key_data,
                )
                .await
                {
                    warn!(
                        "Failed to add NS record for {} to endpoint {} (instance {}/{}): {}",
                        nameserver.hostname,
                        pod_endpoint,
                        instance_ref.namespace,
                        instance_ref.name,
                        e
                    );
                    // Continue with other endpoints - partial success is acceptable
                }
            }
        }

        // Add glue records if IPs provided (for in-zone nameservers)
        if let Some(ref ipv4) = nameserver.ipv4_address {
            add_glue_record(
                client,
                zone_name,
                &nameserver.hostname,
                ipv4,
                hickory_client::rr::RecordType::A,
                ttl,
                primary_instance_refs,
            )
            .await?;
        }

        if let Some(ref ipv6) = nameserver.ipv6_address {
            add_glue_record(
                client,
                zone_name,
                &nameserver.hostname,
                ipv6,
                hickory_client::rr::RecordType::AAAA,
                ttl,
                primary_instance_refs,
            )
            .await?;
        }
    }

    info!(
        "Successfully auto-generated NS records and glue records for zone {}",
        zone_name
    );

    Ok(())
}
1807
1808/// Adds a glue record (A or AAAA) for an in-zone nameserver.
1809///
1810/// Glue records provide IP addresses for nameservers within the zone's own domain.
1811/// This is necessary to avoid circular dependencies when resolving the nameserver itself.
1812///
1813/// # Arguments
1814/// * `client` - Kubernetes client for loading RNDC keys and getting endpoints
1815/// * `zone_name` - The DNS zone name
1816/// * `hostname` - Full nameserver hostname (e.g., "ns2.example.com.")
1817/// * `ip_address` - IP address (IPv4 or IPv6)
1818/// * `record_type` - Type of glue record (A or AAAA)
1819/// * `ttl` - TTL for the glue record
1820/// * `primary_instance_refs` - List of primary instances to update
1821///
1822/// # Returns
1823/// Result indicating success or failure
1824///
1825/// # Errors
1826/// Returns error if glue record addition fails on all instances
1827#[allow(clippy::too_many_lines)]
1828async fn add_glue_record(
1829    client: &kube::Client,
1830    zone_name: &str,
1831    hostname: &str,
1832    ip_address: &str,
1833    record_type: hickory_client::rr::RecordType,
1834    ttl: Option<i32>,
1835    primary_instance_refs: &[crate::crd::InstanceReference],
1836) -> Result<()> {
1837    // Extract record name from hostname
1838    // Example: "ns2.example.com." in zone "example.com" → name = "ns2"
1839    let name = hostname
1840        .trim_end_matches('.')
1841        .strip_suffix(&format!(".{}", zone_name.trim_end_matches('.')))
1842        .unwrap_or_else(|| hostname.trim_end_matches('.'));
1843
1844    // Check if this is actually an in-zone nameserver
1845    if name == hostname.trim_end_matches('.') {
1846        // Hostname doesn't end with zone name - this is an out-of-zone nameserver
1847        // No glue record needed
1848        debug!(
1849            "Skipping glue record for out-of-zone nameserver {} (not in zone {})",
1850            hostname, zone_name
1851        );
1852        return Ok(());
1853    }
1854
1855    info!(
1856        "Adding {} glue record: {} IN {} {}",
1857        if record_type == hickory_client::rr::RecordType::A {
1858            "A"
1859        } else {
1860            "AAAA"
1861        },
1862        name,
1863        if record_type == hickory_client::rr::RecordType::A {
1864            "A"
1865        } else {
1866            "AAAA"
1867        },
1868        ip_address
1869    );
1870
1871    let mut success_count = 0;
1872    let mut errors = Vec::new();
1873
1874    for instance_ref in primary_instance_refs {
1875        // Load RNDC key for this instance
1876        let key_data = match helpers::load_rndc_key(
1877            client,
1878            &instance_ref.namespace,
1879            &instance_ref.name,
1880        )
1881        .await
1882        {
1883            Ok(key) => key,
1884            Err(e) => {
1885                warn!(
1886                    "Failed to load RNDC key for instance {}/{}: {}. Skipping glue record addition.",
1887                    instance_ref.namespace, instance_ref.name, e
1888                );
1889                continue;
1890            }
1891        };
1892
1893        // Get endpoints for this instance
1894        let endpoints = match helpers::get_endpoint(
1895            client,
1896            &instance_ref.namespace,
1897            &instance_ref.name,
1898            "dns-tcp",
1899        )
1900        .await
1901        {
1902            Ok(eps) => eps,
1903            Err(e) => {
1904                warn!(
1905                    "Failed to get endpoints for instance {}/{}: {}. Skipping glue record addition.",
1906                    instance_ref.namespace, instance_ref.name, e
1907                );
1908                continue;
1909            }
1910        };
1911
1912        // Add glue record to all endpoints of this instance
1913        for endpoint in &endpoints {
1914            let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1915
1916            let result = match record_type {
1917                hickory_client::rr::RecordType::A => {
1918                    crate::bind9::records::a::add_a_record(
1919                        zone_name,
1920                        name,
1921                        &[ip_address.to_string()],
1922                        ttl,
1923                        &pod_endpoint,
1924                        &key_data,
1925                    )
1926                    .await
1927                }
1928                hickory_client::rr::RecordType::AAAA => {
1929                    crate::bind9::records::a::add_aaaa_record(
1930                        zone_name,
1931                        name,
1932                        &[ip_address.to_string()],
1933                        ttl,
1934                        &pod_endpoint,
1935                        &key_data,
1936                    )
1937                    .await
1938                }
1939                _ => {
1940                    return Err(anyhow::anyhow!(
1941                        "Invalid record type for glue record: {:?}",
1942                        record_type
1943                    ))
1944                }
1945            };
1946
1947            match result {
1948                Ok(()) => {
1949                    success_count += 1;
1950                }
1951                Err(e) => {
1952                    warn!(
1953                        "Failed to add glue record {} to endpoint {} (instance {}/{}): {}",
1954                        name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1955                    );
1956                    errors.push(format!(
1957                        "endpoint {} (instance {}/{}): {}",
1958                        pod_endpoint, instance_ref.namespace, instance_ref.name, e
1959                    ));
1960                }
1961            }
1962        }
1963    }
1964
1965    // Accept partial success - at least one endpoint updated
1966    if success_count > 0 {
1967        Ok(())
1968    } else {
1969        Err(anyhow::anyhow!(
1970            "Failed to add glue record {} to all instances. Errors: {}",
1971            name,
1972            errors.join("; ")
1973        ))
1974    }
1975}
1976
1977#[cfg(test)]
1978#[path = "dnszone_tests.rs"]
1979mod dnszone_tests;