// bindy/reconcilers/dnszone.rs
1// Copyright (c) 2025 Erick Bourgeois, firestoned
2#![allow(clippy::uninlined_format_args)]
3#![allow(clippy::doc_markdown)]
4// SPDX-License-Identifier: MIT
5
6//! DNS zone reconciliation logic.
7//!
8//! This module handles the creation and management of DNS zones on BIND9 servers.
9//! It supports both primary and secondary zone configurations.
10
11// Module imports
12pub mod bind9_config;
13pub mod cleanup;
14pub mod constants;
15pub mod discovery;
16pub mod helpers;
17pub mod primary;
18pub mod secondary;
19pub mod status_helpers;
20pub mod types;
21pub mod validation;
22
23#[cfg(test)]
24#[path = "dnszone/helpers_tests.rs"]
25mod helpers_tests;
26
27// Bind9Instance and InstanceReferenceWithStatus are used by dead_code marked functions (Phase 2 cleanup)
28use self::types::DuplicateZoneInfo;
29#[allow(unused_imports)]
30use crate::crd::{Condition, DNSZone, DNSZoneStatus};
31use anyhow::{anyhow, Result};
32use bindcar::{ZONE_TYPE_PRIMARY, ZONE_TYPE_SECONDARY};
33use futures::stream::{self, StreamExt};
34use k8s_openapi::api::core::v1::{Pod, Service};
35use kube::{api::ListParams, client::Client, Api, ResourceExt};
36use std::collections::HashMap;
37use std::sync::Arc;
38use tokio::sync::Mutex;
39use tracing::{debug, error, info, warn};
40
41/// Creates a map of nameserver hostnames to IP addresses by:
42/// 1. Checking for Service external IPs first (`LoadBalancer` or `NodePort`)
43/// 2. Falling back to pod IPs if no external IPs are available
44///
45/// Nameservers are named: `ns1.{zone_name}.`, `ns2.{zone_name}.`, etc.
46/// Order: Primary instances first, then secondary instances.
47///
48/// # Arguments
49///
50/// * `client` - Kubernetes API client
51/// * `zone_name` - DNS zone name (e.g., "example.com")
52/// * `instance_refs` - All instance references (primaries and secondaries)
53///
54/// # Returns
55///
56/// `HashMap` of nameserver hostnames to IP addresses, or None if no IPs found
57///
58/// # Errors
59///
60/// Returns an error if Kubernetes API calls fail.
61pub async fn generate_nameserver_ips(
62 client: &Client,
63 zone_name: &str,
64 instance_refs: &[crate::crd::InstanceReference],
65) -> Result<Option<HashMap<String, String>>> {
66 if instance_refs.is_empty() {
67 return Ok(None);
68 }
69
70 let mut nameserver_ips = HashMap::new();
71 let mut ns_index = 1;
72
73 // Process primaries first, then secondaries
74 for instance_ref in instance_refs {
75 // Try to get Service external IP first
76 let service_api: Api<Service> = Api::namespaced(client.clone(), &instance_ref.namespace);
77
78 let ip = match service_api.get(&instance_ref.name).await {
79 Ok(service) => {
80 // Check for LoadBalancer external IP
81 if let Some(status) = &service.status {
82 if let Some(load_balancer) = &status.load_balancer {
83 if let Some(ingress_list) = &load_balancer.ingress {
84 if let Some(ingress) = ingress_list.first() {
85 if let Some(lb_ip) = &ingress.ip {
86 debug!(
87 "Using LoadBalancer IP {} for instance {}/{}",
88 lb_ip, instance_ref.namespace, instance_ref.name
89 );
90 Some(lb_ip.clone())
91 } else {
92 None
93 }
94 } else {
95 None
96 }
97 } else {
98 None
99 }
100 } else {
101 None
102 }
103 } else {
104 None
105 }
106 }
107 Err(e) => {
108 debug!(
109 "Failed to get service for instance {}/{}: {}. Will try pod IP.",
110 instance_ref.namespace, instance_ref.name, e
111 );
112 None
113 }
114 };
115
116 // If no service external IP, fallback to pod IP
117 let ip = if ip.is_none() {
118 // Get pod IP
119 let pod_api: Api<Pod> = Api::namespaced(client.clone(), &instance_ref.namespace);
120 let label_selector = format!("app=bind9,instance={}", instance_ref.name);
121 let lp = ListParams::default().labels(&label_selector);
122
123 match pod_api.list(&lp).await {
124 Ok(pods) => {
125 // Find first running pod
126 pods.items
127 .iter()
128 .find(|pod| {
129 let phase = pod
130 .status
131 .as_ref()
132 .and_then(|s| s.phase.as_ref())
133 .map_or("Unknown", std::string::String::as_str);
134 phase == "Running"
135 })
136 .and_then(|pod| {
137 pod.status
138 .as_ref()
139 .and_then(|s| s.pod_ip.as_ref())
140 .map(|ip| {
141 debug!(
142 "Using pod IP {} for instance {}/{}",
143 ip, instance_ref.namespace, instance_ref.name
144 );
145 ip.clone()
146 })
147 })
148 }
149 Err(e) => {
150 warn!(
151 "Failed to list pods for instance {}/{}: {}. Skipping.",
152 instance_ref.namespace, instance_ref.name, e
153 );
154 None
155 }
156 }
157 } else {
158 ip
159 };
160
161 // Add to nameserver map if we found an IP
162 if let Some(ip) = ip {
163 let ns_hostname = format!("ns{ns_index}.{zone_name}.");
164 nameserver_ips.insert(ns_hostname, ip);
165 ns_index += 1;
166 }
167 }
168
169 if nameserver_ips.is_empty() {
170 Ok(None)
171 } else {
172 Ok(Some(nameserver_ips))
173 }
174}
175
176/// Get the effective nameservers list for a DNSZone, handling both new and deprecated fields.
177///
178/// This function provides backward compatibility by:
179/// 1. Preferring the new `name_servers` field if present
180/// 2. Falling back to the deprecated `name_server_ips` field with automatic migration
181/// 3. Logging deprecation warnings when the old field is used
182///
183/// # Arguments
184/// * `spec` - The DNSZone spec containing nameserver configuration
185///
186/// # Returns
187/// `Option<Vec<NameServer>>` - The effective list of nameservers, or `None` if neither field is set
188///
189/// # Examples
190///
191/// ```
192/// # #[allow(deprecated)]
193/// # use bindy::crd::{DNSZoneSpec, NameServer, SOARecord};
194/// # use std::collections::HashMap;
195/// // New field takes precedence
196/// let spec = DNSZoneSpec {
197/// zone_name: "example.com".into(),
198/// soa_record: SOARecord {
199/// primary_ns: "ns1.example.com.".into(),
200/// admin_email: "admin.example.com.".into(),
201/// serial: 1,
202/// refresh: 3600,
203/// retry: 600,
204/// expire: 604800,
205/// negative_ttl: 86400,
206/// },
207/// ttl: None,
208/// cluster_ref: None,
209/// name_servers: Some(vec![NameServer {
210/// hostname: "ns2.example.com.".into(),
211/// ipv4_address: None,
212/// ipv6_address: None,
213/// }]),
214/// name_server_ips: Some(HashMap::from([("ns3.example.com.".into(), "192.0.2.3".into())])),
215/// records_from: None,
216/// bind9_instances_from: None,
217/// dnssec_policy: None,
218/// };
219/// // Returns name_servers (new field), ignoring name_server_ips
220/// ```
221fn get_effective_name_servers(
222 spec: &crate::crd::DNSZoneSpec,
223) -> Option<Vec<crate::crd::NameServer>> {
224 use crate::crd::NameServer;
225
226 // New field takes precedence
227 if let Some(ref new_servers) = spec.name_servers {
228 debug!(
229 "Using new `nameServers` field with {} server(s)",
230 new_servers.len()
231 );
232 return Some(new_servers.clone());
233 }
234
235 // Fallback to deprecated field with migration warning
236 #[allow(deprecated)]
237 if let Some(ref old_ips) = spec.name_server_ips {
238 warn!(
239 "DNSZone uses deprecated `nameServerIps` field. \
240 Migrate to `nameServers` for better functionality and IPv6 support. \
241 See migration guide at docs/src/operations/migration-guide.md"
242 );
243
244 // Convert HashMap<String, String> to Vec<NameServer>
245 // Old format: {"ns2.example.com.": "192.0.2.2"}
246 // New format: vec![NameServer { hostname: "ns2.example.com.", ipv4_address: Some("192.0.2.2"), .. }]
247 let servers: Vec<NameServer> = old_ips
248 .iter()
249 .map(|(hostname, ip)| NameServer {
250 hostname: hostname.clone(),
251 ipv4_address: Some(ip.clone()),
252 ipv6_address: None, // Old field doesn't support IPv6
253 })
254 .collect();
255
256 debug!(
257 "Migrated {} server(s) from deprecated `nameServerIps` to new format",
258 servers.len()
259 );
260
261 return Some(servers);
262 }
263
264 // Neither field set
265 None
266}
267
268/// Re-fetch a DNSZone to get the latest status.
269///
270/// The `dnszone` parameter from the watch event might have stale status from the cache.
271/// We need the latest `status.bind9Instances` which may have been updated by the
272/// Bind9Instance reconciler.
273///
274/// # Arguments
275/// * `client` - Kubernetes client
276/// * `namespace` - Namespace of the DNSZone
277/// * `name` - Name of the DNSZone
278///
279/// # Returns
280/// The freshly fetched DNSZone with current status
281///
282/// # Errors
283/// Returns an error if the Kubernetes API call fails
284async fn refetch_zone(client: &kube::Client, namespace: &str, name: &str) -> Result<DNSZone> {
285 let zones_api: Api<DNSZone> = Api::namespaced(client.clone(), namespace);
286 let zone = zones_api.get(name).await?;
287 Ok(zone)
288}
289
290/// Handle duplicate zone conflicts by setting Ready=False and stopping reconciliation.
291///
292/// When a duplicate zone is detected, this function:
293/// 1. Logs a warning with details about the conflict
294/// 2. Updates the status with Ready=False and DuplicateZone condition
295/// 3. Applies the status to the API server
296///
297/// # Arguments
298/// * `client` - Kubernetes client
299/// * `namespace` - Namespace of the conflicting DNSZone
300/// * `name` - Name of the conflicting DNSZone
301/// * `duplicate_info` - Information about the duplicate zone conflict
302/// * `status_updater` - Status updater to apply the condition
303///
304/// # Errors
305/// Returns an error if the status update fails
306async fn handle_duplicate_zone(
307 client: &kube::Client,
308 namespace: &str,
309 name: &str,
310 duplicate_info: &DuplicateZoneInfo,
311 status_updater: &mut crate::reconcilers::status::DNSZoneStatusUpdater,
312) -> Result<()> {
313 warn!(
314 "Duplicate zone detected: {}/{} cannot claim '{}' because it is already configured by: {:?}",
315 namespace, name, duplicate_info.zone_name, duplicate_info.conflicting_zones
316 );
317
318 // Build list of conflicting zones in namespace/name format
319 let conflicting_zone_refs: Vec<String> = duplicate_info
320 .conflicting_zones
321 .iter()
322 .map(|z| format!("{}/{}", z.namespace, z.name))
323 .collect();
324
325 // Set Ready=False with DuplicateZone reason
326 status_updater.set_duplicate_zone_condition(&duplicate_info.zone_name, &conflicting_zone_refs);
327
328 // Apply status and stop processing
329 status_updater.apply(client).await?;
330
331 Ok(())
332}
333
334/// Detect if the zone spec has changed since last reconciliation.
335///
336/// Compares current generation with observed generation to determine
337/// if this is first reconciliation or if spec changed.
338///
339/// # Arguments
340///
341/// * `zone` - The DNSZone resource
342///
343/// # Returns
344///
345/// Tuple of (first_reconciliation, spec_changed)
346fn detect_spec_changes(zone: &DNSZone) -> (bool, bool) {
347 let current_generation = zone.metadata.generation;
348 let observed_generation = zone.status.as_ref().and_then(|s| s.observed_generation);
349
350 let first_reconciliation = observed_generation.is_none();
351 let spec_changed =
352 crate::reconcilers::should_reconcile(current_generation, observed_generation);
353
354 (first_reconciliation, spec_changed)
355}
356
357/// Detect if the instance list changed between watch event and re-fetch.
358///
359/// This is critical for detecting when:
360/// 1. New instances are added to `status.bind9Instances` (via `bind9InstancesFrom` selectors)
361/// 2. Instance `lastReconciledAt` timestamps are cleared (e.g., instance deleted, needs reconfiguration)
362///
363/// NOTE: `InstanceReference` `PartialEq` ignores `lastReconciledAt`, so we must check timestamps separately!
364///
365/// # Arguments
366///
367/// * `namespace` - Namespace for logging
368/// * `name` - Zone name for logging
369/// * `watch_instances` - Instances from the watch event that triggered reconciliation
370/// * `current_instances` - Instances after re-fetching (current state)
371///
372/// # Returns
373///
374/// `true` if instances changed (list or timestamps), `false` otherwise
375fn detect_instance_changes(
376 namespace: &str,
377 name: &str,
378 watch_instances: Option<&Vec<crate::crd::InstanceReference>>,
379 current_instances: &[crate::crd::InstanceReference],
380) -> bool {
381 let Some(watch_instances) = watch_instances else {
382 // No instances in watch event, first reconciliation or error
383 return true;
384 };
385
386 // Get the instance names from the watch event (what triggered us)
387 let watch_instance_names: std::collections::HashSet<_> =
388 watch_instances.iter().map(|r| &r.name).collect();
389
390 // Get the instance names after re-fetching (current state)
391 let current_instance_names: std::collections::HashSet<_> =
392 current_instances.iter().map(|r| &r.name).collect();
393
394 // Check if instance list changed (added/removed instances)
395 let list_changed = watch_instance_names != current_instance_names;
396
397 if list_changed {
398 info!(
399 "Instance list changed during reconciliation for zone {}/{}: watch_event={:?}, current={:?}",
400 namespace, name, watch_instance_names, current_instance_names
401 );
402 return true;
403 }
404
405 // List is the same, but check if any lastReconciledAt timestamps changed
406 // Use InstanceReference as HashMap key (uses its Hash impl which hashes identity fields)
407 let watch_timestamps: std::collections::HashMap<&crate::crd::InstanceReference, Option<&str>> =
408 watch_instances
409 .iter()
410 .map(|inst| (inst, inst.last_reconciled_at.as_deref()))
411 .collect();
412
413 let current_timestamps: std::collections::HashMap<
414 &crate::crd::InstanceReference,
415 Option<&str>,
416 > = current_instances
417 .iter()
418 .map(|inst| (inst, inst.last_reconciled_at.as_deref()))
419 .collect();
420
421 let timestamps_changed = watch_timestamps.iter().any(|(inst_ref, watch_ts)| {
422 current_timestamps
423 .get(inst_ref)
424 .is_some_and(|current_ts| current_ts != watch_ts)
425 });
426
427 if timestamps_changed {
428 info!(
429 "Instance lastReconciledAt timestamps changed for zone {}/{}",
430 namespace, name
431 );
432 }
433
434 timestamps_changed
435}
436
/// Reconciles a `DNSZone` resource.
///
/// Creates or updates DNS zone files on BIND9 instances that match the zone's
/// instance selector. Supports both primary and secondary zone types.
///
/// # Zone Types
///
/// - **Primary**: Authoritative zone with SOA record and local zone file
/// - **Secondary**: Replica zone that transfers from primary servers
///
/// # Arguments
///
/// * `ctx` - Shared controller context (Kubernetes client plus resource stores)
/// * `dnszone` - The `DNSZone` resource to reconcile (from the watch event; re-fetched internally)
/// * `zone_manager` - BIND9 manager for creating zone files
///
/// # Returns
///
/// * `Ok(())` - If zone was created/updated successfully
/// * `Err(_)` - If zone creation failed or configuration is invalid
///
/// # Example
///
/// ```rust,no_run,ignore
/// use bindy::reconcilers::reconcile_dnszone;
/// use bindy::crd::DNSZone;
/// use bindy::bind9::Bind9Manager;
/// use bindy::context::Context;
/// use std::sync::Arc;
///
/// async fn handle_zone(ctx: Arc<Context>, zone: DNSZone) -> anyhow::Result<()> {
///     let manager = Bind9Manager::new();
///     reconcile_dnszone(ctx, zone, &manager).await?;
///     Ok(())
/// }
/// ```
///
/// # Errors
///
/// Returns an error if Kubernetes API operations fail or BIND9 zone operations fail.
#[allow(clippy::too_many_lines)]
pub async fn reconcile_dnszone(
    ctx: Arc<crate::context::Context>,
    dnszone: DNSZone,
    zone_manager: &crate::bind9::Bind9Manager,
) -> Result<()> {
    let client = ctx.client.clone();
    let bind9_instances_store = &ctx.stores.bind9_instances;

    let namespace = dnszone.namespace().unwrap_or_default();
    let name = dnszone.name_any();

    info!("Reconciling DNSZone: {}/{}", namespace, name);
    debug!(
        namespace = %namespace,
        name = %name,
        generation = ?dnszone.metadata.generation,
        "Starting DNSZone reconciliation"
    );

    // Save the instance list from the watch event (before re-fetching)
    // This represents the instances that triggered this reconciliation
    let watch_event_instances =
        validation::get_instances_from_zone(&dnszone, bind9_instances_store).ok();

    // CRITICAL: Re-fetch the zone to get the latest status
    // (the watch-event copy may be stale relative to the Bind9Instance reconciler's writes)
    let dnszone = refetch_zone(&client, &namespace, &name).await?;

    // Create centralized status updater to batch all status changes
    let mut status_updater = crate::reconcilers::status::DNSZoneStatusUpdater::new(&dnszone);

    // Extract spec
    let spec = &dnszone.spec;

    // Validate that zone has instances assigned (via spec.bind9Instances or status.bind9Instances)
    // This will fail early if zone is not selected by any instance
    let instance_refs = validation::get_instances_from_zone(&dnszone, bind9_instances_store)?;

    info!(
        "DNSZone {}/{} is assigned to {} instance(s): {:?}",
        namespace,
        name,
        instance_refs.len(),
        instance_refs.iter().map(|r| &r.name).collect::<Vec<_>>()
    );

    // CRITICAL: Check for duplicate zones BEFORE any configuration
    // If another zone already claims this zone name, set Ready=False with DuplicateZone reason
    // and stop processing to prevent conflicting DNS configurations
    let zones_store = &ctx.stores.dnszones;
    if let Some(duplicate_info) = validation::check_for_duplicate_zones(&dnszone, zones_store) {
        handle_duplicate_zone(
            &client,
            &namespace,
            &name,
            &duplicate_info,
            &mut status_updater,
        )
        .await?;
        return Ok(());
    }

    // Determine if this is the first reconciliation or if spec has changed
    let (first_reconciliation, spec_changed) = detect_spec_changes(&dnszone);

    // Check if the instance list or lastReconciledAt timestamps changed between watch event and re-fetch
    let instances_changed = detect_instance_changes(
        &namespace,
        &name,
        watch_event_instances.as_ref(),
        &instance_refs,
    );

    // Check if any instances need reconciliation (never reconciled or reconciliation failed)
    let unreconciled_instances =
        validation::filter_instances_needing_reconciliation(&instance_refs);
    let has_unreconciled_instances = !unreconciled_instances.is_empty();

    if has_unreconciled_instances {
        info!(
            "Found {} unreconciled instance(s) for zone {}/{}: {:?}",
            unreconciled_instances.len(),
            namespace,
            name,
            unreconciled_instances
                .iter()
                .map(|i| format!("{}/{}", i.namespace, i.name))
                .collect::<Vec<_>>()
        );
    } else {
        debug!(
            "No unreconciled instances for zone {}/{} - all {} instance(s) already configured (lastReconciledAt set)",
            namespace,
            name,
            instance_refs.len()
        );
    }

    // CRITICAL: Cleanup deleted instances BEFORE early return check
    // If we skip reconciliation due to no changes, we still need to remove deleted instances from status
    match cleanup::cleanup_deleted_instances(&client, &dnszone, &mut status_updater).await {
        Ok(deleted_count) if deleted_count > 0 => {
            info!(
                "Cleaned up {} deleted instance(s) from zone {}/{} status",
                deleted_count, namespace, name
            );
        }
        Ok(_) => {
            debug!(
                "No deleted instances found for zone {}/{} status",
                namespace, name
            );
        }
        Err(e) => {
            warn!(
                "Failed to cleanup deleted instances for zone {}/{}: {} (continuing with reconciliation)",
                namespace, name, e
            );
            // Don't fail reconciliation for cleanup errors
        }
    }

    // CRITICAL: We CANNOT skip reconciliation entirely, even if spec and instances haven't changed.
    // Reconciliation may be triggered by ARecord/AAAA/TXT/etc changes via watches, and we MUST
    // run record discovery to tag newly created records with status.zoneRef.
    //
    // However, we CAN skip BIND9 configuration if nothing changed (handled later in the flow).
    // This ensures record discovery ALWAYS runs while still optimizing BIND9 API calls.

    if instances_changed {
        info!(
            "Instances changed for zone {}/{} - reconciling to configure new instances",
            namespace, name
        );
    }

    info!(
        "Reconciling zone {} (first_reconciliation={}, spec_changed={})",
        spec.zone_name, first_reconciliation, spec_changed
    );

    // Cleanup stale records from status.records[] before main reconciliation
    // This ensures status stays in sync with actual Kubernetes resources
    match cleanup::cleanup_stale_records(
        &client,
        &dnszone,
        &mut status_updater,
        bind9_instances_store,
    )
    .await
    {
        Ok(stale_count) if stale_count > 0 => {
            info!(
                "Cleaned up {} stale record(s) from zone {}/{} status",
                stale_count, namespace, name
            );
        }
        Ok(_) => {
            debug!(
                "No stale records found in zone {}/{} status",
                namespace, name
            );
        }
        Err(e) => {
            warn!(
                "Failed to cleanup stale records for zone {}/{}: {} (continuing with reconciliation)",
                namespace, name, e
            );
            // Don't fail reconciliation for cleanup errors
        }
    }

    // BIND9 configuration: Always ensure zones exist on all instances
    // This implements true declarative reconciliation - if a pod restarts without
    // persistent storage, the reconciler will detect the missing zone and recreate it.
    // The add_zones() function is idempotent, so this is safe to call every reconciliation.
    //
    // NOTE: We ALWAYS configure zones, not just when spec changes. This ensures:
    // - Zones are recreated if pods restart without persistent volumes
    // - New instances added to the cluster get zones automatically
    // - Drift detection: if someone manually deletes a zone, it's recreated
    // - True Kubernetes declarative reconciliation: actual state continuously matches desired state
    let (primary_count, secondary_count) = bind9_config::configure_zone_on_instances(
        ctx.clone(),
        &dnszone,
        zone_manager,
        &mut status_updater,
        &instance_refs,
        &unreconciled_instances,
    )
    .await?;

    // Discover DNS records and update status
    let (record_refs, records_count) =
        discovery::discover_and_update_records(&client, &dnszone, &mut status_updater).await?;

    // Check if all discovered records are ready and trigger zone transfers if needed
    if records_count > 0 {
        let all_records_ready =
            discovery::check_all_records_ready(&client, &namespace, &record_refs).await?;

        if all_records_ready {
            info!(
                "All {} record(s) for zone {} are ready, triggering zone transfers to secondaries",
                records_count, spec.zone_name
            );

            // Trigger zone transfers to all secondaries
            // Zone transfers are triggered automatically by BIND9 via NOTIFY messages
            // No manual trigger needed in the new architecture
            info!(
                "Zone {} configured on instances - BIND9 will handle zone transfers via NOTIFY",
                spec.zone_name
            );
        } else {
            info!("Not all records for zone {} are ready yet", spec.zone_name);
        }
    }
    // Calculate expected counts and finalize status
    let (expected_primary_count, expected_secondary_count) =
        status_helpers::calculate_expected_instance_counts(&client, &instance_refs).await?;

    status_helpers::finalize_zone_status(
        &mut status_updater,
        &client,
        &spec.zone_name,
        &namespace,
        &name,
        primary_count,
        secondary_count,
        expected_primary_count,
        expected_secondary_count,
        records_count,
        dnszone.metadata.generation,
    )
    .await?;

    // Trigger record reconciliation: Update all matching records with a "zone-reconciled" annotation
    // This ensures records are re-added to BIND9 after pod restarts or zone recreation
    if !status_updater.has_degraded_condition() {
        if let Err(e) =
            discovery::trigger_record_reconciliation(&client, &namespace, &spec.zone_name).await
        {
            warn!(
                "Failed to trigger record reconciliation for zone {}: {}",
                spec.zone_name, e
            );
            // Don't fail the entire reconciliation for this - records will eventually reconcile
        }
    }

    Ok(())
}
730
731/// Adds a DNS zone to all primary instances.
732///
733/// # Arguments
734///
735/// * `client` - Kubernetes API client
736/// * `dnszone` - The `DNSZone` resource
737/// * `zone_manager` - BIND9 manager for adding zone
738///
739/// # Returns
740///
741/// * `Ok(usize)` - Number of primary endpoints successfully configured
742/// * `Err(_)` - If zone addition failed
743///
744/// # Errors
745///
746/// Returns an error if BIND9 zone addition fails or if no instances are assigned.
747///
748/// # Panics
749///
750/// Panics if the RNDC key is not loaded by the helper function (should never happen in practice).
751#[allow(clippy::too_many_lines)]
752pub async fn add_dnszone(
753 ctx: Arc<crate::context::Context>,
754 dnszone: DNSZone,
755 zone_manager: &crate::bind9::Bind9Manager,
756 status_updater: &mut crate::reconcilers::status::DNSZoneStatusUpdater,
757 instance_refs: &[crate::crd::InstanceReference],
758) -> Result<usize> {
759 let client = ctx.client.clone();
760 let namespace = dnszone.namespace().unwrap_or_default();
761 let name = dnszone.name_any();
762 let spec = &dnszone.spec;
763
764 info!("Adding DNSZone {}/{}", namespace, name);
765
766 // PHASE 2 OPTIMIZATION: Use the filtered instance list passed by the caller
767 // This ensures we only process instances that need reconciliation (lastReconciledAt == None)
768
769 info!(
770 "DNSZone {}/{} will be added to {} instance(s): {:?}",
771 namespace,
772 name,
773 instance_refs.len(),
774 instance_refs
775 .iter()
776 .map(|i| format!("{}/{}", i.namespace, i.name))
777 .collect::<Vec<_>>()
778 );
779
780 // Filter to only PRIMARY instances
781 let primary_instance_refs = primary::filter_primary_instances(&client, instance_refs).await?;
782
783 if primary_instance_refs.is_empty() {
784 return Err(anyhow!(
785 "DNSZone {}/{} has no PRIMARY instances assigned. Instances: {:?}",
786 namespace,
787 name,
788 instance_refs
789 .iter()
790 .map(|i| format!("{}/{}", i.namespace, i.name))
791 .collect::<Vec<_>>()
792 ));
793 }
794
795 info!(
796 "Found {} PRIMARY instance(s) for DNSZone {}/{}",
797 primary_instance_refs.len(),
798 namespace,
799 name
800 );
801
802 // Find all secondary instances for zone transfer configuration
803 let secondary_instance_refs =
804 secondary::filter_secondary_instances(&client, instance_refs).await?;
805 let secondary_ips =
806 secondary::find_secondary_pod_ips_from_instances(&client, &secondary_instance_refs).await?;
807
808 if secondary_ips.is_empty() {
809 warn!(
810 "No secondary servers found for DNSZone {}/{} - zone transfers will not be configured",
811 namespace, name
812 );
813 } else {
814 info!(
815 "Found {} secondary server(s) for DNSZone {}/{} - zone transfers will be configured: {:?}",
816 secondary_ips.len(),
817 namespace,
818 name,
819 secondary_ips
820 );
821 }
822
823 // Get effective nameservers (supports both new `nameServers` and deprecated `nameServerIps`)
824 let effective_name_servers = get_effective_name_servers(spec);
825
826 // Generate legacy nameserver IPs format for backward compatibility with bindcar API
827 // If user didn't provide either field, auto-generate from instance IPs
828 let name_server_ips = if effective_name_servers.is_none() {
829 info!(
830 "DNSZone {}/{} has no explicit nameServers - auto-generating from {} instance(s)",
831 namespace,
832 name,
833 instance_refs.len()
834 );
835
836 // Build ordered list: primaries first, then secondaries
837 let mut ordered_instances = primary_instance_refs.clone();
838 ordered_instances.extend(secondary_instance_refs.clone());
839
840 match generate_nameserver_ips(&client, &spec.zone_name, &ordered_instances).await {
841 Ok(Some(generated_ips)) => {
842 info!(
843 "Auto-generated {} nameserver(s) for DNSZone {}/{}: {:?}",
844 generated_ips.len(),
845 namespace,
846 name,
847 generated_ips
848 );
849 Some(generated_ips)
850 }
851 Ok(None) => {
852 warn!(
853 "Failed to auto-generate nameserver IPs for DNSZone {}/{} - no IPs available",
854 namespace, name
855 );
856 None
857 }
858 Err(e) => {
859 warn!(
860 "Error auto-generating nameserver IPs for DNSZone {}/{}: {}",
861 namespace, name, e
862 );
863 None
864 }
865 }
866 } else {
867 // Convert effective_name_servers to HashMap<String, String> for bindcar API compatibility
868 // Only include IPv4 addresses (bindcar doesn't support IPv6 glue records in this field)
869 // SAFETY: We know effective_name_servers is Some because we're in the else block
870 let name_server_map: HashMap<String, String> =
871 if let Some(ref ns_list) = effective_name_servers {
872 ns_list
873 .iter()
874 .filter_map(|ns| {
875 ns.ipv4_address
876 .as_ref()
877 .map(|ip| (ns.hostname.clone(), ip.clone()))
878 })
879 .collect()
880 } else {
881 HashMap::new()
882 };
883
884 info!(
885 "Using explicit nameServers for DNSZone {}/{} ({} with IPv4 glue records)",
886 namespace,
887 name,
888 name_server_map.len()
889 );
890
891 if name_server_map.is_empty() {
892 None
893 } else {
894 Some(name_server_map)
895 }
896 };
897
898 // Extract list of ALL nameserver hostnames (primary from SOA + all from nameServers field)
899 // This is used by bindcar to generate NS records in the zone file
900 let all_nameserver_hostnames: Vec<String> = {
901 let mut hostnames = vec![spec.soa_record.primary_ns.clone()];
902
903 if let Some(ref ns_list) = effective_name_servers {
904 for ns in ns_list {
905 // Avoid duplicates - don't add primary NS again if it's in the list
906 if ns.hostname != spec.soa_record.primary_ns {
907 hostnames.push(ns.hostname.clone());
908 }
909 }
910 }
911
912 hostnames
913 };
914
915 info!(
916 "Zone {}/{} will be configured with {} nameserver(s): {:?}",
917 namespace,
918 name,
919 all_nameserver_hostnames.len(),
920 all_nameserver_hostnames
921 );
922
923 // Extract DNSSEC policy if configured
924 let dnssec_policy = spec.dnssec_policy.as_deref();
925 if let Some(policy) = dnssec_policy {
926 info!(
927 "DNSSEC policy '{}' will be applied to zone {}/{}",
928 policy, namespace, name
929 );
930 }
931
932 // Process all primary instances concurrently using async streams
933 // Mark each instance as reconciled immediately after first successful endpoint configuration
934 let first_endpoint = Arc::new(Mutex::new(None::<String>));
935 let total_endpoints = Arc::new(Mutex::new(0_usize));
936 let errors = Arc::new(Mutex::new(Vec::<String>::new()));
937 let status_updater_shared = Arc::new(Mutex::new(status_updater));
938
939 // Create a stream of futures for all instances
940 let _instance_results = stream::iter(primary_instance_refs.iter())
941 .then(|instance_ref| {
942 let client = client.clone();
943 let zone_manager = zone_manager.clone();
944 let zone_name = spec.zone_name.clone();
945 let soa_record = spec.soa_record.clone();
946 let all_nameserver_hostnames = all_nameserver_hostnames.clone();
947 let name_server_ips = name_server_ips.clone();
948 let secondary_ips = secondary_ips.clone();
949 let first_endpoint = Arc::clone(&first_endpoint);
950 let total_endpoints = Arc::clone(&total_endpoints);
951 let errors = Arc::clone(&errors);
952 let status_updater_shared = Arc::clone(&status_updater_shared);
953 let instance_ref = instance_ref.clone();
954 let _zone_namespace = namespace.clone();
955 let _zone_name_ref = name.clone();
956
957 async move {
958 info!(
959 "Processing endpoints for primary instance {}/{}",
960 instance_ref.namespace, instance_ref.name
961 );
962
963 // Load RNDC key for this specific instance
964 let key_data = match helpers::load_rndc_key(&client, &instance_ref.namespace, &instance_ref.name).await {
965 Ok(key) => key,
966 Err(e) => {
967 let err_msg = format!("instance {}/{}: failed to load RNDC key: {e}", instance_ref.namespace, instance_ref.name);
968 errors.lock().await.push(err_msg);
969 return;
970 }
971 };
972
973 // Get all endpoints for this instance
974 let endpoints = match helpers::get_endpoint(&client, &instance_ref.namespace, &instance_ref.name, "http").await {
975 Ok(eps) => eps,
976 Err(e) => {
977 let err_msg = format!("instance {}/{}: failed to get endpoints: {e}", instance_ref.namespace, instance_ref.name);
978 errors.lock().await.push(err_msg);
979 return;
980 }
981 };
982
983 info!(
984 "Found {} endpoint(s) for primary instance {}/{}",
985 endpoints.len(),
986 instance_ref.namespace,
987 instance_ref.name
988 );
989
990 // Process endpoints concurrently for this instance
991 let endpoint_results = stream::iter(endpoints.iter())
992 .then(|endpoint| {
993 let zone_manager = zone_manager.clone();
994 let zone_name = zone_name.clone();
995 let key_data = key_data.clone();
996 let soa_record = soa_record.clone();
997 let all_nameserver_hostnames = all_nameserver_hostnames.clone();
998 let name_server_ips = name_server_ips.clone();
999 let secondary_ips = secondary_ips.clone();
1000 let first_endpoint = Arc::clone(&first_endpoint);
1001 let total_endpoints = Arc::clone(&total_endpoints);
1002 let errors = Arc::clone(&errors);
1003 let instance_ref = instance_ref.clone();
1004 let endpoint = endpoint.clone();
1005
1006 async move {
1007 let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1008
1009 // Save the first endpoint (globally)
1010 {
1011 let mut first = first_endpoint.lock().await;
1012 if first.is_none() {
1013 *first = Some(pod_endpoint.clone());
1014 }
1015 }
1016
1017 // Check if zone already exists before attempting creation
1018 let zone_exists = match zone_manager.zone_exists(&zone_name, &pod_endpoint).await {
1019 Ok(exists) => exists,
1020 Err(e) => {
1021 error!(
1022 "Failed to check if zone {} exists on endpoint {} (instance {}/{}): {}",
1023 zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1024 );
1025 // Treat errors as "zone might not exist" - proceed with add_zones
1026 false
1027 }
1028 };
1029
1030 if zone_exists {
1031 debug!(
1032 "Zone {} already exists on endpoint {} (instance {}/{}), skipping creation",
1033 zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1034 );
1035 *total_endpoints.lock().await += 1;
1036 // Return false to indicate zone was not newly added
1037 return Ok(false);
1038 }
1039
1040 // Pass secondary IPs for zone transfer configuration
1041 let secondary_ips_ref = if secondary_ips.is_empty() {
1042 None
1043 } else {
1044 Some(secondary_ips.as_slice())
1045 };
1046
1047 match zone_manager
1048 .add_zones(
1049 &zone_name,
1050 ZONE_TYPE_PRIMARY,
1051 &pod_endpoint,
1052 &key_data,
1053 Some(&soa_record),
1054 Some(&all_nameserver_hostnames),
1055 name_server_ips.as_ref(),
1056 secondary_ips_ref,
1057 None, // primary_ips only for secondary zones
1058 dnssec_policy,
1059 )
1060 .await
1061 {
1062 Ok(was_added) => {
1063 if was_added {
1064 info!(
1065 "Successfully added zone {} to endpoint {} (instance: {}/{})",
1066 zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1067 );
1068 }
1069 *total_endpoints.lock().await += 1;
1070 // Return was_added so we can check if zone was actually configured
1071 Ok(was_added)
1072 }
1073 Err(e) => {
1074 error!(
1075 "Failed to add zone {} to endpoint {} (instance {}/{}): {}",
1076 zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1077 );
1078 errors.lock().await.push(format!(
1079 "endpoint {pod_endpoint} (instance {}/{}): {e}",
1080 instance_ref.namespace, instance_ref.name
1081 ));
1082 Err(())
1083 }
1084 }
1085 }
1086 })
1087 .collect::<Vec<Result<bool, ()>>>()
1088 .await;
1089
1090 // Mark this instance as configured ONLY if at least one endpoint actually added the zone
1091 // This prevents updating lastReconciledAt when zone already exists (avoids tight loop)
1092 let zone_was_configured = endpoint_results.iter().any(|r| r.is_ok() && *r.as_ref().unwrap());
1093 if zone_was_configured {
1094 status_updater_shared
1095 .lock()
1096 .await
1097 .update_instance_status(
1098 &instance_ref.name,
1099 &instance_ref.namespace,
1100 crate::crd::InstanceStatus::Configured,
1101 Some("Zone successfully configured on primary instance".to_string()),
1102 );
1103 info!(
1104 "Marked primary instance {}/{} as configured for zone {}",
1105 instance_ref.namespace, instance_ref.name, zone_name
1106 );
1107
1108 // PHASE 2 COMPLETION: Update Bind9Instance.status.selectedZones[].lastReconciledAt
1109 // This signals successful zone configuration and prevents infinite reconciliation loops
1110 // STUB: No longer needed - function is a no-op
1111 // update_zone_reconciled_timestamp(
1112 // &client,
1113 // &instance_ref.name,
1114 // &instance_ref.namespace,
1115 // &zone_name_ref,
1116 // &zone_namespace,
1117 // );
1118 }
1119 }
1120 })
1121 .collect::<Vec<()>>()
1122 .await;
1123
1124 let first_endpoint = Arc::try_unwrap(first_endpoint)
1125 .expect("Failed to unwrap first_endpoint Arc")
1126 .into_inner();
1127 let total_endpoints = Arc::try_unwrap(total_endpoints)
1128 .expect("Failed to unwrap total_endpoints Arc")
1129 .into_inner();
1130 let errors = Arc::try_unwrap(errors)
1131 .expect("Failed to unwrap errors Arc")
1132 .into_inner();
1133 let _status_updater = Arc::try_unwrap(status_updater_shared)
1134 .map_err(|_| anyhow!("Failed to unwrap status_updater - multiple references remain"))?
1135 .into_inner();
1136
1137 // If ALL operations failed, return an error
1138 if total_endpoints == 0 && !errors.is_empty() {
1139 return Err(anyhow!(
1140 "Failed to add zone {} to all primary instances. Errors: {}",
1141 spec.zone_name,
1142 errors.join("; ")
1143 ));
1144 }
1145
1146 info!(
1147 "Successfully added zone {} to {} endpoint(s) across {} primary instance(s)",
1148 spec.zone_name,
1149 total_endpoints,
1150 primary_instance_refs.len()
1151 );
1152
1153 // Auto-generate NS records and glue records from nameServers field
1154 if let Some(ref name_servers) = effective_name_servers {
1155 if !name_servers.is_empty() {
1156 info!(
1157 "Auto-generating NS records for {} nameserver(s) in zone {}",
1158 name_servers.len(),
1159 spec.zone_name
1160 );
1161
1162 if let Err(e) = auto_generate_ns_records(
1163 &client,
1164 name_servers,
1165 &spec.zone_name,
1166 spec.ttl,
1167 &primary_instance_refs,
1168 )
1169 .await
1170 {
1171 warn!(
1172 "Failed to auto-generate some NS records for zone {}: {}. \
1173 Zone is functional but may have incomplete NS records.",
1174 spec.zone_name, e
1175 );
1176 // Don't fail reconciliation - zone is functional even without all NS records
1177 }
1178 }
1179 }
1180
1181 // Note: We don't need to reload after addzone because:
1182 // 1. rndc addzone immediately adds the zone to BIND9's running config
1183 // 2. The zone file will be created automatically when records are added via dynamic updates
1184 // 3. Reloading would fail if the zone file doesn't exist yet
1185
1186 // Notify secondaries about the new zone via the first endpoint
1187 // This triggers zone transfer (AXFR) from primary to secondaries
1188 if let Some(first_pod_endpoint) = first_endpoint {
1189 info!("Notifying secondaries about new zone {}", spec.zone_name);
1190 if let Err(e) = zone_manager
1191 .notify_zone(&spec.zone_name, &first_pod_endpoint)
1192 .await
1193 {
1194 // Don't fail if NOTIFY fails - the zone was successfully created
1195 // Secondaries will sync via SOA refresh timer
1196 warn!(
1197 "Failed to notify secondaries for zone {}: {}. Secondaries will sync via SOA refresh timer.",
1198 spec.zone_name, e
1199 );
1200 }
1201 } else {
1202 warn!(
1203 "No endpoints found for zone {}, cannot notify secondaries",
1204 spec.zone_name
1205 );
1206 }
1207
1208 Ok(total_endpoints)
1209}
1210
/// Adds a DNS zone to all secondary instances in the cluster with primaries configured.
///
/// Creates secondary zones on all secondary instances, configuring them to transfer
/// from the provided primary server IPs. After the zone is added (or found to
/// already exist) on an endpoint, an immediate `rndc retransfer` is triggered so
/// the secondary loads zone data right away instead of waiting for the SOA
/// refresh timer.
///
/// # Arguments
///
/// * `ctx` - Reconciler context holding the shared Kubernetes API client
/// * `dnszone` - The `DNSZone` resource
/// * `zone_manager` - BIND9 manager for adding zones
/// * `primary_ips` - List of primary server IPs to configure in the primaries field
/// * `status_updater` - Accumulates per-instance status updates for the DNSZone
/// * `instance_refs` - Candidate instance references (filtered to secondaries here)
///
/// # Returns
///
/// * `Ok(usize)` - Number of secondary endpoints successfully configured
/// * `Err(_)` - If zone addition failed on every endpoint
///
/// # Errors
///
/// Returns an error only when ALL endpoint operations fail; partial success is
/// tolerated and surfaced via log warnings.
///
/// # Panics
///
/// Panics if internal Arc unwrapping fails (should not happen in normal operation,
/// because every stream future has completed before the Arcs are unwrapped).
#[allow(clippy::too_many_lines)]
pub async fn add_dnszone_to_secondaries(
    ctx: Arc<crate::context::Context>,
    dnszone: DNSZone,
    zone_manager: &crate::bind9::Bind9Manager,
    primary_ips: &[String],
    status_updater: &mut crate::reconcilers::status::DNSZoneStatusUpdater,
    instance_refs: &[crate::crd::InstanceReference],
) -> Result<usize> {
    let client = ctx.client.clone();
    let namespace = dnszone.namespace().unwrap_or_default();
    let name = dnszone.name_any();
    let spec = &dnszone.spec;

    // A secondary zone without primaries would never receive any data; bail early.
    if primary_ips.is_empty() {
        warn!(
            "No primary IPs provided for secondary zone {}/{} - skipping secondary configuration",
            namespace, spec.zone_name
        );
        return Ok(0);
    }

    info!(
        "Adding DNSZone {}/{} to secondary instances with primaries: {:?}",
        namespace, name, primary_ips
    );

    // PHASE 2 OPTIMIZATION: Use the filtered instance list passed by the caller
    // This ensures we only process instances that need reconciliation (lastReconciledAt == None)

    // Filter to only SECONDARY instances
    let secondary_instance_refs =
        secondary::filter_secondary_instances(&client, instance_refs).await?;

    if secondary_instance_refs.is_empty() {
        info!(
            "No secondary instances found for DNSZone {}/{} - skipping secondary zone configuration",
            namespace, name
        );
        return Ok(0);
    }

    info!(
        "Found {} secondary instance(s) for DNSZone {}/{}",
        secondary_instance_refs.len(),
        namespace,
        name
    );

    // Process all secondary instances concurrently using async streams
    // Mark each instance as reconciled immediately after first successful endpoint configuration
    // NOTE(review): `StreamExt::then` awaits each future before pulling the next,
    // so instances (and endpoints below) are actually processed sequentially;
    // `buffer_unordered` would be needed for true concurrency — confirm intent.
    // Shared accumulators live behind Arc<Mutex<...>> so the async blocks can own them.
    let total_endpoints = Arc::new(Mutex::new(0_usize));
    let errors = Arc::new(Mutex::new(Vec::<String>::new()));
    let status_updater_shared = Arc::new(Mutex::new(status_updater));

    // Create a stream of futures for all secondary instances
    let _instance_results = stream::iter(secondary_instance_refs.iter())
        .then(|instance_ref| {
            // Clone everything the `async move` block needs so it owns its captures.
            let client = client.clone();
            let zone_manager = zone_manager.clone();
            let zone_name = spec.zone_name.clone();
            let primary_ips = primary_ips.to_vec();
            let total_endpoints = Arc::clone(&total_endpoints);
            let errors = Arc::clone(&errors);
            let status_updater_shared = Arc::clone(&status_updater_shared);
            let instance_ref = instance_ref.clone();
            let _zone_namespace = namespace.clone();
            let _zone_name_ref = name.clone();

            async move {
                info!(
                    "Processing secondary instance {}/{} for zone {}",
                    instance_ref.namespace, instance_ref.name, zone_name
                );

                // Load RNDC key for this specific instance
                // Each instance has its own RNDC secret for security isolation
                let key_data = match helpers::load_rndc_key(&client, &instance_ref.namespace, &instance_ref.name).await {
                    Ok(key) => key,
                    Err(e) => {
                        let err_msg = format!("instance {}/{}: failed to load RNDC key: {e}", instance_ref.namespace, instance_ref.name);
                        errors.lock().await.push(err_msg);
                        return;
                    }
                };

                // Get all endpoints for this secondary instance
                let endpoints = match helpers::get_endpoint(&client, &instance_ref.namespace, &instance_ref.name, "http").await {
                    Ok(eps) => eps,
                    Err(e) => {
                        let err_msg = format!("instance {}/{}: failed to get endpoints: {e}", instance_ref.namespace, instance_ref.name);
                        errors.lock().await.push(err_msg);
                        return;
                    }
                };

                info!(
                    "Found {} endpoint(s) for secondary instance {}/{}",
                    endpoints.len(),
                    instance_ref.namespace,
                    instance_ref.name
                );

                // Process endpoints concurrently for this instance
                let endpoint_results = stream::iter(endpoints.iter())
                    .then(|endpoint| {
                        let zone_manager = zone_manager.clone();
                        let zone_name = zone_name.clone();
                        let key_data = key_data.clone();
                        let primary_ips = primary_ips.clone();
                        let total_endpoints = Arc::clone(&total_endpoints);
                        let errors = Arc::clone(&errors);
                        let instance_ref = instance_ref.clone();
                        let endpoint = endpoint.clone();

                        async move {
                            let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);

                            // Check if zone already exists before attempting creation
                            let zone_exists = match zone_manager.zone_exists(&zone_name, &pod_endpoint).await {
                                Ok(exists) => exists,
                                Err(e) => {
                                    error!(
                                        "Failed to check if zone {} exists on endpoint {} (instance {}/{}): {}",
                                        zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
                                    );
                                    // Treat errors as "zone might not exist" - proceed with add_zones
                                    false
                                }
                            };

                            // Variable to track if zone was added
                            let was_added = if zone_exists {
                                debug!(
                                    "Secondary zone {} already exists on endpoint {} (instance {}/{}), skipping creation",
                                    zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
                                );
                                *total_endpoints.lock().await += 1;
                                false // Zone not newly added
                            } else {
                                info!(
                                    "Adding secondary zone {} to endpoint {} (instance: {}/{}) with primaries: {:?}",
                                    zone_name,
                                    pod_endpoint,
                                    instance_ref.namespace,
                                    instance_ref.name,
                                    primary_ips
                                );

                                match zone_manager
                                    .add_zones(
                                        &zone_name,
                                        ZONE_TYPE_SECONDARY,
                                        &pod_endpoint,
                                        &key_data,
                                        None, // No SOA record for secondary zones
                                        None, // No name_servers for secondary zones
                                        None, // No name_server_ips for secondary zones
                                        None, // No secondary_ips for secondary zones
                                        Some(&primary_ips),
                                        None, // No DNSSEC policy for secondary zones
                                    )
                                    .await
                                {
                                    Ok(added) => {
                                        if added {
                                            info!(
                                                "Successfully added secondary zone {} to endpoint {} (instance: {}/{})",
                                                zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
                                            );
                                        } else {
                                            info!(
                                                "Secondary zone {} already exists on endpoint {} (instance: {}/{})",
                                                zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
                                            );
                                        }
                                        *total_endpoints.lock().await += 1;
                                        added
                                    }
                                    Err(e) => {
                                        error!(
                                            "Failed to add secondary zone {} to endpoint {} (instance {}/{}): {}",
                                            zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
                                        );
                                        errors.lock().await.push(format!(
                                            "endpoint {pod_endpoint} (instance {}/{}): {e}",
                                            instance_ref.namespace, instance_ref.name
                                        ));
                                        return Err(());
                                    }
                                }
                            };

                            // CRITICAL: Immediately trigger zone transfer to load the zone data
                            // This is necessary because:
                            // 1. `rndc addzone` only adds the zone to BIND9's config (in-memory)
                            // 2. The zone file doesn't exist yet on the secondary
                            // 3. Queries will return SERVFAIL until data is transferred from primary
                            // 4. `rndc retransfer` forces an immediate AXFR from primary to secondary
                            //
                            // This ensures the zone is LOADED and SERVING queries immediately after
                            // secondary pod restart or zone creation.
                            // NOTE: We trigger transfer even if zone already existed to ensure it's up to date
                            info!(
                                "Triggering immediate zone transfer for {} on secondary {} to load zone data",
                                zone_name, pod_endpoint
                            );
                            if let Err(e) = zone_manager
                                .retransfer_zone(&zone_name, &pod_endpoint)
                                .await
                            {
                                // Don't fail reconciliation if retransfer fails - zone will sync via SOA refresh
                                warn!(
                                    "Failed to trigger immediate zone transfer for {} on {}: {}. Zone will sync via SOA refresh timer.",
                                    zone_name, pod_endpoint, e
                                );
                            } else {
                                info!(
                                    "Successfully triggered zone transfer for {} on {}",
                                    zone_name, pod_endpoint
                                );
                            }

                            // Return was_added so we can check if zone was actually configured
                            Ok(was_added)
                        }
                    })
                    .collect::<Vec<Result<bool, ()>>>()
                    .await;

                // Mark this instance as configured ONLY if at least one endpoint actually added the zone
                // This prevents updating lastReconciledAt when zone already exists (avoids tight loop)
                let zone_was_configured = endpoint_results.iter().any(|r| r.is_ok() && *r.as_ref().unwrap());
                if zone_was_configured {
                    status_updater_shared
                        .lock()
                        .await
                        .update_instance_status(
                            &instance_ref.name,
                            &instance_ref.namespace,
                            crate::crd::InstanceStatus::Configured,
                            Some("Zone successfully configured on secondary instance".to_string()),
                        );
                    info!(
                        "Marked secondary instance {}/{} as configured for zone {}",
                        instance_ref.namespace, instance_ref.name, zone_name
                    );

                    // PHASE 2 COMPLETION: Update Bind9Instance.status.selectedZones[].lastReconciledAt
                    // This signals successful zone configuration and prevents infinite reconciliation loops
                    // STUB: No longer needed - function is a no-op
                    // update_zone_reconciled_timestamp(
                    //     &client,
                    //     &instance_ref.name,
                    //     &instance_ref.namespace,
                    //     &zone_name_ref,
                    //     &zone_namespace,
                    // );
                }
            }
        })
        .collect::<Vec<()>>()
        .await;

    // All stream futures have completed above, so these Arcs hold the sole references.
    let total_endpoints = Arc::try_unwrap(total_endpoints).unwrap().into_inner();
    let errors = Arc::try_unwrap(errors).unwrap().into_inner();

    // If ALL operations failed, return an error
    if total_endpoints == 0 && !errors.is_empty() {
        return Err(anyhow!(
            "Failed to add zone {} to all secondary instances. Errors: {}",
            spec.zone_name,
            errors.join("; ")
        ));
    }

    info!(
        "Successfully configured secondary zone {} on {} endpoint(s) across {} secondary instance(s)",
        spec.zone_name,
        total_endpoints,
        secondary_instance_refs.len()
    );

    Ok(total_endpoints)
}
1521
1522/// Deletes a DNS zone and its associated zone files.
1523///
1524/// # Arguments
1525///
1526/// * `_client` - Kubernetes API client (unused, for future extensions)
1527/// * `dnszone` - The `DNSZone` resource to delete
1528/// * `zone_manager` - BIND9 manager for removing zone files
1529///
1530/// # Returns
1531///
1532/// * `Ok(())` - If zone was deleted successfully
1533/// * `Err(_)` - If zone deletion failed
1534///
1535/// # Errors
1536///
1537/// Returns an error if BIND9 zone deletion fails.
1538pub async fn delete_dnszone(
1539 ctx: Arc<crate::context::Context>,
1540 dnszone: DNSZone,
1541 zone_manager: &crate::bind9::Bind9Manager,
1542) -> Result<()> {
1543 let client = ctx.client.clone();
1544 let bind9_instances_store = &ctx.stores.bind9_instances;
1545 let namespace = dnszone.namespace().unwrap_or_default();
1546 let name = dnszone.name_any();
1547 let spec = &dnszone.spec;
1548
1549 info!("Deleting DNSZone {}/{}", namespace, name);
1550
1551 // Get instances from new architecture (spec.bind9Instances or status.bind9Instances)
1552 // If zone has no instances assigned (e.g., orphaned zone), still allow deletion
1553 let instance_refs = match validation::get_instances_from_zone(&dnszone, bind9_instances_store) {
1554 Ok(refs) => refs,
1555 Err(e) => {
1556 warn!(
1557 "DNSZone {}/{} has no instances assigned: {}. Allowing deletion anyway.",
1558 namespace, name, e
1559 );
1560 return Ok(());
1561 }
1562 };
1563
1564 // Filter to primary and secondary instances
1565 let primary_instance_refs = primary::filter_primary_instances(&client, &instance_refs).await?;
1566 let secondary_instance_refs =
1567 secondary::filter_secondary_instances(&client, &instance_refs).await?;
1568
1569 // Delete from all primary instances
1570 if !primary_instance_refs.is_empty() {
1571 let (_first_endpoint, total_endpoints) = helpers::for_each_instance_endpoint(
1572 &client,
1573 &primary_instance_refs,
1574 false, // with_rndc_key = false for zone deletion
1575 "http", // Use HTTP API port for zone deletion via bindcar API
1576 |pod_endpoint, instance_name, _rndc_key| {
1577 let zone_name = spec.zone_name.clone();
1578 let zone_manager = zone_manager.clone();
1579
1580 async move {
1581 info!(
1582 "Deleting zone {} from endpoint {} (instance: {})",
1583 zone_name, pod_endpoint, instance_name
1584 );
1585
1586 // Attempt to delete zone - if it fails (zone not found, endpoint unreachable, etc.),
1587 // log a warning but don't fail the deletion. This ensures DNSZones can be deleted
1588 // even if BIND9 instances are unavailable or the zone was already removed.
1589 // Pass freeze_before_delete=true for primary zones to prevent updates during deletion
1590 if let Err(e) = zone_manager.delete_zone(&zone_name, &pod_endpoint, true).await {
1591 warn!(
1592 "Failed to delete zone {} from endpoint {} (instance: {}): {}. Continuing with deletion anyway.",
1593 zone_name, pod_endpoint, instance_name, e
1594 );
1595 } else {
1596 debug!(
1597 "Successfully deleted zone {} from endpoint {} (instance: {})",
1598 zone_name, pod_endpoint, instance_name
1599 );
1600 }
1601
1602 Ok(())
1603 }
1604 },
1605 )
1606 .await?;
1607
1608 info!(
1609 "Successfully deleted zone {} from {} primary endpoint(s)",
1610 spec.zone_name, total_endpoints
1611 );
1612 }
1613
1614 // Delete from all secondary instances
1615 if !secondary_instance_refs.is_empty() {
1616 let mut secondary_endpoints_deleted = 0;
1617
1618 for instance_ref in &secondary_instance_refs {
1619 let endpoints =
1620 helpers::get_endpoint(&client, &instance_ref.namespace, &instance_ref.name, "http")
1621 .await?;
1622
1623 for endpoint in &endpoints {
1624 let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1625
1626 info!(
1627 "Deleting zone {} from secondary endpoint {} (instance: {}/{})",
1628 spec.zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1629 );
1630
1631 // Attempt to delete zone - if it fails, log a warning but don't fail the deletion
1632 // Pass freeze_before_delete=false for secondary zones (they are read-only, no need to freeze)
1633 if let Err(e) = zone_manager
1634 .delete_zone(&spec.zone_name, &pod_endpoint, false)
1635 .await
1636 {
1637 warn!(
1638 "Failed to delete zone {} from secondary endpoint {} (instance: {}/{}): {}. Continuing with deletion anyway.",
1639 spec.zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1640 );
1641 } else {
1642 debug!(
1643 "Successfully deleted zone {} from secondary endpoint {} (instance: {}/{})",
1644 spec.zone_name, pod_endpoint, instance_ref.namespace, instance_ref.name
1645 );
1646 secondary_endpoints_deleted += 1;
1647 }
1648 }
1649 }
1650
1651 info!(
1652 "Successfully deleted zone {} from {} secondary endpoint(s)",
1653 spec.zone_name, secondary_endpoints_deleted
1654 );
1655 }
1656
1657 // Note: We don't need to reload after delzone because:
1658 // 1. rndc delzone immediately removes the zone from BIND9's running config
1659 // 2. BIND9 will clean up the zone file and journal files automatically
1660
1661 Ok(())
1662}
1663
1664/// Auto-generates NS records for all nameservers in the zone.
1665///
1666/// This function is called after zone creation to add NS records for secondary nameservers
1667/// specified in the `nameServers` field. The primary nameserver NS record is already created
1668/// by bindcar during zone initialization (from SOA).
1669///
1670/// # Arguments
1671/// * `client` - Kubernetes client for loading RNDC keys and getting endpoints
1672/// * `effective_name_servers` - List of nameservers from `nameServers` field
1673/// * `zone_name` - The DNS zone name
1674/// * `ttl` - TTL for the NS and glue records
1675/// * `primary_instance_refs` - List of primary instances to update
1676///
1677/// # Returns
1678/// Result indicating success or failure
1679///
1680/// # Errors
1681/// Returns error if NS record or glue record addition fails
1682#[allow(clippy::too_many_lines)]
1683async fn auto_generate_ns_records(
1684 client: &kube::Client,
1685 effective_name_servers: &[crate::crd::NameServer],
1686 zone_name: &str,
1687 ttl: Option<i32>,
1688 primary_instance_refs: &[crate::crd::InstanceReference],
1689) -> Result<()> {
1690 if effective_name_servers.is_empty() {
1691 return Ok(());
1692 }
1693
1694 info!(
1695 "Auto-generating {} NS record(s) for zone {}",
1696 effective_name_servers.len(),
1697 zone_name
1698 );
1699
1700 for nameserver in effective_name_servers {
1701 // Add NS record at zone apex (@)
1702 info!(
1703 "Adding NS record: {} IN NS {}",
1704 zone_name, nameserver.hostname
1705 );
1706
1707 for instance_ref in primary_instance_refs {
1708 // Load RNDC key for this instance
1709 let key_data = match helpers::load_rndc_key(
1710 client,
1711 &instance_ref.namespace,
1712 &instance_ref.name,
1713 )
1714 .await
1715 {
1716 Ok(key) => key,
1717 Err(e) => {
1718 warn!(
1719 "Failed to load RNDC key for instance {}/{}: {}. Skipping NS record addition.",
1720 instance_ref.namespace, instance_ref.name, e
1721 );
1722 continue;
1723 }
1724 };
1725
1726 // Get endpoints for this instance
1727 let endpoints = match helpers::get_endpoint(
1728 client,
1729 &instance_ref.namespace,
1730 &instance_ref.name,
1731 "dns-tcp",
1732 )
1733 .await
1734 {
1735 Ok(eps) => eps,
1736 Err(e) => {
1737 warn!(
1738 "Failed to get endpoints for instance {}/{}: {}. Skipping NS record addition.",
1739 instance_ref.namespace, instance_ref.name, e
1740 );
1741 continue;
1742 }
1743 };
1744
1745 // Add NS record to all endpoints of this instance
1746 for endpoint in &endpoints {
1747 let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1748
1749 if let Err(e) = crate::bind9::records::ns::add_ns_record(
1750 zone_name,
1751 "@", // Zone apex
1752 &nameserver.hostname,
1753 ttl,
1754 &pod_endpoint,
1755 &key_data,
1756 )
1757 .await
1758 {
1759 warn!(
1760 "Failed to add NS record for {} to endpoint {} (instance {}/{}): {}",
1761 nameserver.hostname,
1762 pod_endpoint,
1763 instance_ref.namespace,
1764 instance_ref.name,
1765 e
1766 );
1767 // Continue with other endpoints - partial success is acceptable
1768 }
1769 }
1770 }
1771
1772 // Add glue records if IPs provided (for in-zone nameservers)
1773 if let Some(ref ipv4) = nameserver.ipv4_address {
1774 add_glue_record(
1775 client,
1776 zone_name,
1777 &nameserver.hostname,
1778 ipv4,
1779 hickory_client::rr::RecordType::A,
1780 ttl,
1781 primary_instance_refs,
1782 )
1783 .await?;
1784 }
1785
1786 if let Some(ref ipv6) = nameserver.ipv6_address {
1787 add_glue_record(
1788 client,
1789 zone_name,
1790 &nameserver.hostname,
1791 ipv6,
1792 hickory_client::rr::RecordType::AAAA,
1793 ttl,
1794 primary_instance_refs,
1795 )
1796 .await?;
1797 }
1798 }
1799
1800 info!(
1801 "Successfully auto-generated NS records and glue records for zone {}",
1802 zone_name
1803 );
1804
1805 Ok(())
1806}
1807
1808/// Adds a glue record (A or AAAA) for an in-zone nameserver.
1809///
1810/// Glue records provide IP addresses for nameservers within the zone's own domain.
1811/// This is necessary to avoid circular dependencies when resolving the nameserver itself.
1812///
1813/// # Arguments
1814/// * `client` - Kubernetes client for loading RNDC keys and getting endpoints
1815/// * `zone_name` - The DNS zone name
1816/// * `hostname` - Full nameserver hostname (e.g., "ns2.example.com.")
1817/// * `ip_address` - IP address (IPv4 or IPv6)
1818/// * `record_type` - Type of glue record (A or AAAA)
1819/// * `ttl` - TTL for the glue record
1820/// * `primary_instance_refs` - List of primary instances to update
1821///
1822/// # Returns
1823/// Result indicating success or failure
1824///
1825/// # Errors
1826/// Returns error if glue record addition fails on all instances
1827#[allow(clippy::too_many_lines)]
1828async fn add_glue_record(
1829 client: &kube::Client,
1830 zone_name: &str,
1831 hostname: &str,
1832 ip_address: &str,
1833 record_type: hickory_client::rr::RecordType,
1834 ttl: Option<i32>,
1835 primary_instance_refs: &[crate::crd::InstanceReference],
1836) -> Result<()> {
1837 // Extract record name from hostname
1838 // Example: "ns2.example.com." in zone "example.com" → name = "ns2"
1839 let name = hostname
1840 .trim_end_matches('.')
1841 .strip_suffix(&format!(".{}", zone_name.trim_end_matches('.')))
1842 .unwrap_or_else(|| hostname.trim_end_matches('.'));
1843
1844 // Check if this is actually an in-zone nameserver
1845 if name == hostname.trim_end_matches('.') {
1846 // Hostname doesn't end with zone name - this is an out-of-zone nameserver
1847 // No glue record needed
1848 debug!(
1849 "Skipping glue record for out-of-zone nameserver {} (not in zone {})",
1850 hostname, zone_name
1851 );
1852 return Ok(());
1853 }
1854
1855 info!(
1856 "Adding {} glue record: {} IN {} {}",
1857 if record_type == hickory_client::rr::RecordType::A {
1858 "A"
1859 } else {
1860 "AAAA"
1861 },
1862 name,
1863 if record_type == hickory_client::rr::RecordType::A {
1864 "A"
1865 } else {
1866 "AAAA"
1867 },
1868 ip_address
1869 );
1870
1871 let mut success_count = 0;
1872 let mut errors = Vec::new();
1873
1874 for instance_ref in primary_instance_refs {
1875 // Load RNDC key for this instance
1876 let key_data = match helpers::load_rndc_key(
1877 client,
1878 &instance_ref.namespace,
1879 &instance_ref.name,
1880 )
1881 .await
1882 {
1883 Ok(key) => key,
1884 Err(e) => {
1885 warn!(
1886 "Failed to load RNDC key for instance {}/{}: {}. Skipping glue record addition.",
1887 instance_ref.namespace, instance_ref.name, e
1888 );
1889 continue;
1890 }
1891 };
1892
1893 // Get endpoints for this instance
1894 let endpoints = match helpers::get_endpoint(
1895 client,
1896 &instance_ref.namespace,
1897 &instance_ref.name,
1898 "dns-tcp",
1899 )
1900 .await
1901 {
1902 Ok(eps) => eps,
1903 Err(e) => {
1904 warn!(
1905 "Failed to get endpoints for instance {}/{}: {}. Skipping glue record addition.",
1906 instance_ref.namespace, instance_ref.name, e
1907 );
1908 continue;
1909 }
1910 };
1911
1912 // Add glue record to all endpoints of this instance
1913 for endpoint in &endpoints {
1914 let pod_endpoint = format!("{}:{}", endpoint.ip, endpoint.port);
1915
1916 let result = match record_type {
1917 hickory_client::rr::RecordType::A => {
1918 crate::bind9::records::a::add_a_record(
1919 zone_name,
1920 name,
1921 &[ip_address.to_string()],
1922 ttl,
1923 &pod_endpoint,
1924 &key_data,
1925 )
1926 .await
1927 }
1928 hickory_client::rr::RecordType::AAAA => {
1929 crate::bind9::records::a::add_aaaa_record(
1930 zone_name,
1931 name,
1932 &[ip_address.to_string()],
1933 ttl,
1934 &pod_endpoint,
1935 &key_data,
1936 )
1937 .await
1938 }
1939 _ => {
1940 return Err(anyhow::anyhow!(
1941 "Invalid record type for glue record: {:?}",
1942 record_type
1943 ))
1944 }
1945 };
1946
1947 match result {
1948 Ok(()) => {
1949 success_count += 1;
1950 }
1951 Err(e) => {
1952 warn!(
1953 "Failed to add glue record {} to endpoint {} (instance {}/{}): {}",
1954 name, pod_endpoint, instance_ref.namespace, instance_ref.name, e
1955 );
1956 errors.push(format!(
1957 "endpoint {} (instance {}/{}): {}",
1958 pod_endpoint, instance_ref.namespace, instance_ref.name, e
1959 ));
1960 }
1961 }
1962 }
1963 }
1964
1965 // Accept partial success - at least one endpoint updated
1966 if success_count > 0 {
1967 Ok(())
1968 } else {
1969 Err(anyhow::anyhow!(
1970 "Failed to add glue record {} to all instances. Errors: {}",
1971 name,
1972 errors.join("; ")
1973 ))
1974 }
1975}
1976
1977#[cfg(test)]
1978#[path = "dnszone_tests.rs"]
1979mod dnszone_tests;