|
2 | 2 | // SPDX-License-Identifier: Apache-2.0 |
3 | 3 |
|
4 | 4 | use crate::RemoteOptions; |
5 | | -use crate::constants::{container_name, volume_name}; |
| 5 | +use crate::constants::{container_name, network_name, volume_name}; |
6 | 6 | use crate::image::{ |
7 | 7 | self, DEFAULT_IMAGE_REPO_BASE, DEFAULT_REGISTRY, DEFAULT_REGISTRY_USERNAME, parse_image_ref, |
8 | 8 | }; |
9 | 9 | use bollard::API_DEFAULT_VERSION; |
10 | 10 | use bollard::Docker; |
11 | 11 | use bollard::errors::Error as BollardError; |
12 | 12 | use bollard::models::{ |
13 | | - ContainerCreateBody, DeviceRequest, HostConfig, PortBinding, VolumeCreateRequest, |
| 13 | + ContainerCreateBody, DeviceRequest, HostConfig, NetworkCreateRequest, NetworkDisconnectRequest, |
| 14 | + PortBinding, VolumeCreateRequest, |
14 | 15 | }; |
15 | 16 | use bollard::query_parameters::{ |
16 | | - CreateContainerOptions, CreateImageOptions, InspectContainerOptions, |
| 17 | + CreateContainerOptions, CreateImageOptions, InspectContainerOptions, InspectNetworkOptions, |
17 | 18 | ListContainersOptionsBuilder, RemoveContainerOptions, RemoveImageOptions, RemoveVolumeOptions, |
18 | 19 | StartContainerOptions, |
19 | 20 | }; |
@@ -185,6 +186,55 @@ pub async fn find_gateway_container(docker: &Docker, port: Option<u16>) -> Resul |
185 | 186 | } |
186 | 187 | } |
187 | 188 |
|
| 189 | +/// Create a fresh Docker bridge network for the gateway. |
| 190 | +/// |
| 191 | +/// Always removes and recreates the network to guarantee a clean state. |
| 192 | +/// Stale Docker networks (e.g., from a previous interrupted destroy or |
| 193 | +/// Docker Desktop restart) can leave broken routing that causes the |
| 194 | +/// container to fail with "no default routes found". |
| 195 | +pub async fn ensure_network(docker: &Docker, net_name: &str) -> Result<()> { |
| 196 | + force_remove_network(docker, net_name).await?; |
| 197 | + |
| 198 | + // Docker may return a 409 conflict if the previous network teardown has |
| 199 | + // not fully completed in the daemon. Retry a few times with back-off, |
| 200 | + // re-attempting the removal before each create. |
| 201 | + let mut last_err = None; |
| 202 | + for attempt in 0u64..5 { |
| 203 | + if attempt > 0 { |
| 204 | + tokio::time::sleep(std::time::Duration::from_millis(500 * attempt)).await; |
| 205 | + // Re-attempt removal in case the previous teardown has now settled. |
| 206 | + force_remove_network(docker, net_name).await?; |
| 207 | + } |
| 208 | + match docker |
| 209 | + .create_network(NetworkCreateRequest { |
| 210 | + name: net_name.to_string(), |
| 211 | + driver: Some("bridge".to_string()), |
| 212 | + attachable: Some(true), |
| 213 | + ..Default::default() |
| 214 | + }) |
| 215 | + .await |
| 216 | + { |
| 217 | + Ok(_) => return Ok(()), |
| 218 | + Err(err) if is_conflict(&err) => { |
| 219 | + tracing::debug!( |
| 220 | + "Network create conflict (attempt {}/5), retrying: {}", |
| 221 | + attempt + 1, |
| 222 | + err, |
| 223 | + ); |
| 224 | + last_err = Some(err); |
| 225 | + } |
| 226 | + Err(err) => { |
| 227 | + return Err(err) |
| 228 | + .into_diagnostic() |
| 229 | + .wrap_err("failed to create Docker network"); |
| 230 | + } |
| 231 | + } |
| 232 | + } |
| 233 | + Err(last_err.expect("at least one retry attempt")) |
| 234 | + .into_diagnostic() |
| 235 | + .wrap_err("failed to create Docker network after retries (network still in use)") |
| 236 | +} |
| 237 | + |
188 | 238 | pub async fn ensure_volume(docker: &Docker, name: &str) -> Result<()> { |
189 | 239 | match docker.inspect_volume(name).await { |
190 | 240 | Ok(_) => return Ok(()), |
@@ -328,6 +378,7 @@ pub async fn ensure_container( |
328 | 378 | privileged: Some(true), |
329 | 379 | port_bindings: Some(port_bindings), |
330 | 380 | binds: Some(vec![format!("{}:/var/lib/rancher/k3s", volume_name(name))]), |
| 381 | + network_mode: Some(network_name(name)), |
331 | 382 | // Add host.docker.internal mapping for DNS resolution |
332 | 383 | // This allows the entrypoint script to configure CoreDNS to use the host gateway |
333 | 384 | extra_hosts: Some(vec!["host.docker.internal:host-gateway".to_string()]), |
@@ -629,6 +680,21 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() |
629 | 680 | .ok() |
630 | 681 | .and_then(|info| info.image); |
631 | 682 |
|
| 683 | + // Explicitly disconnect the container from the per-gateway network before |
| 684 | + // removing it. This ensures Docker tears down the network endpoint |
| 685 | + // synchronously so port bindings are released immediately and the |
| 686 | + // subsequent network cleanup sees zero connected containers. |
| 687 | + let net_name = network_name(name); |
| 688 | + let _ = docker |
| 689 | + .disconnect_network( |
| 690 | + &net_name, |
| 691 | + NetworkDisconnectRequest { |
| 692 | + container: container_name.clone(), |
| 693 | + force: Some(true), |
| 694 | + }, |
| 695 | + ) |
| 696 | + .await; |
| 697 | + |
632 | 698 | let _ = stop_container(docker, &container_name).await; |
633 | 699 |
|
634 | 700 | let remove_container = docker |
@@ -700,9 +766,52 @@ pub async fn destroy_gateway_resources(docker: &Docker, name: &str) -> Result<() |
700 | 766 | return Err(err).into_diagnostic(); |
701 | 767 | } |
702 | 768 |
|
| 769 | + // Force-remove the per-gateway network during a full destroy. First |
| 770 | + // disconnect any stale endpoints that Docker may still report (race |
| 771 | + // between container removal and network bookkeeping), then remove the |
| 772 | + // network itself. |
| 773 | + force_remove_network(docker, &net_name).await?; |
| 774 | + |
703 | 775 | Ok(()) |
704 | 776 | } |
705 | 777 |
|
| 778 | +/// Forcefully remove a Docker network, disconnecting any remaining |
| 779 | +/// containers first. This ensures that stale Docker network endpoints |
| 780 | +/// cannot prevent port bindings from being released. |
| 781 | +async fn force_remove_network(docker: &Docker, net_name: &str) -> Result<()> { |
| 782 | + let network = match docker |
| 783 | + .inspect_network(net_name, None::<InspectNetworkOptions>) |
| 784 | + .await |
| 785 | + { |
| 786 | + Ok(info) => info, |
| 787 | + Err(err) if is_not_found(&err) => return Ok(()), |
| 788 | + Err(err) => return Err(err).into_diagnostic(), |
| 789 | + }; |
| 790 | + |
| 791 | + // Disconnect any containers still attached to the network. |
| 792 | + if let Some(containers) = network.containers { |
| 793 | + for (id, _) in containers { |
| 794 | + let _ = docker |
| 795 | + .disconnect_network( |
| 796 | + net_name, |
| 797 | + NetworkDisconnectRequest { |
| 798 | + container: id, |
| 799 | + force: Some(true), |
| 800 | + }, |
| 801 | + ) |
| 802 | + .await; |
| 803 | + } |
| 804 | + } |
| 805 | + |
| 806 | + match docker.remove_network(net_name).await { |
| 807 | + Ok(()) => Ok(()), |
| 808 | + Err(err) if is_not_found(&err) => Ok(()), |
| 809 | + Err(err) => Err(err) |
| 810 | + .into_diagnostic() |
| 811 | + .wrap_err("failed to remove Docker network"), |
| 812 | + } |
| 813 | +} |
| 814 | + |
706 | 815 | fn is_not_found(err: &BollardError) -> bool { |
707 | 816 | matches!( |
708 | 817 | err, |
|
0 commit comments