@@ -21,6 +21,7 @@ package container
2121import (
2222 "bytes"
2323 "context"
24+ "encoding/csv"
2425 "fmt"
2526 "io"
2627 "os"
@@ -35,6 +36,7 @@ import (
3536 "github.com/docker/docker/pkg/stdcopy"
3637 "github.com/docker/go-connections/nat"
3738 "github.com/pkg/errors"
39+ "k8s.io/utils/pointer"
3840
3941 clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
4042)
@@ -43,6 +45,10 @@ const (
4345 httpProxy = "HTTP_PROXY"
4446 httpsProxy = "HTTPS_PROXY"
4547 noProxy = "NO_PROXY"
48+
49+ btrfsStorage = "btrfs"
50+ zfsStorage = "zfs"
51+ xfsStorage = "xfs"
4652)
4753
4854type dockerRuntime struct {
@@ -372,8 +378,10 @@ func (d *dockerRuntime) RunContainer(ctx context.Context, runConfig *RunContaine
372378 }
373379
374380 restartPolicy := runConfig .RestartPolicy
381+ restartMaximumRetryCount := 0
375382 if restartPolicy == "" {
376- restartPolicy = "unless-stopped"
383+ restartPolicy = "on-failure"
384+ restartMaximumRetryCount = 1
377385 }
378386
379387 hostConfig := dockercontainer.HostConfig {
@@ -383,11 +391,12 @@ func (d *dockerRuntime) RunContainer(ctx context.Context, runConfig *RunContaine
383391 // including some ones docker would otherwise do by default.
384392 // for now this is what we want. in the future we may revisit this.
385393 Privileged : true ,
386- SecurityOpt : []string {"seccomp=unconfined" }, // ignore seccomp
394+ SecurityOpt : []string {"seccomp=unconfined" , "apparmor=unconfined" }, // ignore seccomp
387395 NetworkMode : dockercontainer .NetworkMode (runConfig .Network ),
388396 Tmpfs : runConfig .Tmpfs ,
389397 PortBindings : nat.PortMap {},
390- RestartPolicy : dockercontainer.RestartPolicy {Name : restartPolicy },
398+ RestartPolicy : dockercontainer.RestartPolicy {Name : restartPolicy , MaximumRetryCount : restartMaximumRetryCount },
399+ Init : pointer .Bool (false ),
391400 }
392401 networkConfig := network.NetworkingConfig {}
393402
@@ -398,21 +407,21 @@ func (d *dockerRuntime) RunContainer(ctx context.Context, runConfig *RunContaine
398407 }
399408 }
400409
401- // mount /dev/mapper if docker storage driver if Btrfs or ZFS
402- // https://github.com/kubernetes-sigs/kind/pull/1464
403- needed , err := d .needsDevMapper (ctx )
410+ info , err := d .dockerClient .Info (ctx )
404411 if err != nil {
405412 return errors .Wrapf (err , "unable to get Docker engine info, failed to create container %q" , runConfig .Name )
406413 }
407414
408- if needed {
415+ // mount /dev/mapper if docker storage driver if Btrfs or ZFS
416+ // https://github.com/kubernetes-sigs/kind/pull/1464
417+ if d .needsDevMapper (info ) {
409418 hostConfig .Binds = append (hostConfig .Binds , "/dev/mapper:/dev/mapper:ro" )
410419 }
411420
412421 envVars := environmentVariables (runConfig )
413422
414423 // pass proxy environment variables to be used by node's docker daemon
415- proxyDetails , err := d .getProxyDetails (ctx , runConfig .Network )
424+ proxyDetails , err := d .getProxyDetails (ctx , runConfig .Network , runConfig . Name )
416425 if err != nil {
417426 return errors .Wrapf (err , "error getting subnets for %q" , runConfig .Network )
418427 }
@@ -421,15 +430,30 @@ func (d *dockerRuntime) RunContainer(ctx context.Context, runConfig *RunContaine
421430 }
422431 containerConfig .Env = envVars
423432
433+ // handle Docker on Btrfs or ZFS
434+ // https://github.com/kubernetes-sigs/kind/issues/1416#issuecomment-606514724
435+ if d .mountDevMapper (info ) {
436+ runConfig .Mounts = append (runConfig .Mounts , Mount {
437+ Source : "/dev/mapper" ,
438+ Target : "/dev/mapper" ,
439+ })
440+ }
441+
424442 configureVolumes (runConfig , & containerConfig , & hostConfig )
425443 configurePortMappings (runConfig .PortMappings , & containerConfig , & hostConfig )
426444
427- if d .usernsRemap (ctx ) {
445+ if d .usernsRemap (info ) {
428446 // We need this argument in order to make this command work
429447 // in systems that have userns-remap enabled on the docker daemon
430448 hostConfig .UsernsMode = "host"
431449 }
432450
451+ // enable /dev/fuse explicitly for fuse-overlayfs
452+ // (Rootless Docker does not automatically mount /dev/fuse with --privileged)
453+ if d .mountFuse (info ) {
454+ hostConfig .Devices = append (hostConfig .Devices , dockercontainer.DeviceMapping {PathOnHost : "/dev/fuse" })
455+ }
456+
433457 // Make sure we have the image
434458 if err := d .PullContainerImageIfNotExists (ctx , runConfig .Image ); err != nil {
435459 return errors .Wrapf (err , "error pulling container image %s" , runConfig .Image )
@@ -511,13 +535,8 @@ func (d *dockerRuntime) RunContainer(ctx context.Context, runConfig *RunContaine
511535// needsDevMapper checks whether we need to mount /dev/mapper.
512536// This is required when the docker storage driver is Btrfs or ZFS.
513537// https://github.com/kubernetes-sigs/kind/pull/1464
514- func (d * dockerRuntime ) needsDevMapper (ctx context.Context ) (bool , error ) {
515- info , err := d .dockerClient .Info (ctx )
516- if err != nil {
517- return false , err
518- }
519-
520- return info .Driver == "btrfs" || info .Driver == "zfs" , nil
538+ func (d * dockerRuntime ) needsDevMapper (info types.Info ) bool {
539+ return info .Driver == btrfsStorage || info .Driver == zfsStorage
521540}
522541
523542// ownerAndGroup gets the user configuration for the container (user:group).
@@ -601,7 +620,7 @@ type proxyDetails struct {
601620
602621// getProxyDetails returns a struct with the host environment proxy settings
603622// that should be passed to the nodes.
604- func (d * dockerRuntime ) getProxyDetails (ctx context.Context , network string ) (* proxyDetails , error ) {
623+ func (d * dockerRuntime ) getProxyDetails (ctx context.Context , network string , nodeNames ... string ) (* proxyDetails , error ) {
605624 var val string
606625 details := proxyDetails {Envs : make (map [string ]string )}
607626 proxyEnvs := []string {httpProxy , httpsProxy , noProxy }
@@ -626,21 +645,24 @@ func (d *dockerRuntime) getProxyDetails(ctx context.Context, network string) (*p
626645 if err != nil {
627646 return & details , err
628647 }
629- noProxyList := strings .Join (append (subnets , details .Envs [noProxy ]), "," )
630- details .Envs [noProxy ] = noProxyList
631- details .Envs [strings .ToLower (noProxy )] = noProxyList
648+ noProxyList := append (subnets , details .Envs [noProxy ])
649+ noProxyList = append (noProxyList , nodeNames ... )
650+ // Add pod and service dns names to no_proxy to allow in cluster
651+ // Note: this is best effort based on the default CoreDNS spec
652+ // https://github.com/kubernetes/dns/blob/master/docs/specification.md
653+ // Any user created pod/service hostnames, namespaces, custom DNS services
654+ // are expected to be no-proxied by the user explicitly.
655+ noProxyList = append (noProxyList , ".svc" , ".svc.cluster" , ".svc.cluster.local" )
656+ noProxyJoined := strings .Join (noProxyList , "," )
657+ details .Envs [noProxy ] = noProxyJoined
658+ details .Envs [strings .ToLower (noProxy )] = noProxyJoined
632659 }
633660
634661 return & details , nil
635662}
636663
637664// usernsRemap checks if userns-remap is enabled in dockerd.
638- func (d * dockerRuntime ) usernsRemap (ctx context.Context ) bool {
639- info , err := d .dockerClient .Info (ctx )
640- if err != nil {
641- return false
642- }
643-
665+ func (d * dockerRuntime ) usernsRemap (info types.Info ) bool {
644666 for _ , secOpt := range info .SecurityOptions {
645667 if strings .Contains (secOpt , "name=userns" ) {
646668 return true
@@ -649,6 +671,49 @@ func (d *dockerRuntime) usernsRemap(ctx context.Context) bool {
649671 return false
650672}
651673
674+ // mountDevMapper checks if the Docker storage driver is Btrfs or ZFS
675+ // or if the backing filesystem is Btrfs.
676+ func (d * dockerRuntime ) mountDevMapper (info types.Info ) bool {
677+ storage := ""
678+ storage = strings .ToLower (strings .TrimSpace (info .Driver ))
679+ if storage == btrfsStorage || storage == zfsStorage || storage == "devicemapper" {
680+ return true
681+ }
682+
683+ // check the backing file system
684+ // docker info -f '{{json .DriverStatus }}'
685+ // [["Backing Filesystem","extfs"],["Supports d_type","true"],["Native Overlay Diff","true"]]
686+ for _ , item := range info .DriverStatus {
687+ if item [0 ] == "Backing Filesystem" {
688+ storage = strings .ToLower (item [1 ])
689+ break
690+ }
691+ }
692+
693+ return storage == btrfsStorage || storage == zfsStorage || storage == xfsStorage
694+ }
695+
696+ // rootless: use fuse-overlayfs by default
697+ // https://github.com/kubernetes-sigs/kind/issues/2275
698+ func (d * dockerRuntime ) mountFuse (info types.Info ) bool {
699+ for _ , o := range info .SecurityOptions {
700+ // o is like "name=seccomp,profile=default", or "name=rootless",
701+ csvReader := csv .NewReader (strings .NewReader (o ))
702+ sliceSlice , err := csvReader .ReadAll ()
703+ if err != nil {
704+ return false
705+ }
706+ for _ , f := range sliceSlice {
707+ for _ , ff := range f {
708+ if ff == "name=rootless" {
709+ return true
710+ }
711+ }
712+ }
713+ }
714+ return false
715+ }
716+
652717func isSELinuxEnforcing () bool {
653718 dat , err := os .ReadFile ("/sys/fs/selinux/enforce" )
654719 if err != nil {
0 commit comments