Skip to content

Commit a0b0d46

Browse files
committed
fix: retry container creation on transient Docker errors
On Docker Desktop for Windows, containerd's internal gRPC connection can drop after image garbage collection (e.g., following --remove-image). The daemon then returns "grpc: the client connection is closing: context canceled" even though the caller's context is still valid. Add retry logic to ContainerCreate: up to 3 attempts with a linearly increasing backoff (a sleep of 1s, 2s, then 3s after each failed attempt). Retries are skipped for non-transient errors (not found, invalid argument, conflict) and when the caller's context is done. ContainerCreate is treated as idempotent, making it safe to retry; a conflict error is returned immediately rather than retried.
1 parent d30e495 commit a0b0d46

1 file changed

Lines changed: 38 additions & 23 deletions

File tree

pkg/driver/docker.go

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"io"
8+
"time"
89

910
cerrdefs "github.com/containerd/errdefs"
1011
"github.com/docker/docker/api/types/build"
@@ -199,31 +200,45 @@ func (d *dockerRuntime) ContainerCreate(ctx context.Context, opts ContainerDefin
199200
volume += opts.Volumes[i].MountPath
200201
}
201202

202-
resp, err := d.cli.ContainerCreate(
203-
ctx,
204-
&container.Config{
205-
Hostname: opts.Hostname,
206-
Image: opts.Image,
207-
Cmd: opts.Command,
208-
WorkingDir: opts.WorkingDir,
209-
OpenStdin: opts.Streams.Stdin,
210-
AttachStdin: opts.Streams.Stdin,
211-
AttachStdout: opts.Streams.Stdout,
212-
AttachStderr: opts.Streams.Stderr,
213-
Tty: opts.Streams.TTY,
214-
Env: opts.Env,
215-
User: opts.User,
216-
Volumes: volumes,
217-
Entrypoint: opts.Entrypoint,
218-
},
219-
hostCfg,
220-
nil, nil, opts.ContainerName,
221-
)
222-
if err != nil {
223-
return "", err
203+
containerCfg := &container.Config{
204+
Hostname: opts.Hostname,
205+
Image: opts.Image,
206+
Cmd: opts.Command,
207+
WorkingDir: opts.WorkingDir,
208+
OpenStdin: opts.Streams.Stdin,
209+
AttachStdin: opts.Streams.Stdin,
210+
AttachStdout: opts.Streams.Stdout,
211+
AttachStderr: opts.Streams.Stderr,
212+
Tty: opts.Streams.TTY,
213+
Env: opts.Env,
214+
User: opts.User,
215+
Volumes: volumes,
216+
Entrypoint: opts.Entrypoint,
217+
}
218+
219+
// Retry on transient Docker daemon errors (e.g., containerd gRPC connection drops
220+
// after image GC on Windows). ContainerCreate is idempotent - safe to retry.
221+
const maxRetries = 3
222+
var resp container.CreateResponse
223+
var err error
224+
for attempt := range maxRetries {
225+
resp, err = d.cli.ContainerCreate(ctx, containerCfg, hostCfg, nil, nil, opts.ContainerName)
226+
if err == nil {
227+
return resp.ID, nil
228+
}
229+
// Don't retry if caller's context is done.
230+
if ctx.Err() != nil {
231+
return "", err
232+
}
233+
// Don't retry on non-transient errors.
234+
if cerrdefs.IsNotFound(err) || cerrdefs.IsInvalidArgument(err) || cerrdefs.IsConflict(err) {
235+
return "", err
236+
}
237+
launchr.Log().Debug("retrying container create after transient error", "attempt", attempt+1, "error", err)
238+
time.Sleep(time.Duration(attempt+1) * time.Second)
224239
}
225240

226-
return resp.ID, nil
241+
return "", err
227242
}
228243

229244
func (d *dockerRuntime) ContainerStart(ctx context.Context, cid string, runConfig ContainerDefinition) (<-chan int, *ContainerInOut, error) {

0 commit comments

Comments
 (0)