Skip to content

fix: unreliable webhook behaviour on gatekeeper pod startup and shutdown #3780

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
35 changes: 34 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ import (
"net/http"
_ "net/http/pprof"
"os"
"os/signal"
"path/filepath"
"syscall"
"time"

"github.com/go-logr/zapr"
Expand Down Expand Up @@ -117,6 +119,7 @@ var (
enableK8sCel = flag.Bool("enable-k8s-native-validation", true, "enable the validating admission policy driver")
externaldataProviderResponseCacheTTL = flag.Duration("external-data-provider-response-cache-ttl", 3*time.Minute, "TTL for the external data provider response cache. Specify the duration in 'h', 'm', or 's' for hours, minutes, or seconds respectively. Defaults to 3 minutes if unspecified. Setting the TTL to 0 disables the cache.")
enableReferential = flag.Bool("enable-referential-rules", true, "Enable referential rules. This flag defaults to true. Set this value to false if you want to disallow referential constraints. Because referential constraints read objects other than the object-under-test, they may be subject to race conditions. Users concerned about this may want to disable referential rules")
shutdownDelay = flag.Int("shutdown-delay", 10, "Time in seconds the controller runtime shutdown gets delayed after receiving a pod termination event. Prevents failing webhooks on pod shutdown. default: 10")
)

func init() {
Expand Down Expand Up @@ -309,9 +312,39 @@ func innerMain() int {
}
}

// Always enable downstream checking for the webhooks, if enabled.
if len(webhooks) > 0 {
tlsChecker := webhook.NewTLSChecker(*certDir, *port)
setupLog.Info("setting up TLS readiness probe")
if err := mgr.AddReadyzCheck("tls-check", tlsChecker); err != nil {
setupLog.Error(err, "unable to create tls readiness check")
return 1
}
}

// Setup controllers asynchronously, they will block for certificate generation if needed.
setupErr := make(chan error)
ctx := ctrl.SetupSignalHandler()

// Setup termination with grace period. Required to give K8s Services time to disconnect the Pod endpoint on termination.
// Derived from how the controller-runtime sets up a signal handler with ctrl.SetupSignalHandler()
// controller-runtime upstream issue: https://github.com/kubernetes-sigs/controller-runtime/issues/3113
ctx, cancel := context.WithCancel(context.Background())

c := make(chan os.Signal, 2)
signal.Notify(c, []os.Signal{os.Interrupt, syscall.SIGTERM}...)
go func() {
<-c
setupLog.Info(fmt.Sprintf("Shutting Down, waiting for %vs", shutdownDelay))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
setupLog.Info(fmt.Sprintf("Shutting Down, waiting for %vs", shutdownDelay))
setupLog.Info(fmt.Sprintf("Shutting Down, waiting for %ds", *shutdownDelay))

go func() {
time.Sleep(time.Duration(*shutdownDelay) * time.Second)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly we should add a test to validate this:
e.g.

# Get the name of the Gatekeeper pod
POD_NAME=$(kubectl get pods -n ${GATEKEEPER_NAMESPACE} -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')

# Trigger the termination of the Gatekeeper pod
TERMINATION_START=$(date +%s)
kubectl delete pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE}

# Monitor the termination process to ensure the grace period is respected
echo "Monitoring the termination process..."
kubectl get pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} -w --ignore-not-found &

# Wait for the pod to be fully terminated
kubectl wait --for=delete pod/${POD_NAME} -n ${GATEKEEPER_NAMESPACE} --timeout=120s

TERMINATION_END=$(date +%s)
TERMINATION_DURATION=$((TERMINATION_END - TERMINATION_START))

# Validate that the pod is terminated after the specified grace period
GRACE_PERIOD=10
if [ "${TERMINATION_DURATION}" -ge "${GRACE_PERIOD}" ]; then
    echo "Pod termination respected the grace period of ${GRACE_PERIOD} seconds."
else
    echo "Pod termination did not respect the grace period. Termination duration: ${TERMINATION_DURATION} seconds."
fi

Copy link
Author

@l0wl3vel l0wl3vel May 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test added

setupLog.Info("Shutdown grace period finished")
cancel()
}()
<-c
setupLog.Info("Second signal received, killing now")
os.Exit(1) // second signal. Exit directly.
}()

go func() {
setupErr <- setupControllers(ctx, mgr, tracker, setupFinished)
}()
Expand Down
28 changes: 28 additions & 0 deletions test/bats/test.bats
Original file line number Diff line number Diff line change
Expand Up @@ -693,3 +693,31 @@ __expansion_audit_test() {
assert_failure
wait_for_process ${WAIT_TIME} ${SLEEP_TIME} "kubectl delete --ignore-not-found -f ${BATS_TESTS_DIR}/templates/k8srequiredlabels_template_regov1.yaml"
}

@test "shutdown delay" {
# Get the name of the Gatekeeper pod
POD_NAME=$(kubectl get pods -n ${GATEKEEPER_NAMESPACE} -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')

# Trigger the termination of the Gatekeeper pod
TERMINATION_START=$(date +%s)
run kubectl delete pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE}

# Monitor the termination process to ensure the grace period is respected
echo "Monitoring the termination process..."
run kubectl get pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} -w --ignore-not-found

# Wait for the pod to be fully terminated
run kubectl wait --for=delete pod/${POD_NAME} -n ${GATEKEEPER_NAMESPACE} --timeout=120s

TERMINATION_END=$(date +%s)
TERMINATION_DURATION=$((TERMINATION_END - TERMINATION_START))

# Validate that the pod is terminated after the specified grace period
GRACE_PERIOD=10
if [ "${TERMINATION_DURATION}" -ge "${GRACE_PERIOD}" ]; then
echo "Pod termination respected the grace period of ${GRACE_PERIOD} seconds."
else
echo "Pod termination did not respect the grace period. Termination duration: ${TERMINATION_DURATION} seconds."
assert_failure
fi
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
}
}