-
Notifications
You must be signed in to change notification settings - Fork 789
fix: unreliable webhook behaviour on gatekeeper pod startup and shutdown #3780
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 10 commits
ebd1d12
2de0c78
5b4169d
e7744b2
5c03825
7da14f3
f98ea1c
5eb7562
3fcc23e
998ce88
82cb349
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -26,7 +26,9 @@ import ( | |||||
"net/http" | ||||||
_ "net/http/pprof" | ||||||
"os" | ||||||
"os/signal" | ||||||
"path/filepath" | ||||||
"syscall" | ||||||
"time" | ||||||
|
||||||
"github.com/go-logr/zapr" | ||||||
|
@@ -117,6 +119,7 @@ var ( | |||||
enableK8sCel = flag.Bool("enable-k8s-native-validation", true, "enable the validating admission policy driver") | ||||||
externaldataProviderResponseCacheTTL = flag.Duration("external-data-provider-response-cache-ttl", 3*time.Minute, "TTL for the external data provider response cache. Specify the duration in 'h', 'm', or 's' for hours, minutes, or seconds respectively. Defaults to 3 minutes if unspecified. Setting the TTL to 0 disables the cache.") | ||||||
enableReferential = flag.Bool("enable-referential-rules", true, "Enable referential rules. This flag defaults to true. Set this value to false if you want to disallow referential constraints. Because referential constraints read objects other than the object-under-test, they may be subject to race conditions. Users concerned about this may want to disable referential rules") | ||||||
shutdownDelay = flag.Int("shutdown-delay", 10, "Time in seconds the controller runtime shutdown gets delayed after receiving a pod termination event. Prevents failing webhooks on pod shutdown. default: 10") | ||||||
) | ||||||
|
||||||
func init() { | ||||||
|
@@ -309,9 +312,39 @@ func innerMain() int { | |||||
} | ||||||
} | ||||||
|
||||||
// Always enable downstream checking for the webhooks, if enabled. | ||||||
if len(webhooks) > 0 { | ||||||
tlsChecker := webhook.NewTLSChecker(*certDir, *port) | ||||||
setupLog.Info("setting up TLS readiness probe") | ||||||
if err := mgr.AddReadyzCheck("tls-check", tlsChecker); err != nil { | ||||||
setupLog.Error(err, "unable to create tls readiness check") | ||||||
return 1 | ||||||
} | ||||||
} | ||||||
|
||||||
// Setup controllers asynchronously, they will block for certificate generation if needed. | ||||||
setupErr := make(chan error) | ||||||
ctx := ctrl.SetupSignalHandler() | ||||||
|
||||||
// Setup termination with grace period. Required to give K8s Services time to disconnect the Pod endpoint on termination. | ||||||
// Derived from how the controller-runtime sets up a signal handler with ctrl.SetupSignalHandler() | ||||||
// controller-runtime upstream issue: https://github.com/kubernetes-sigs/controller-runtime/issues/3113 | ||||||
ctx, cancel := context.WithCancel(context.Background()) | ||||||
|
||||||
c := make(chan os.Signal, 2) | ||||||
signal.Notify(c, []os.Signal{os.Interrupt, syscall.SIGTERM}...) | ||||||
go func() { | ||||||
<-c | ||||||
setupLog.Info(fmt.Sprintf("Shutting Down, waiting for %vs", shutdownDelay)) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
go func() { | ||||||
time.Sleep(time.Duration(*shutdownDelay) * time.Second) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similarly we should add a test to validate this: # Get the name of the Gatekeeper pod
POD_NAME=$(kubectl get pods -n ${GATEKEEPER_NAMESPACE} -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
# Trigger the termination of the Gatekeeper pod
TERMINATION_START=$(date +%s)
kubectl delete pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE}
# Monitor the termination process to ensure the grace period is respected
echo "Monitoring the termination process..."
kubectl get pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} -w --ignore-not-found &
# Wait for the pod to be fully terminated
kubectl wait --for=delete pod/${POD_NAME} -n ${GATEKEEPER_NAMESPACE} --timeout=120s
TERMINATION_END=$(date +%s)
TERMINATION_DURATION=$((TERMINATION_END - TERMINATION_START))
# Validate that the pod is terminated after the specified grace period
GRACE_PERIOD=10
if [ "${TERMINATION_DURATION}" -ge "${GRACE_PERIOD}" ]; then
echo "Pod termination respected the grace period of ${GRACE_PERIOD} seconds."
else
echo "Pod termination did not respect the grace period. Termination duration: ${TERMINATION_DURATION} seconds."
fi There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. test added |
||||||
setupLog.Info("Shutdown grace period finished") | ||||||
cancel() | ||||||
}() | ||||||
<-c | ||||||
setupLog.Info("Second signal received, killing now") | ||||||
os.Exit(1) // second signal. Exit directly. | ||||||
}() | ||||||
|
||||||
go func() { | ||||||
setupErr <- setupControllers(ctx, mgr, tracker, setupFinished) | ||||||
}() | ||||||
|
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -693,3 +693,31 @@ __expansion_audit_test() { | |||||||
assert_failure | ||||||||
wait_for_process ${WAIT_TIME} ${SLEEP_TIME} "kubectl delete --ignore-not-found -f ${BATS_TESTS_DIR}/templates/k8srequiredlabels_template_regov1.yaml" | ||||||||
} | ||||||||
|
||||||||
@test "shutdown delay" { | ||||||||
# Get the name of the Gatekeeper pod | ||||||||
POD_NAME=$(kubectl get pods -n ${GATEKEEPER_NAMESPACE} -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}') | ||||||||
|
||||||||
# Trigger the termination of the Gatekeeper pod | ||||||||
TERMINATION_START=$(date +%s) | ||||||||
run kubectl delete pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} | ||||||||
|
||||||||
# Monitor the termination process to ensure the grace period is respected | ||||||||
echo "Monitoring the termination process..." | ||||||||
run kubectl get pod ${POD_NAME} -n ${GATEKEEPER_NAMESPACE} -w --ignore-not-found | ||||||||
|
||||||||
# Wait for the pod to be fully terminated | ||||||||
run kubectl wait --for=delete pod/${POD_NAME} -n ${GATEKEEPER_NAMESPACE} --timeout=120s | ||||||||
|
||||||||
TERMINATION_END=$(date +%s) | ||||||||
TERMINATION_DURATION=$((TERMINATION_END - TERMINATION_START)) | ||||||||
|
||||||||
# Validate that the pod is terminated after the specified grace period | ||||||||
GRACE_PERIOD=10 | ||||||||
if [ "${TERMINATION_DURATION}" -ge "${GRACE_PERIOD}" ]; then | ||||||||
echo "Pod termination respected the grace period of ${GRACE_PERIOD} seconds." | ||||||||
else | ||||||||
echo "Pod termination did not respect the grace period. Termination duration: ${TERMINATION_DURATION} seconds." | ||||||||
assert_failure | ||||||||
fi | ||||||||
} | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
Uh oh!
There was an error while loading. Please reload this page.