Skip to content

Commit 246b188

Browse files
Ideas for improving logs
1 parent ccc5be7 commit 246b188

12 files changed

Lines changed: 108 additions & 5 deletions

File tree

bootstrap/kubeadm/internal/controllers/kubeadmconfig_controller.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,13 @@ func (r *KubeadmConfigReconciler) refreshBootstrapTokenIfNeeded(ctx context.Cont
368368

369369
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
370370
if err != nil {
371+
// If connection is down, ignore the error (there is nothing we can do, the token might expire before connection is back up).
372+
// Note: when connection state will change, a reconcile will be triggered automatically.
373+
// FIXME: log (without making it noisy)
374+
// if errors.Is(err, clustercache.ErrClusterNotConnected) {
375+
// return ctrl.Result{}, nil
376+
// }
377+
371378
return ctrl.Result{}, err
372379
}
373380

@@ -440,6 +447,13 @@ func (r *KubeadmConfigReconciler) rotateMachinePoolBootstrapToken(ctx context.Co
440447
log.V(2).Info("Config is owned by a MachinePool, checking if token should be rotated")
441448
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
442449
if err != nil {
450+
// If connection is down, ignore the error (there is nothing we can do, the token might expire before connection is back up).
451+
// Note: when connection state will change, a reconcile will be triggered automatically.
452+
// FIXME: log (without making it noisy)
453+
// if errors.Is(err, clustercache.ErrClusterNotConnected) {
454+
// return ctrl.Result{}, nil
455+
// }
456+
443457
return ctrl.Result{}, err
444458
}
445459

@@ -684,7 +698,17 @@ func (r *KubeadmConfigReconciler) joinWorker(ctx context.Context, scope *Scope)
684698
} else if !res.IsZero() {
685699
return res, nil
686700
}
687-
701+
//
702+
// if res, err := r.reconcileDiscovery(ctx, scope.Cluster, scope.Config, certificates); err != nil || !res.IsZero() {
703+
// return res, err
704+
// }
705+
//
706+
// // If at this point there is not a discovery configuration, it does not make sense to continue.
707+
// if !(scope.Config.Spec.JoinConfiguration.Discovery.File.KubeConfig.IsDefined() || scope.Config.Spec.JoinConfiguration.Discovery.BootstrapToken.Token != "") {
708+
// // FIXME: log (without making it noisy)
709+
// return ctrl.Result{}, nil
710+
// }
711+
//
688712
kubernetesVersion := scope.ConfigOwner.KubernetesVersion()
689713
parsedVersion, err := semver.ParseTolerant(kubernetesVersion)
690714
if err != nil {
@@ -860,6 +884,16 @@ func (r *KubeadmConfigReconciler) joinControlplane(ctx context.Context, scope *S
860884
return res, nil
861885
}
862886

887+
// if res, err := r.reconcileDiscovery(ctx, scope.Cluster, scope.Config, certificates); err != nil || !res.IsZero() {
888+
// return res, err
889+
// }
890+
//
891+
// // If at this point there is not a discovery configuration, it does not make sense to continue.
892+
// if !(scope.Config.Spec.JoinConfiguration.Discovery.File.KubeConfig.IsDefined() || scope.Config.Spec.JoinConfiguration.Discovery.BootstrapToken.Token != "") {
893+
// // FIXME: log (without making it noisy)
894+
// return ctrl.Result{}, nil
895+
// }
896+
//
863897
kubernetesVersion := scope.ConfigOwner.KubernetesVersion()
864898
parsedVersion, err := semver.ParseTolerant(kubernetesVersion)
865899
if err != nil {
@@ -1246,6 +1280,13 @@ func (r *KubeadmConfigReconciler) reconcileDiscovery(ctx context.Context, cluste
12461280
if config.Spec.JoinConfiguration.Discovery.BootstrapToken.Token == "" {
12471281
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
12481282
if err != nil {
1283+
// If connection is down, ignore the error (the token will be created as soon as connection is up again).
1284+
// Note: when connection state will change, a reconcile will be triggered automatically.
1285+
// FIXME: log (without making it noisy)
1286+
if errors.Is(err, clustercache.ErrClusterNotConnected) {
1287+
return ctrl.Result{}, nil
1288+
}
1289+
12491290
return ctrl.Result{}, err
12501291
}
12511292

bootstrap/kubeadm/internal/locking/control_plane_init_mutex.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,9 @@ func (c *ControlPlaneInitMutex) Lock(ctx context.Context, cluster *clusterv1.Clu
8383
c.Unlock(ctx, cluster)
8484
}
8585
}
86-
log.Info(fmt.Sprintf("Waiting for Machine %s to initialize", info.MachineName))
86+
87+
// FIXME: exponential slow down
88+
log.Info(fmt.Sprintf("Waiting for Machine %s to initialize", klog.KRef(cluster.Namespace, info.MachineName)))
8789
return false
8890
}
8991

controllers/clustercache/cluster_cache.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,9 @@ func (cc *clusterCache) Reconcile(ctx context.Context, req reconcile.Request) (r
521521
if connected {
522522
healthCheckingState := accessor.GetHealthCheckingState(ctx)
523523

524+
// FIXME: "exponential slow down"
525+
// e.g. 10 tries at accessor.config.HealthProbe.Interval (30 sec each, 5m) --> (# of tries % 10) 2 * interval
526+
524527
// Requeue, if health probe was already run within the HealthProbe.Interval.
525528
if requeueAfter, requeue := shouldRequeue(time.Now(), healthCheckingState.LastProbeTime, accessor.config.HealthProbe.Interval); requeue {
526529
log.V(6).Info(fmt.Sprintf("Requeuing after %s as health probe was already run within the last %s",

controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
203203
return ctrl.Result{}, errors.Wrapf(err, "failed to retrieve owner Cluster")
204204
}
205205
if cluster == nil {
206+
// FIXME: wait some time after creation before starting logging + exponential slow down
206207
log.Info("Cluster Controller has not yet set OwnerRef")
207208
return ctrl.Result{}, nil
208209
}
@@ -235,6 +236,8 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
235236
defer func() {
236237
// Always attempt to update status.
237238
if err := r.updateStatus(ctx, controlPlane); err != nil {
239+
// FIXME: check this.
240+
//
238241
var connFailure *internal.RemoteClusterConnectionError
239242
if errors.As(err, &connFailure) {
240243
log.Info(fmt.Sprintf("Could not connect to workload cluster to fetch status: %s", err.Error()))
@@ -244,6 +247,7 @@ func (r *KubeadmControlPlaneReconciler) Reconcile(ctx context.Context, req ctrl.
244247
}
245248

246249
if err := r.updateV1Beta1Status(ctx, controlPlane); err != nil {
250+
// FIXME: check this.
247251
var connFailure *internal.RemoteClusterConnectionError
248252
if errors.As(err, &connFailure) {
249253
log.Info(fmt.Sprintf("Could not connect to workload cluster to fetch deprecated v1beta1 status: %s", err.Error()))
@@ -411,6 +415,7 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
411415
Message: "Waiting for Cluster status.infrastructureReady to be true",
412416
})
413417

418+
// FIXME: exponential slow down
414419
log.Info("Cluster infrastructure is not ready yet")
415420
return ctrl.Result{}, nil
416421
}
@@ -1030,6 +1035,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneAndMachinesConditio
10301035
EtcdMemberHealthyReason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberConnectionDownReason,
10311036
Message: "Remote connection not established yet",
10321037
})
1038+
// FIXME: What to do here? (is this an error?)
10331039
return errors.Errorf("connection to the workload cluster not established yet")
10341040
}
10351041

@@ -1045,6 +1051,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneAndMachinesConditio
10451051
EtcdMemberHealthyReason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberConnectionDownReason,
10461052
Message: lastProbeSuccessMessage(healthCheckingState.LastProbeSuccessTime),
10471053
})
1054+
// FIXME: What to do here?
10481055
return errors.Errorf("connection to the workload cluster is down")
10491056
}
10501057

controlplane/kubeadm/internal/controllers/scale.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,8 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, con
213213
controlPlane.PreflightCheckResults.TopologyVersionMismatch = true
214214
// Slow down reconcile frequency, as deferring a version upgrade waits for slow processes,
215215
// e.g. workers are completing a previous upgrade step.
216+
217+
// FIXME: "exponential slow down" (same below)
216218
r.controller.DeferNextReconcileForObject(controlPlane.KCP, time.Now().Add(5*time.Second))
217219
return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}
218220
}
@@ -256,6 +258,7 @@ func (r *KubeadmControlPlaneReconciler) preflightChecks(ctx context.Context, con
256258
}
257259

258260
if err != nil {
261+
// FIXME: exponential slow down
259262
r.recorder.Eventf(controlPlane.KCP, corev1.EventTypeWarning, "ControlPlaneUnhealthy",
260263
"Waiting for control plane to pass preflight checks to continue reconciliation: %v", err)
261264
log.Info("Waiting for control plane to pass preflight checks", "failures", err.Error())

internal/controllers/cluster/cluster_controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (retRes ct
212212
if cluster.Spec.Topology.IsDefined() {
213213
if !cluster.Spec.ControlPlaneRef.IsDefined() || !cluster.Spec.InfrastructureRef.IsDefined() {
214214
// TODO: add a condition to surface this scenario
215+
216+
// FIXME: Delay this message after x seconds from creation.
215217
log.Info("Waiting for the topology to be generated")
216218
return ctrl.Result{}, nil
217219
}

internal/controllers/machine/machine_controller.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
160160
predicates.ClusterControlPlaneInitialized(mgr.GetScheme(), *r.predicateLog),
161161
predicates.ResourceHasFilterLabel(mgr.GetScheme(), *r.predicateLog, r.WatchFilterValue),
162162
).
163+
// FIXME: Check WatchForProbeFailure
163164
WatchesRawSource(r.ClusterCache.GetClusterSource("machine", clusterToMachines, clustercache.WatchForProbeFailure(r.RemoteConditionsGracePeriod))).
164165
Watches(
165166
&clusterv1.MachineSet{},
@@ -769,7 +770,9 @@ func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1
769770
// even if the node cannot be retrieved.
770771
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
771772
if err != nil {
773+
// if !errors.Is(err, clustercache.ErrClusterNotConnected) {
772774
log.Error(err, "Failed to get cluster client while deleting Machine and checking for nodes")
775+
// }
773776
} else {
774777
node, err := r.getNode(ctx, remoteClient, providerID)
775778
if err != nil && err != ErrNodeNotFound {
@@ -844,8 +847,12 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro
844847
ctx = ctrl.LoggerInto(ctx, log)
845848

846849
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
850+
// FIXME: what we do if connection is down?
851+
// - Keep logging error + wait for drain timeout (if set)
852+
// - Slow down reconcile
853+
// - ??
847854
if err != nil {
848-
return ctrl.Result{}, errors.Wrapf(err, "failed to drain Node %s", nodeName)
855+
return ctrl.Result{}, errors.WithMessagef(err, "failed to drain Node %s", nodeName)
849856
}
850857

851858
node := &corev1.Node{}
@@ -947,6 +954,10 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, s *scope) (ct
947954
machine := s.machine
948955

949956
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
957+
// FIXME: what we do if connection is down?
958+
// - Keep logging error + wait for volume detach timeout (if set)
959+
// - Slow down reconcile
960+
// - ??
950961
if err != nil {
951962
return ctrl.Result{}, err
952963
}
@@ -1012,6 +1023,10 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, s *scope) (ct
10121023

10131024
func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error {
10141025
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
1026+
// FIXME: what we do if connection is down?
1027+
// - Keep logging error + wait for timeout (if set)
1028+
// - Slow down reconcile
1029+
// - ??
10151030
if err != nil {
10161031
return errors.Wrapf(err, "failed deleting Node because connection to the workload cluster is down")
10171032
}
@@ -1097,13 +1112,20 @@ func (r *Reconciler) watchClusterNodes(ctx context.Context, cluster *clusterv1.C
10971112
return nil
10981113
}
10991114

1100-
return r.ClusterCache.Watch(ctx, util.ObjectKey(cluster), clustercache.NewWatcher(clustercache.WatcherOptions{
1115+
err := r.ClusterCache.Watch(ctx, util.ObjectKey(cluster), clustercache.NewWatcher(clustercache.WatcherOptions{
11011116
Name: "machine-watchNodes",
11021117
Watcher: r.controller,
11031118
Kind: &corev1.Node{},
11041119
EventHandler: handler.EnqueueRequestsFromMapFunc(r.nodeToMachine),
11051120
Predicates: []predicate.TypedPredicate[client.Object]{predicates.TypedResourceIsChanged[client.Object](r.Client.Scheme(), *r.predicateLog)},
11061121
}))
1122+
1123+
// If connection is down, ignore the error (the watch will be created as soon as connection is up again).
1124+
// Note: when connection state will change, a reconcile will be triggered automatically.
1125+
//if errors.Is(err, clustercache.ErrClusterNotConnected) {
1126+
// return nil
1127+
//}
1128+
return err
11071129
}
11081130

11091131
func (r *Reconciler) nodeToMachine(ctx context.Context, o client.Object) []reconcile.Request {

internal/controllers/machine/machine_controller_noderef.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"k8s.io/apimachinery/pkg/runtime/schema"
2929
"k8s.io/apimachinery/pkg/util/sets"
3030
"k8s.io/klog/v2"
31+
"sigs.k8s.io/cluster-api/controllers/clustercache"
3132
ctrl "sigs.k8s.io/controller-runtime"
3233
"sigs.k8s.io/controller-runtime/pkg/client"
3334

@@ -75,6 +76,15 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
7576

7677
remoteClient, err := r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster))
7778
if err != nil {
79+
// FIXME: think a little bit more about this.
80+
// it is probably ok to do nothing when node ref is not set (it will pop up when connection is back)
81+
// when node ref is set, we should probably set node MachineNodeHealthy / MachineNodeReady to unknown after some time -> May be it is ok to ignore here, because it is handled when we compute tstatus
82+
// If connection is down, ignore the error (the node ref will be read as soon as connection is up again).
83+
// Note: when connection state will change, a reconcile will be triggered automatically.
84+
//if errors.Is(err, clustercache.ErrClusterNotConnected) {
85+
// return ctrl.Result{}, nil
86+
//}
87+
7888
s.nodeGetError = err
7989
return ctrl.Result{}, err
8090
}
@@ -94,6 +104,7 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
94104
return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace)
95105
}
96106
v1beta1conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyV1Beta1Condition, clusterv1.NodeProvisioningV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Waiting for a node with matching ProviderID to exist")
107+
// FIXME: exponential slow down
97108
log.Info("Infrastructure provider reporting spec.providerID, matching Kubernetes Node is not yet available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", machine.Spec.ProviderID)
98109
// No need to requeue here. Nodes emit an event that triggers reconciliation.
99110
return ctrl.Result{}, nil
@@ -110,6 +121,7 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
110121
machine.Status.NodeRef = clusterv1.MachineNodeReference{
111122
Name: s.node.Name,
112123
}
124+
// FIXME: exponential slow down
113125
log.Info("Infrastructure provider reporting spec.providerID, Kubernetes Node is now available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", machine.Spec.ProviderID, "Node", klog.KRef("", machine.Status.NodeRef.Name))
114126
r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
115127
}

internal/controllers/machine/machine_controller_phases.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ func (r *Reconciler) reconcileBootstrap(ctx context.Context, s *scope) (ctrl.Res
216216

217217
// If the data secret was not created yet, return.
218218
if !dataSecretCreated {
219+
// FIXME: exponential slow down
219220
// Only log if the Machine is a control plane Machine or the Cluster is already initialized to reduce noise.
220221
if util.IsControlPlaneMachine(m) || conditions.IsTrue(s.cluster, clusterv1.ClusterControlPlaneInitializedCondition) {
221222
log.Info(fmt.Sprintf("Waiting for bootstrap provider to generate data secret and set %s",
@@ -272,6 +273,7 @@ func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctr
272273
return ctrl.Result{}, errors.Errorf("could not find %s %s for Machine %s", m.Spec.InfrastructureRef.Kind, klog.KRef(m.Namespace, m.Spec.InfrastructureRef.Name), klog.KObj(m))
273274
}
274275
log.Info("Could not find InfrastructureMachine, requeuing", m.Spec.InfrastructureRef.Kind, klog.KRef(m.Namespace, m.Spec.InfrastructureRef.Name))
276+
// FIXME Think about dropping this requeue
275277
return ctrl.Result{RequeueAfter: externalReadyWait}, nil
276278
}
277279
return ctrl.Result{}, err
@@ -308,7 +310,9 @@ func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctr
308310

309311
// If the InfrastructureMachine is not provisioned (and it wasn't already provisioned before), return.
310312
if !provisioned && !ptr.Deref(m.Status.Initialization.InfrastructureProvisioned, false) {
311-
// Only log if the Machine is a control plane Machine or the Cluster is already initialized to reduce noise.
313+
// FIXME Exponential slow down
314+
// Only log if the bootstrap secret has been already created, the Machine is a control plane Machine or the Cluster is already initialized to reduce noise.
315+
// if ptr.Deref(m.Status.Initialization.BootstrapDataSecretCreated, false) && (util.IsControlPlaneMachine(m) || conditions.IsTrue(s.cluster, clusterv1.ClusterControlPlaneInitializedCondition)) {
312316
if util.IsControlPlaneMachine(m) || conditions.IsTrue(s.cluster, clusterv1.ClusterControlPlaneInitializedCondition) {
313317
log.Info(fmt.Sprintf("Waiting for infrastructure provider to set %s on %s",
314318
contract.InfrastructureMachine().Provisioned(contractVersion).Path().String(), s.infraMachine.Kind),
@@ -320,6 +324,7 @@ func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctr
320324
// Get providerID from the InfrastructureMachine (intentionally not setting it on the Machine yet).
321325
providerID := s.infraMachine.Spec.ProviderID
322326
if providerID == "" {
327+
// FIXME Exponential slow down
323328
log.Info(fmt.Sprintf("Waiting for infrastructure provider to set %s on %s",
324329
contract.InfrastructureMachine().ProviderID().Path().String(), s.infraMachine.Kind),
325330
s.infraMachine.Kind, klog.KObj(s.infraMachine))

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,7 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr
400400
}
401401

402402
if nextCheck > 0 {
403+
// FIXME: Log once.
403404
logger.V(2).Info("Target is likely to go unhealthy", "timeUntilUnhealthy", nextCheck.Truncate(time.Second).String())
404405
nextCheckTimes = append(nextCheckTimes, nextCheck)
405406
continue

0 commit comments

Comments
 (0)