Skip to content

Commit 1b53dbe

Browse files
authored
fix: pod restart if in-memory data-size changes [KO-424] (#379)
* fix: pod restart if in-memory data-size changes [KO-424]
1 parent 7df4a40 commit 1b53dbe

File tree

4 files changed

+137
-72
lines changed

4 files changed

+137
-72
lines changed

internal/controller/cluster/pod.go

Lines changed: 66 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,34 @@ func (r *SingleClusterReconciler) getRollingRestartTypeMap(rackState *RackState,
117117
return nil, nil, err
118118
}
119119

120-
// If EnableDynamicConfigUpdate is set and dynamic config command exec partially failed in previous try
121-
// then skip dynamic config update and fall back to rolling restart.
122-
// Continue with dynamic config update in case of Failed DynamicConfigUpdateStatus
123-
if asdbv1.GetBool(r.aeroCluster.Spec.EnableDynamicConfigUpdate) &&
124-
podStatus.DynamicConfigUpdateStatus != asdbv1.PartiallyFailed {
125-
// Fetching all dynamic config change.
126-
dynamicConfDiffPerPod[pods[idx].Name], err = r.handleDynamicConfigChange(rackState, pods[idx], version)
127-
if err != nil {
128-
return nil, nil, err
120+
specToStatusDiffs, err := getConfDiff(r.Log, rackState.Rack.AerospikeConfig.Value, pods[idx].Annotations, version)
121+
if err != nil {
122+
return nil, nil, err
123+
}
124+
125+
if len(specToStatusDiffs) != 0 {
126+
for key := range specToStatusDiffs {
127+
// To update in-memory namespace data-size, we need to restart the pod.
128+
// Just a warm restart is not enough.
129+
// https://support.aerospike.com/s/article/How-to-change-data-size-config-in-a-running-cluster
130+
if strings.HasSuffix(key, ".storage-engine.data-size") {
131+
restartTypeMap[pods[idx].Name] = mergeRestartType(restartTypeMap[pods[idx].Name], podRestart)
132+
133+
break
134+
}
135+
}
136+
137+
if restartTypeMap[pods[idx].Name] == podRestart {
138+
continue
139+
}
140+
141+
// If EnableDynamicConfigUpdate is set and dynamic config command exec partially failed in previous try
142+
// then skip dynamic config update and fall back to rolling restart.
143+
// Continue with dynamic config update in case of Failed DynamicConfigUpdateStatus
144+
if asdbv1.GetBool(r.aeroCluster.Spec.EnableDynamicConfigUpdate) &&
145+
podStatus.DynamicConfigUpdateStatus != asdbv1.PartiallyFailed &&
146+
isAllDynamicConfig(r.Log, specToStatusDiffs, version) {
147+
dynamicConfDiffPerPod[pods[idx].Name] = specToStatusDiffs
129148
}
130149
}
131150
}
@@ -1358,16 +1377,46 @@ func (r *SingleClusterReconciler) getConfigMap(rackID int) (*corev1.ConfigMap, e
13581377
return confMap, nil
13591378
}
13601379

1361-
func (r *SingleClusterReconciler) handleDynamicConfigChange(rackState *RackState, pod *corev1.Pod, version string) (
1362-
asconfig.DynamicConfigMap, error) {
1363-
statusFromAnnotation := pod.Annotations["aerospikeConf"]
1380+
// isAllDynamicConfig checks if all the configuration changes can be applied dynamically
1381+
func isAllDynamicConfig(log logger, specToStatusDiffs asconfig.DynamicConfigMap, version string) bool {
1382+
isDynamic, err := asconfig.IsAllDynamicConfig(log, specToStatusDiffs, version)
1383+
if err != nil {
1384+
log.Info("Failed to check if all config is dynamic, fallback to rolling restart", "error", err.Error())
1385+
return false
1386+
}
1387+
1388+
if !isDynamic {
1389+
log.Info("Config contains static field changes; dynamic update not possible.")
1390+
return false
1391+
}
1392+
1393+
return true
1394+
}
13641395

1365-
asConfStatus, err := getFlatConfig(r.Log, statusFromAnnotation)
1396+
func getFlatConfig(log logger, confStr string) (*asconfig.Conf, error) {
1397+
asConf, err := asconfig.NewASConfigFromBytes(log, []byte(confStr), asconfig.AeroConfig)
13661398
if err != nil {
13671399
return nil, fmt.Errorf("failed to load config map by lib: %v", err)
13681400
}
13691401

1370-
asConf, err := asconfig.NewMapAsConfig(r.Log, rackState.Rack.AerospikeConfig.Value)
1402+
return asConf.GetFlatMap(), nil
1403+
}
1404+
1405+
// getConfDiff retrieves the configuration differences between the spec and status Aerospike configurations.
1406+
func getConfDiff(log logger, specConfig map[string]interface{}, podAnnotations map[string]string,
1407+
version string) (asconfig.DynamicConfigMap, error) {
1408+
statusFromAnnotation, ok := podAnnotations["aerospikeConf"]
1409+
if !ok {
1410+
log.Info("Pod annotation 'aerospikeConf' missing")
1411+
return nil, nil
1412+
}
1413+
1414+
asConfStatus, err := getFlatConfig(log, statusFromAnnotation)
1415+
if err != nil {
1416+
return nil, fmt.Errorf("failed to load config map by lib: %v", err)
1417+
}
1418+
1419+
asConf, err := asconfig.NewMapAsConfig(log, specConfig)
13711420
if err != nil {
13721421
return nil, fmt.Errorf("failed to load config map by lib: %v", err)
13731422
}
@@ -1377,44 +1426,22 @@ func (r *SingleClusterReconciler) handleDynamicConfigChange(rackState *RackState
13771426
specConfFile = strings.ReplaceAll(specConfFile, "$${_DNE}{un}", "${un}")
13781427
specConfFile = strings.ReplaceAll(specConfFile, "$${_DNE}{dn}", "${dn}")
13791428

1380-
asConfSpec, err := getFlatConfig(r.Log, specConfFile)
1429+
asConfSpec, err := getFlatConfig(log, specConfFile)
13811430
if err != nil {
13821431
return nil, fmt.Errorf("failed to load config map by lib: %v", err)
13831432
}
13841433

1385-
specToStatusDiffs, err := asconfig.ConfDiff(r.Log, *asConfSpec, *asConfStatus,
1434+
specToStatusDiffs, err := asconfig.ConfDiff(log, *asConfSpec, *asConfStatus,
13861435
true, version)
13871436
if err != nil {
1388-
r.Log.Info("Failed to get config diff to change config dynamically, fallback to rolling restart",
1437+
log.Info("Failed to get config diff to change config dynamically, fallback to rolling restart",
13891438
"error", err.Error())
13901439
return nil, nil
13911440
}
13921441

1393-
if len(specToStatusDiffs) > 0 {
1394-
isDynamic, err := asconfig.IsAllDynamicConfig(r.Log, specToStatusDiffs, version)
1395-
if err != nil {
1396-
r.Log.Info("Failed to check if all config is dynamic, fallback to rolling restart", "error", err.Error())
1397-
return nil, nil
1398-
}
1399-
1400-
if !isDynamic {
1401-
r.Log.Info("Static field has been modified, cannot change config dynamically")
1402-
return nil, nil
1403-
}
1404-
}
1405-
14061442
return specToStatusDiffs, nil
14071443
}
14081444

1409-
func getFlatConfig(log logger, confStr string) (*asconfig.Conf, error) {
1410-
asConf, err := asconfig.NewASConfigFromBytes(log, []byte(confStr), asconfig.AeroConfig)
1411-
if err != nil {
1412-
return nil, fmt.Errorf("failed to load config map by lib: %v", err)
1413-
}
1414-
1415-
return asConf.GetFlatMap(), nil
1416-
}
1417-
14181445
func (r *SingleClusterReconciler) patchPodStatus(ctx context.Context, patches []jsonpatch.PatchOperation) error {
14191446
if len(patches) == 0 {
14201447
return nil

test/cleanup-test-namespace.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ for namespace in $namespaces; do
2323
echo "Removing Aerospike backup from namespace: $namespace"
2424
kubectl -n "$namespace" delete aerospikebackup --all
2525

26-
echo "Removing Aerospike backup service from namespace: $namespace"
27-
kubectl -n "$namespace" delete aerospikebackupservice --all
26+
echo "Removing Aerospike backup service from namespace: $namespace"
27+
kubectl -n "$namespace" delete aerospikebackupservice --all
2828

2929
# Force delete pods
3030
kubectl -n "$namespace" delete pod --selector 'app=aerospike-cluster' --grace-period=0 --force --ignore-not-found

test/cluster/dynamic_config_test.go

Lines changed: 67 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@ type podID struct {
3232
asdPID string
3333
}
3434

35-
const clName = "dynamic-config-test"
35+
const (
36+
clName = "dynamic-config-test"
37+
noRestart = "noRestart"
38+
)
3639

3740
var (
3841
configWithMaxDefaultVal = mapset.NewSet("info-max-ms", "flush-max-ms")
@@ -72,6 +75,8 @@ var _ = Describe(
7275
aeroCluster := createDummyAerospikeCluster(
7376
clusterNamespacedName, 2,
7477
)
78+
aeroCluster.Spec.AerospikeConfig.Value["namespaces"] = append(
79+
aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{}), getNonSCInMemoryNamespaceConfig("mem"))
7580
aeroCluster.Spec.AerospikeConfig.Value["xdr"] = map[string]interface{}{
7681
"dcs": []map[string]interface{}{
7782
{
@@ -169,7 +174,7 @@ var _ = Describe(
169174

170175
By("Verify no warm/cold restarts in Pods")
171176

172-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
177+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
173178

174179
By("Modify dynamic config by removing fields")
175180

@@ -215,14 +220,14 @@ var _ = Describe(
215220

216221
By("Verify no warm/cold restarts in Pods")
217222

218-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
223+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
219224
},
220225
)
221226

222227
It(
223228
"Should update config statically", func() {
224229

225-
By("Modify static config")
230+
By("Modify static config to do warm restart")
226231

227232
aeroCluster, err := getCluster(
228233
k8sClient, ctx, clusterNamespacedName,
@@ -251,8 +256,41 @@ var _ = Describe(
251256
Expect(enableQuotas).To(BeTrue())
252257

253258
By("Verify warm restarts in Pods")
259+
validateServerRestart(ctx, aeroCluster, podPIDMap, asdbv1.OperationWarmRestart)
260+
261+
By("Modify static config to do pod restart")
262+
263+
aeroCluster, err = getCluster(
264+
k8sClient, ctx, clusterNamespacedName,
265+
)
266+
Expect(err).ToNot(HaveOccurred())
267+
268+
podPIDMap, err = getPodIDs(ctx, aeroCluster)
269+
Expect(err).ToNot(HaveOccurred())
270+
271+
nsList := aeroCluster.Spec.AerospikeConfig.Value["namespaces"].([]interface{})
272+
nsList[1].(map[string]interface{})["storage-engine"].(map[string]interface{})["data-size"] = 2073741824
273+
274+
err = updateCluster(k8sClient, ctx, aeroCluster)
275+
Expect(err).ToNot(HaveOccurred())
276+
277+
By("Fetch and verify static configs")
254278

255-
validateServerRestart(ctx, aeroCluster, podPIDMap, true)
279+
pod = aeroCluster.Status.Pods[aeroCluster.Name+"-0-0"]
280+
281+
info, err := requestInfoFromNode(logger, k8sClient, ctx, clusterNamespacedName, "namespace/mem", &pod)
282+
Expect(err).ToNot(HaveOccurred())
283+
284+
confs := strings.Split(info["namespace/mem"], ";")
285+
for _, conf := range confs {
286+
if strings.Contains(conf, "data-size") {
287+
keyValue := strings.Split(conf, "=")
288+
Expect(keyValue[1]).To(Equal("2073741824"))
289+
}
290+
}
291+
292+
By("Verify pod restarts in Pods")
293+
validateServerRestart(ctx, aeroCluster, podPIDMap, asdbv1.OperationPodRestart)
256294
},
257295
)
258296

@@ -290,7 +328,7 @@ var _ = Describe(
290328
Expect(cv).To(Equal(int64(19000)))
291329

292330
By("Verify warm restarts in Pods")
293-
validateServerRestart(ctx, aeroCluster, podPIDMap, true)
331+
validateServerRestart(ctx, aeroCluster, podPIDMap, asdbv1.OperationWarmRestart)
294332
},
295333
)
296334
},
@@ -357,8 +395,8 @@ var _ = Describe(
357395

358396
// As there was no full config update failure,
359397
// expectation is that pods will not be restarted.
360-
By("Verify cold restarts in Pods")
361-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
398+
By("Verify no restarts in Pods")
399+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
362400
},
363401
)
364402

@@ -409,8 +447,14 @@ var _ = Describe(
409447
Expect(err).ToNot(HaveOccurred())
410448

411449
// As pods were failed, expectation is that pods will be cold restarted.
412-
By("Verify cold restarts in Pods")
413-
validateServerRestart(ctx, aeroCluster, podPIDMap, true)
450+
By("Verify partial cold restart in Pods")
451+
operationTypeMap := map[string]asdbv1.OperationKind{
452+
aeroCluster.Name + "-0-0": noRestart,
453+
aeroCluster.Name + "-0-1": asdbv1.OperationPodRestart,
454+
}
455+
456+
err = validateOperationTypes(ctx, aeroCluster, podPIDMap, operationTypeMap)
457+
Expect(err).ToNot(HaveOccurred())
414458

415459
},
416460
)
@@ -524,7 +568,7 @@ var _ = Describe(
524568
Expect(err).ToNot(HaveOccurred())
525569

526570
By("Verify no warm/cold restarts in Pods")
527-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
571+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
528572

529573
By("Verify Network Context configs dynamically")
530574
err = validateNetworkContextDynamically(ctx, flatServer, flatSpec, aeroCluster, dynamic)
@@ -536,7 +580,7 @@ var _ = Describe(
536580
Expect(err).ToNot(HaveOccurred())
537581

538582
By("Verify no warm/cold restarts in Pods")
539-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
583+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
540584

541585
By("Verify Namespace Context configs dynamically")
542586
err = validateNamespaceContextDynamically(ctx, flatServer, flatSpec, aeroCluster, dynamic)
@@ -548,7 +592,7 @@ var _ = Describe(
548592
Expect(err).ToNot(HaveOccurred())
549593

550594
By("Verify no warm/cold restarts in Pods")
551-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
595+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
552596

553597
By("Verify Security Context configs dynamically")
554598
err = validateSecurityContextDynamically(ctx, flatServer, flatSpec, aeroCluster, dynamic)
@@ -560,7 +604,7 @@ var _ = Describe(
560604
Expect(err).ToNot(HaveOccurred())
561605

562606
By("Verify no warm/cold restarts in Pods")
563-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
607+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
564608

565609
By("Verify XDR Context configs dynamically")
566610
err = validateXDRContextDynamically(clusterNamespacedName,
@@ -573,7 +617,7 @@ var _ = Describe(
573617
Expect(err).ToNot(HaveOccurred())
574618

575619
By("Verify no warm/cold restarts in Pods")
576-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
620+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
577621
},
578622
)
579623
},
@@ -671,30 +715,24 @@ var _ = Describe(
671715
}
672716

673717
By("Verify no warm/cold restarts in Pods")
674-
675-
validateServerRestart(ctx, aeroCluster, podPIDMap, false)
718+
validateServerRestart(ctx, aeroCluster, podPIDMap, noRestart)
676719
},
677720
)
678721
},
679722
)
680723
},
681724
)
682725

683-
func validateServerRestart(ctx goctx.Context, cluster *asdbv1.AerospikeCluster, pidMap map[string]podID,
684-
shouldRestart bool) {
685-
restarted := false
726+
func validateServerRestart(ctx goctx.Context, aeroCluster *asdbv1.AerospikeCluster, pidMap map[string]podID,
727+
restartType asdbv1.OperationKind) {
728+
operationTypeMap := make(map[string]asdbv1.OperationKind)
686729

687-
newPodPidMap, err := getPodIDs(ctx, cluster)
688-
Expect(err).ToNot(HaveOccurred())
689-
690-
for podName, pid := range pidMap {
691-
if newPodPidMap[podName].podUID != pid.podUID || newPodPidMap[podName].asdPID != pid.asdPID {
692-
restarted = true
693-
break
694-
}
730+
for podName := range pidMap {
731+
operationTypeMap[podName] = restartType
695732
}
696733

697-
Expect(restarted).To(Equal(shouldRestart))
734+
err := validateOperationTypes(ctx, aeroCluster, pidMap, operationTypeMap)
735+
Expect(err).ToNot(HaveOccurred())
698736
}
699737

700738
func getPodIDs(ctx context.Context, aeroCluster *asdbv1.AerospikeCluster) (map[string]podID, error) {

test/cluster/on_demand_operations_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ var _ = Describe(
202202

203203
operationTypeMap := map[string]asdbv1.OperationKind{
204204
aeroCluster.Name + "-1-0": asdbv1.OperationPodRestart,
205-
aeroCluster.Name + "-1-1": "noRestart",
205+
aeroCluster.Name + "-1-1": noRestart,
206206
}
207207

208208
err = validateOperationTypes(ctx, aeroCluster, oldPodIDs, operationTypeMap)
@@ -465,7 +465,7 @@ func validateOperationTypes(ctx goctx.Context, aeroCluster *asdbv1.AerospikeClus
465465
if newPodPidMap[podName].podUID == pid[podName].podUID {
466466
return fmt.Errorf("failed to restart pod %s", podName)
467467
}
468-
case "noRestart":
468+
case noRestart:
469469
if newPodPidMap[podName].podUID != pid[podName].podUID || newPodPidMap[podName].asdPID != pid[podName].asdPID {
470470
return fmt.Errorf("unexpected restart pod %s", podName)
471471
}

0 commit comments

Comments
 (0)