17 changes: 17 additions & 0 deletions charts/postgres-operator/crds/postgresqls.yaml
@@ -584,6 +584,23 @@ spec:
- PreferNoSchedule
tolerationSeconds:
type: integer
topologySpreadConstraints:
type: array
nullable: true
items:
type: object
properties:
maxSkew:
type: integer
format: int32
minimum: 1
topologyKey:
type: string
whenUnsatisfiable:
type: string
enum:
- DoNotSchedule
- ScheduleAnyway
useLoadBalancer:
type: boolean
description: deprecated
21 changes: 19 additions & 2 deletions docs/user.md
@@ -714,7 +714,7 @@ but Kubernetes will not spin up the pod if the requested HugePages cannot be all
For more information on HugePages in Kubernetes, see also
[https://kubernetes.io/docs/tasks/manage-hugepages/scheduling-hugepages/](https://kubernetes.io/docs/tasks/manage-hugepages/scheduling-hugepages/)

## Use taints, tolerations and node affinity for dedicated PostgreSQL nodes
## Use taints, tolerations, node affinity and topology spread constraints for dedicated PostgreSQL nodes

To ensure Postgres pods are running on nodes without any other application pods,
you can use [taints and tolerations](https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/)
@@ -755,6 +755,23 @@ spec:
If you need to define a `nodeAffinity` for all your Postgres clusters use the
`node_readiness_label` [configuration](administrator.md#node-readiness-labels).

If you need Postgres pods to be spread across separate nodes, you can use
[topologySpreadConstraints](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/)
to control how they are distributed across your cluster. This ensures they are
spread across failure domains such as regions, zones, nodes, or other
user-defined topology domains.

```yaml
apiVersion: "acid.zalan.do/v1"
kind: postgresql
metadata:
name: acid-minimal-cluster
spec:
topologySpreadConstraints:
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: DoNotSchedule
```
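
Note that the constraint above does not set a `labelSelector`. In the Kubernetes
pod spec, the selector determines which pods are counted when computing the skew,
so it must match the cluster's pods for the constraint to have any effect. Whether
the operator injects a selector automatically is not shown here, so the following
is only a sketch of how the rendered pod template might look, assuming the Spilo
cluster labels are used as the selector:

```yaml
# Hypothetical rendered pod template snippet; the labelSelector values are an
# assumption based on the labels the operator sets on Spilo pods.
topologySpreadConstraints:
  - maxSkew: 1
    topologyKey: topology.kubernetes.io/zone
    whenUnsatisfiable: DoNotSchedule
    labelSelector:
      matchLabels:
        application: spilo
        cluster-name: acid-minimal-cluster
```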

## In-place major version upgrade

Starting with Spilo 13, operator supports in-place major version upgrade to a
@@ -1053,7 +1070,7 @@ spec:
- all
volumeSource:
emptyDir: {}
sidecars:
sidecars:
- name: "container-name"
image: "company/image:tag"
volumeMounts:
95 changes: 83 additions & 12 deletions e2e/tests/test_e2e.py
@@ -559,7 +559,7 @@ def compare_config():

pg_patch_config["spec"]["patroni"]["slots"][slot_to_change]["database"] = "bar"
del pg_patch_config["spec"]["patroni"]["slots"][slot_to_remove]

k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_delete_slot_patch)

@@ -576,7 +576,7 @@ def compare_config():

self.eventuallyEqual(lambda: self.query_database(leader.metadata.name, "postgres", get_slot_query%("database", slot_to_change))[0], "bar",
"The replication slot cannot be updated", 10, 5)

# make sure slot from Patroni didn't get deleted
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "postgres", get_slot_query%("slot_name", patroni_slot))), 1,
"The replication slot from Patroni gets deleted", 10, 5)
@@ -932,7 +932,7 @@ def test_ignored_annotations(self):
},
}
}

old_sts_creation_timestamp = sts.metadata.creation_timestamp
k8s.api.apps_v1.patch_namespaced_stateful_set(sts.metadata.name, sts.metadata.namespace, annotation_patch)
old_svc_creation_timestamp = svc.metadata.creation_timestamp
@@ -1369,7 +1369,7 @@ def test_persistent_volume_claim_retention_policy(self):
}
k8s.update_config(patch_scaled_policy_retain)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

# decrease the number of instances
k8s.api.custom_objects_api.patch_namespaced_custom_object(
'acid.zalan.do', 'v1', 'default', 'postgresqls', 'acid-minimal-cluster', pg_patch_scale_down_instances)
@@ -1646,7 +1646,6 @@ def test_node_readiness_label(self):
# toggle pod anti affinity to move replica away from master node
self.assert_distributed_pods(master_nodes)


@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_overwrite_pooler_deployment(self):
pooler_name = 'acid-minimal-cluster-pooler'
@@ -1799,7 +1798,7 @@ def test_password_rotation(self):
},
}
k8s.api.core_v1.patch_namespaced_secret(
name="foo-user.acid-minimal-cluster.credentials.postgresql.acid.zalan.do",
name="foo-user.acid-minimal-cluster.credentials.postgresql.acid.zalan.do",
namespace="default",
body=secret_fake_rotation)

@@ -1816,7 +1815,7 @@ def test_password_rotation(self):
"enable_password_rotation": "true",
"inherited_annotations": "environment",
"password_rotation_interval": "30",
"password_rotation_user_retention": "30", # should be set to 60
"password_rotation_user_retention": "30", # should be set to 60
},
}
k8s.update_config(enable_password_rotation)
@@ -1885,7 +1884,7 @@ def test_password_rotation(self):
self.assertTrue("environment" in db_user_secret.metadata.annotations, "Added annotation was not propagated to secret")

# disable password rotation for all other users (foo_user)
# and pick smaller intervals to see if the third fake rotation user is dropped
# and pick smaller intervals to see if the third fake rotation user is dropped
enable_password_rotation = {
"data": {
"enable_password_rotation": "false",
@@ -2385,6 +2384,78 @@ def test_taint_based_eviction(self):
# toggle pod anti affinity to move replica away from master node
self.assert_distributed_pods(master_nodes)

@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_topology_spread_constraints(self):
'''
Enable topologySpreadConstraints for pods
'''
k8s = self.k8s
cluster_labels = "application=spilo,cluster-name=acid-minimal-cluster"

# Verify we are in good state from potential previous tests
self.eventuallyEqual(lambda: k8s.count_running_pods(), 2, "No 2 pods running")

master_nodes, replica_nodes = k8s.get_cluster_nodes()
self.assertNotEqual(master_nodes, [])
self.assertNotEqual(replica_nodes, [])

# Patch label to nodes for topologySpreadConstraints
patch_node_label = {
"metadata": {
"labels": {
"topology.kubernetes.io/zone": "zalando"
}
}
}
k8s.api.core_v1.patch_node(master_nodes[0], patch_node_label)
k8s.api.core_v1.patch_node(replica_nodes[0], patch_node_label)
Member:
I would expect that the e2e test patches the Postgresql manifest and adds topologySpreadConstraints to then check if the pods spread evenly. But you're only patching the nodes here?

Contributor Author:
I updated the e2e test.


# Patch the postgresqls manifest: add a topologySpreadConstraints entry and scale out the pods.
patch_topologySpreadConstraint_config = {
"spec": {
"numberOfInstances": 6,
"topologySpreadConstraint": [
{
"maxskew": 1,
"topologyKey": "topology.kubernetes.io/zone",
"whenUnsatisfiable": "DoNotSchedule"
}
]
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default",
"postgresqls", "acid-minimal-cluster",
patch_topologySpreadConstraint_config)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
self.eventuallyEqual(lambda: k8s.count_pods_with_label(cluster_labels), 6, "Postgresql StatefulSet is scaled to 6")
self.eventuallyEqual(lambda: k8s.count_running_pods(), 6, "All pods are running")

worker_node_1 = 0
worker_node_2 = 0
pods = k8s.api.core_v1.list_namespaced_pod('default', label_selector=cluster_labels)
for pod in pods.items:
if pod.spec.node_name == 'postgres-operator-e2e-tests-worker':
worker_node_1 += 1
elif pod.spec.node_name == 'postgres-operator-e2e-tests-worker2':
worker_node_2 += 1

self.assertEqual(worker_node_1, worker_node_2)
self.assertEqual(worker_node_1, 3)
self.assertEqual(worker_node_2, 3)

# Reset configurations
patch_topologySpreadConstraint_config = {
"spec": {
"numberOfInstances": 2,
"topologySpreadConstraint": []
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default",
"postgresqls", "acid-minimal-cluster",
patch_topologySpreadConstraint_config)

@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_zz_cluster_deletion(self):
'''
@@ -2460,7 +2531,7 @@ def test_zz_cluster_deletion(self):
self.eventuallyEqual(lambda: k8s.count_deployments_with_label(cluster_label), 0, "Deployments not deleted")
self.eventuallyEqual(lambda: k8s.count_pdbs_with_label(cluster_label), 0, "Pod disruption budget not deleted")
self.eventuallyEqual(lambda: k8s.count_secrets_with_label(cluster_label), 8, "Secrets were deleted although disabled in config")
self.eventuallyEqual(lambda: k8s.count_pvcs_with_label(cluster_label), 3, "PVCs were deleted although disabled in config")
self.eventuallyEqual(lambda: k8s.count_pvcs_with_label(cluster_label), 6, "PVCs were deleted although disabled in config")

except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
@@ -2502,7 +2573,7 @@ def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=aci

# if nodes are different we can quit here
if master_nodes[0] not in replica_nodes:
return True
return True

# enable pod anti affinity in config map which should trigger movement of replica
patch_enable_antiaffinity = {
@@ -2526,7 +2597,7 @@ def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=aci
}
k8s.update_config(patch_disable_antiaffinity, "disable antiaffinity")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

k8s.wait_for_pod_start('spilo-role=replica,' + cluster_labels)
k8s.wait_for_running_pods(cluster_labels, 2)

@@ -2537,7 +2608,7 @@ def assert_distributed_pods(self, target_nodes, cluster_labels='cluster-name=aci
# if nodes are different we can quit here
for target_node in target_nodes:
if (target_node not in master_nodes or target_node not in replica_nodes) and master_nodes[0] in replica_nodes:
print('Pods run on the same node')
print('Pods run on the same node')
return False

except timeout_decorator.TimeoutError:
6 changes: 6 additions & 0 deletions manifests/complete-postgres-manifest.yaml
@@ -232,6 +232,12 @@ spec:
# values:
# - enabled

# Add a topology spread constraint to distribute PostgreSQL pods evenly across the zones defined by the "topology.kubernetes.io/zone" node label.
# topologySpreadConstraints:
# - maxSkew: 1
# topologyKey: topology.kubernetes.io/zone
# whenUnsatisfiable: DoNotSchedule

# Enables change data capture streams for defined database tables
# streams:
# - applicationId: test-app
16 changes: 16 additions & 0 deletions manifests/postgresql.crd.yaml
@@ -582,6 +582,22 @@ spec:
- PreferNoSchedule
tolerationSeconds:
type: integer
topologySpreadConstraints:
type: array
nullable: true
items:
type: object
properties:
maxSkew:
type: integer
format: int32
topologyKey:
type: string
whenUnsatisfiable:
type: string
enum:
- DoNotSchedule
- ScheduleAnyway
useLoadBalancer:
type: boolean
description: deprecated
29 changes: 29 additions & 0 deletions pkg/apis/acid.zalan.do/v1/crds.go
@@ -111,6 +111,7 @@ var OperatorConfigCRDResourceColumns = []apiextv1.CustomResourceColumnDefinition

var min0 = 0.0
var min1 = 1.0
var minLength1 int64 = 1
var minDisable = -1.0

// PostgresCRDResourceValidation to check applied manifest parameters
@@ -895,6 +896,34 @@ var PostgresCRDResourceValidation = apiextv1.CustomResourceValidation{
},
},
},
"topologySpreadConstraints": {
Type: "array",
Nullable: true,
Items: &apiextv1.JSONSchemaPropsOrArray{
Schema: &apiextv1.JSONSchemaProps{
Type: "object",
Properties: map[string]apiextv1.JSONSchemaProps{
"maxSkew": {
Type: "integer",
Format: "int32",
Minimum: &min1,
},
"topologyKey": {
Type: "string",
MinLength: &minLength1,
},
"whenUnsatisfiable": {
Type: "string",
Enum: []apiextv1.JSON{
{Raw: []byte(`"DoNotSchedule"`)},
{Raw: []byte(`"ScheduleAnyway"`)},
},
},
},
Required: []string{"maxSkew", "topologyKey", "whenUnsatisfiable"},
},
},
},
Member:
Same here with the XPreserveUnknownFields. Yes, there are other fields where we are doing it like this, but let's get it right for new additions. I know it's tedious to reflect the full schema because we don't use a framework like kubebuilder, but that should be the trade-off for contributors when they go the "easy way" of allowing full specs in our manifest over custom stripped-down designs better suited to end users.

"useLoadBalancer": {
Type: "boolean",
Description: "deprecated",
37 changes: 19 additions & 18 deletions pkg/apis/acid.zalan.do/v1/postgresql_type.go
@@ -63,24 +63,25 @@ type PostgresSpec struct {
UsersWithSecretRotation []string `json:"usersWithSecretRotation,omitempty"`
UsersWithInPlaceSecretRotation []string `json:"usersWithInPlaceSecretRotation,omitempty"`

NumberOfInstances int32 `json:"numberOfInstances"`
MaintenanceWindows []MaintenanceWindow `json:"maintenanceWindows,omitempty"`
Clone *CloneDescription `json:"clone,omitempty"`
Databases map[string]string `json:"databases,omitempty"`
PreparedDatabases map[string]PreparedDatabase `json:"preparedDatabases,omitempty"`
SchedulerName *string `json:"schedulerName,omitempty"`
NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"`
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
Sidecars []Sidecar `json:"sidecars,omitempty"`
InitContainers []v1.Container `json:"initContainers,omitempty"`
PodPriorityClassName string `json:"podPriorityClassName,omitempty"`
ShmVolume *bool `json:"enableShmVolume,omitempty"`
EnableLogicalBackup bool `json:"enableLogicalBackup,omitempty"`
LogicalBackupRetention string `json:"logicalBackupRetention,omitempty"`
LogicalBackupSchedule string `json:"logicalBackupSchedule,omitempty"`
StandbyCluster *StandbyDescription `json:"standby,omitempty"`
PodAnnotations map[string]string `json:"podAnnotations,omitempty"`
ServiceAnnotations map[string]string `json:"serviceAnnotations,omitempty"`
NumberOfInstances int32 `json:"numberOfInstances"`
MaintenanceWindows []MaintenanceWindow `json:"maintenanceWindows,omitempty"`
Clone *CloneDescription `json:"clone,omitempty"`
Databases map[string]string `json:"databases,omitempty"`
PreparedDatabases map[string]PreparedDatabase `json:"preparedDatabases,omitempty"`
SchedulerName *string `json:"schedulerName,omitempty"`
NodeAffinity *v1.NodeAffinity `json:"nodeAffinity,omitempty"`
TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
Sidecars []Sidecar `json:"sidecars,omitempty"`
InitContainers []v1.Container `json:"initContainers,omitempty"`
PodPriorityClassName string `json:"podPriorityClassName,omitempty"`
ShmVolume *bool `json:"enableShmVolume,omitempty"`
EnableLogicalBackup bool `json:"enableLogicalBackup,omitempty"`
LogicalBackupRetention string `json:"logicalBackupRetention,omitempty"`
LogicalBackupSchedule string `json:"logicalBackupSchedule,omitempty"`
StandbyCluster *StandbyDescription `json:"standby,omitempty"`
PodAnnotations map[string]string `json:"podAnnotations,omitempty"`
ServiceAnnotations map[string]string `json:"serviceAnnotations,omitempty"`
// MasterServiceAnnotations takes precedence over ServiceAnnotations for master role if not empty
MasterServiceAnnotations map[string]string `json:"masterServiceAnnotations,omitempty"`
// ReplicaServiceAnnotations takes precedence over ServiceAnnotations for replica role if not empty
7 changes: 7 additions & 0 deletions pkg/apis/acid.zalan.do/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.