35
35
key_vault_name = os .getenv ("AZURE_KEY_VAULT_NAME" )
36
36
key_vault_uri = f"https://{ key_vault_name } .vault.azure.net/"
37
37
use_namespaces = os .getenv ("USE_NAMESPACES" , "false" ).lower () in ("true" , "1" , "yes" , "enabled" )
38
- check_interval = int (os .getenv ("CHECK_INTERVAL" , 300 ))
38
+ check_interval = int (os .getenv ("CHECK_INTERVAL" , " 300" ))
39
39
filter_annotation = os .getenv ("ANNOTATION" , "cert-manager.io/certificate-name" )
40
40
certificate_name_filter = os .getenv ("CERT_NAME_FILTER" , "*" )
41
41
42
42
# GitHub version check variables
43
43
github_repository_owner = os .getenv ("GITHUB_REPO_OWNER" , "rdvansloten" )
44
44
github_repository_name = os .getenv ("GITHUB_REPO_NAME" , "cert-manager-key-vault-sync" )
45
45
version_check_interval = os .getenv ("VERSION_CHECK_INTERVAL" , "86400" )
46
- current_version = "v1.2 .0"
46
+ current_version = "v1.3 .0"
47
47
check_version = os .getenv ("CHECK_VERSION" , "true" ).lower ()
48
48
49
49
# Leader election variables
50
50
lease_name = os .getenv ("LEADER_ELECTION_LEASE_NAME" , "cert-manager-key-vault-sync-leader" )
51
51
lease_namespace = os .getenv ("POD_NAMESPACE" , "cert-manager-key-vault-sync" )
52
- lease_duration_seconds = int (os .getenv ("LEASE_DURATION_SECONDS" , 60 ))
53
- renew_interval_seconds = int (os .getenv ("RENEW_INTERVAL_SECONDS" , 60 ))
52
+ lease_duration_seconds = int (os .getenv ("LEASE_DURATION_SECONDS" , "60" ))
53
+ renew_interval_seconds = int (os .getenv ("RENEW_INTERVAL_SECONDS" , "60" ))
54
54
pod_name = os .getenv ("POD_NAME" , "unknown" )
55
55
leader_active = True
56
56
59
59
logging .info (f"Using Key Vault: { key_vault_uri } " )
60
60
logging .info (f"Using Namespace separation: { str (use_namespaces ).lower ()} " )
61
61
logging .info (f"Using certificate name filter: { certificate_name_filter } " )
62
- logging .info (f "Using annotation filter: { filter_annotation } " )
62
+ logging .info ("Using annotation filter: %s" , filter_annotation )
63
63
logging .info (f"Using version check interval: { version_check_interval } " )
64
64
logging .info (f"Using GitHub version check: { check_version } " )
65
65
66
- # Initialize Key Vault client
67
- try :
68
- credential = DefaultAzureCredential (exclude_interactive_browser_credential = False , additionally_allowed_tenants = "*" )
69
- certificate_client = CertificateClient (vault_url = key_vault_uri , credential = credential )
66
+ # Initialize Kubernetes client (in-cluster config)
67
+ config .load_incluster_config ()
68
+ k8s_client = client .CoreV1Api ()
70
69
71
- logging . info ( "Detected Key Vault Certificates:" )
72
- for cert in certificate_client . list_properties_of_certificates ():
73
- logging . info ( f"- { cert . name } " )
70
+ # Azure credential and Key Vault client will be initialized after leadership is acquired
71
+ credential = None
72
+ certificate_client = None
74
73
75
- logging .info (f"Initialized Azure Key Vault client using Key Vault '{ key_vault_name } '." )
74
+ def init_key_vault_client ():
75
+ '''Initialize the Azure Key Vault client using DefaultAzureCredential.'''
76
+ global credential , certificate_client
77
+ # Lazy initialization if not already done
78
+ if certificate_client is not None :
79
+ return
76
80
77
- except ResourceNotFoundError as e :
78
- logging .error (f"Failed to connect to Key Vault '{ key_vault_name } ': { str (e )} " )
79
- raise
81
+ credential = DefaultAzureCredential (exclude_interactive_browser_credential = False , additionally_allowed_tenants = "*" )
82
+ certificate_client = CertificateClient (vault_url = key_vault_uri , credential = credential )
80
83
81
- except ServiceRequestError as e :
82
- logging .error (f"Failed to connect to Key Vault '{ key_vault_name } ': { str (e )} " )
83
- raise
84
+ try :
85
+ logging .info ("Detected Key Vault Certificates:" )
86
+ for cert in certificate_client .list_properties_of_certificates ():
87
+ logging .info (cert .name )
84
88
85
- except Exception as e :
86
- logging .error (f"Failed to connect to Key Vault '{ key_vault_name } ': { str (e )} " )
87
- raise
89
+ logging .info (f"Initialized Azure Key Vault client using Key Vault '{ key_vault_name } '." )
88
90
89
- # Initialize Kubernetes client (in-cluster config)
90
- config . load_incluster_config ( )
91
- k8s_client = client . CoreV1Api ()
91
+ except ResourceNotFoundError as e :
92
+ logging . error ( f"Failed to connect to Key Vault ' { key_vault_name } ': { str ( e ) } " )
93
+ raise
92
94
95
+ except ServiceRequestError as e :
96
+ logging .error (f"Failed to connect to Key Vault '{ key_vault_name } ': { str (e )} " )
97
+ raise
98
+
99
+ except Exception as e :
100
+ logging .error (f"Failed to connect to Key Vault '{ key_vault_name } ': { str (e )} " )
101
+ raise
93
102
94
103
# Leader election functions
95
104
def get_lease (api ):
@@ -114,24 +123,23 @@ def create_lease(api):
114
123
)
115
124
try :
116
125
created = api .create_namespaced_lease (lease_namespace , lease )
117
- logging .info (f"[ { pod_name } ] Created lease; acquired leadership." )
126
+ logging .info (f"Pod { pod_name } created lease; acquired leadership." )
118
127
return created
119
128
except client .exceptions .ApiException as e :
120
- logging .error (f"[ { pod_name } ] Error creating lease: { e } " )
129
+ logging .error (f"Pod { pod_name } could not create a lease: { e } " )
121
130
return None
122
131
123
-
124
132
def try_acquire_leadership (api ):
125
133
now = datetime .datetime .now (datetime .timezone .utc )
134
+ # Fetch current Lease (or None if it doesn’t exist)
126
135
lease = get_lease (api )
127
136
if lease is None :
137
+ # Try to create it (first comers win)
128
138
lease = create_lease (api )
129
- if lease is not None :
130
- return True
131
- else :
132
- return False
139
+ return True if lease is not None else False
133
140
134
141
spec = lease .spec
142
+ # Determine if the existing lease has expired
135
143
if spec .renew_time is None :
136
144
expired = True
137
145
else :
@@ -140,23 +148,32 @@ def try_acquire_leadership(api):
140
148
last_renew = datetime .datetime .fromisoformat (last_renew .replace ("Z" , "+00:00" ))
141
149
expired = (now - last_renew ).total_seconds () > spec .lease_duration_seconds
142
150
151
+ # If we already hold it or it’s expired, try to take it
143
152
if spec .holder_identity == pod_name or expired :
144
153
lease .spec .holder_identity = pod_name
145
154
lease .spec .acquire_time = now
146
155
lease .spec .renew_time = now
147
156
lease .spec .lease_duration_seconds = lease_duration_seconds
157
+
148
158
try :
149
159
api .replace_namespaced_lease (lease_name , lease_namespace , lease )
150
- logging .info (f"[ { pod_name } ] Acquired /renewed leadership." )
160
+ logging .info (f"Pod { pod_name } acquired /renewed leadership." )
151
161
return True
162
+
152
163
except client .exceptions .ApiException as e :
153
- logging .error (f"[{ pod_name } ] Failed to update lease: { e } " )
154
- return False
164
+ if e .status == 409 :
165
+ # Another pod updated the lease first—just back off
166
+ logging .debug (f"Pod { pod_name } had a lease update conflict; leadership held elsewhere." )
167
+ return False
168
+ else :
169
+ logging .error (f"Pod { pod_name } has failed to update lease: { e } " )
170
+ return False
171
+
155
172
else :
156
- logging .debug (f"[{ pod_name } ] Leadership held by { spec .holder_identity } ." )
173
+ # Someone else still holds a valid lease
174
+ logging .debug (f"Leadership held by { spec .holder_identity } ." )
157
175
return False
158
176
159
-
160
177
def renew_leadership (api ):
161
178
global leader_active
162
179
while leader_active :
@@ -165,18 +182,18 @@ def renew_leadership(api):
165
182
try :
166
183
lease = get_lease (api )
167
184
if lease is None :
168
- logging .error (f"[ { pod_name } ] Lease not found." )
185
+ logging .error (f"Lease not found for Pod { pod_name } ." )
169
186
leader_active = False
170
187
break
171
188
if lease .spec .holder_identity != pod_name :
172
- logging .error (f"[ { pod_name } ] Leadership lost (current leader: { lease .spec .holder_identity } )." )
189
+ logging .error (f"Pod { pod_name } has lost leadership. (current leader: { lease .spec .holder_identity } )." )
173
190
leader_active = False
174
191
break
175
192
lease .spec .renew_time = now
176
193
api .replace_namespaced_lease (lease_name , lease_namespace , lease )
177
- logging .info (f"[ { pod_name } ] Renewed leadership at { now .isoformat ()} ." )
194
+ logging .info (f"Pod { pod_name } has renewed leadership at { now .isoformat ()} ." )
178
195
except client .exceptions .ApiException as e :
179
- logging .error (f"[ { pod_name } ] Error renewing lease: { e } " )
196
+ logging .error (f"Pod { pod_name } had an error renewing lease: { e } " )
180
197
leader_active = False
181
198
break
182
199
@@ -372,28 +389,29 @@ def periodic_check():
372
389
def main ():
373
390
global leader_active
374
391
logging .info ("Starting cert-manager-key-vault-sync process." )
375
- schedule_version_check ()
376
- load_initial_state ()
377
-
378
- # Start Prometheus metrics server on port 8000
379
- start_http_server (8000 )
380
- logging .info ("Prometheus metrics server started on port 8000" )
381
392
382
393
coordination_api = client .CoordinationV1Api ()
383
394
while True :
384
395
if try_acquire_leadership (coordination_api ):
385
396
threading .Thread (target = renew_leadership , args = (coordination_api ,), daemon = True ).start ()
386
- logging .info (f"Acquired leadership as { pod_name } . Starting sync loop." )
397
+ logging .info (f"Pod { pod_name } acquired leadership. Starting sync loop." )
398
+ init_key_vault_client ()
399
+ start_http_server (8000 )
400
+ logging .info ("Prometheus metrics server started on port 8000" )
387
401
break
388
402
else :
389
- logging .debug (f"Not the leader, retrying in { renew_interval_seconds } seconds." )
403
+ logging .debug (f"This Pod ( { pod_name } ) is not the leader, retrying in { renew_interval_seconds } seconds." )
390
404
time .sleep (renew_interval_seconds )
391
405
392
406
# Only run the following if this replica is the leader.
393
407
while leader_active :
394
408
sync_total .inc ()
395
409
sync_start = time .time ()
396
410
try :
411
+ schedule_version_check ()
412
+ load_initial_state ()
413
+
414
+ # Start Prometheus metrics server on port 8000
397
415
sync_k8s_secrets_to_key_vault ()
398
416
sync_success_total .inc ()
399
417
except Exception as e :
@@ -409,6 +427,5 @@ def main():
409
427
logging .error ("Leadership lost, exiting process." )
410
428
exit (1 )
411
429
412
-
413
430
if __name__ == "__main__" :
414
431
main ()
0 commit comments