78
78
from onyx .redis .redis_connector_prune import RedisConnectorPrune
79
79
from onyx .redis .redis_document_set import RedisDocumentSet
80
80
from onyx .redis .redis_pool import get_redis_client
81
+ from onyx .redis .redis_pool import get_redis_replica_client
81
82
from onyx .redis .redis_pool import redis_lock_dump
82
83
from onyx .redis .redis_pool import SCAN_ITER_COUNT_DEFAULT
83
84
from onyx .redis .redis_usergroup import RedisUserGroup
@@ -895,6 +896,17 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
895
896
896
897
r = get_redis_client (tenant_id = tenant_id )
897
898
899
+ # Replica usage notes
900
+ #
901
+ # False negatives are OK (i.e. failing to see a key that exists on the master).
902
+ # We simply skip the monitoring work and it will be caught on the next pass.
903
+ #
904
+ # False positives are not OK, and are possible if we clear a fence on the master and
905
+ # then read from the replica. In this case, monitoring work could be done on a fence
906
+ # that no longer exists. To avoid this, we scan from the replica, but double check
907
+ # the result on the master.
908
+ r_replica = get_redis_replica_client (tenant_id = tenant_id )
909
+
898
910
lock_beat : RedisLock = r .lock (
899
911
OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK ,
900
912
timeout = CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT ,
@@ -954,17 +966,19 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
954
966
# scan and monitor activity to completion
955
967
phase_start = time .monotonic ()
956
968
lock_beat .reacquire ()
957
- if r .exists (RedisConnectorCredentialPair .get_fence_key ()):
958
- monitor_connector_taskset (r )
969
+ if r_replica .exists (RedisConnectorCredentialPair .get_fence_key ()):
970
+ if r .exists (RedisConnectorCredentialPair .get_fence_key ()):
971
+ monitor_connector_taskset (r )
959
972
timings ["connector" ] = time .monotonic () - phase_start
960
973
timings ["connector_ttl" ] = r .ttl (OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK )
961
974
962
975
phase_start = time .monotonic ()
963
976
lock_beat .reacquire ()
964
- for key_bytes in r .scan_iter (
977
+ for key_bytes in r_replica .scan_iter (
965
978
RedisConnectorDelete .FENCE_PREFIX + "*" , count = SCAN_ITER_COUNT_DEFAULT
966
979
):
967
- monitor_connector_deletion_taskset (tenant_id , key_bytes , r )
980
+ if r .exists (key_bytes ):
981
+ monitor_connector_deletion_taskset (tenant_id , key_bytes , r )
968
982
lock_beat .reacquire ()
969
983
970
984
timings ["connector_deletion" ] = time .monotonic () - phase_start
@@ -974,66 +988,74 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
974
988
975
989
phase_start = time .monotonic ()
976
990
lock_beat .reacquire ()
977
- for key_bytes in r .scan_iter (
991
+ for key_bytes in r_replica .scan_iter (
978
992
RedisDocumentSet .FENCE_PREFIX + "*" , count = SCAN_ITER_COUNT_DEFAULT
979
993
):
980
- with get_session_with_tenant (tenant_id ) as db_session :
981
- monitor_document_set_taskset (tenant_id , key_bytes , r , db_session )
994
+ if r .exists (key_bytes ):
995
+ with get_session_with_tenant (tenant_id ) as db_session :
996
+ monitor_document_set_taskset (tenant_id , key_bytes , r , db_session )
982
997
lock_beat .reacquire ()
983
998
timings ["documentset" ] = time .monotonic () - phase_start
984
999
timings ["documentset_ttl" ] = r .ttl (OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK )
985
1000
986
1001
phase_start = time .monotonic ()
987
1002
lock_beat .reacquire ()
988
- for key_bytes in r .scan_iter (
1003
+ for key_bytes in r_replica .scan_iter (
989
1004
RedisUserGroup .FENCE_PREFIX + "*" , count = SCAN_ITER_COUNT_DEFAULT
990
1005
):
991
- monitor_usergroup_taskset = fetch_versioned_implementation_with_fallback (
992
- "onyx.background.celery.tasks.vespa.tasks" ,
993
- "monitor_usergroup_taskset" ,
994
- noop_fallback ,
995
- )
996
- with get_session_with_tenant (tenant_id ) as db_session :
997
- monitor_usergroup_taskset (tenant_id , key_bytes , r , db_session )
1006
+ if r .exists (key_bytes ):
1007
+ monitor_usergroup_taskset = (
1008
+ fetch_versioned_implementation_with_fallback (
1009
+ "onyx.background.celery.tasks.vespa.tasks" ,
1010
+ "monitor_usergroup_taskset" ,
1011
+ noop_fallback ,
1012
+ )
1013
+ )
1014
+ with get_session_with_tenant (tenant_id ) as db_session :
1015
+ monitor_usergroup_taskset (tenant_id , key_bytes , r , db_session )
998
1016
lock_beat .reacquire ()
999
1017
timings ["usergroup" ] = time .monotonic () - phase_start
1000
1018
timings ["usergroup_ttl" ] = r .ttl (OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK )
1001
1019
1002
1020
phase_start = time .monotonic ()
1003
1021
lock_beat .reacquire ()
1004
- for key_bytes in r .scan_iter (
1022
+ for key_bytes in r_replica .scan_iter (
1005
1023
RedisConnectorPrune .FENCE_PREFIX + "*" , count = SCAN_ITER_COUNT_DEFAULT
1006
1024
):
1007
- with get_session_with_tenant (tenant_id ) as db_session :
1008
- monitor_ccpair_pruning_taskset (tenant_id , key_bytes , r , db_session )
1025
+ if r .exists (key_bytes ):
1026
+ with get_session_with_tenant (tenant_id ) as db_session :
1027
+ monitor_ccpair_pruning_taskset (tenant_id , key_bytes , r , db_session )
1009
1028
lock_beat .reacquire ()
1010
1029
timings ["pruning" ] = time .monotonic () - phase_start
1011
1030
timings ["pruning_ttl" ] = r .ttl (OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK )
1012
1031
1013
1032
phase_start = time .monotonic ()
1014
1033
lock_beat .reacquire ()
1015
- for key_bytes in r .scan_iter (
1034
+ for key_bytes in r_replica .scan_iter (
1016
1035
RedisConnectorIndex .FENCE_PREFIX + "*" , count = SCAN_ITER_COUNT_DEFAULT
1017
1036
):
1018
- with get_session_with_tenant (tenant_id ) as db_session :
1019
- monitor_ccpair_indexing_taskset (tenant_id , key_bytes , r , db_session )
1037
+ if r .exists (key_bytes ):
1038
+ with get_session_with_tenant (tenant_id ) as db_session :
1039
+ monitor_ccpair_indexing_taskset (tenant_id , key_bytes , r , db_session )
1020
1040
lock_beat .reacquire ()
1021
1041
timings ["indexing" ] = time .monotonic () - phase_start
1022
1042
timings ["indexing_ttl" ] = r .ttl (OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK )
1023
1043
1024
1044
phase_start = time .monotonic ()
1025
1045
lock_beat .reacquire ()
1026
- for key_bytes in r .scan_iter (
1046
+ for key_bytes in r_replica .scan_iter (
1027
1047
RedisConnectorPermissionSync .FENCE_PREFIX + "*" ,
1028
1048
count = SCAN_ITER_COUNT_DEFAULT ,
1029
1049
):
1030
- with get_session_with_tenant (tenant_id ) as db_session :
1031
- monitor_ccpair_permissions_taskset (tenant_id , key_bytes , r , db_session )
1050
+ if r .exists (key_bytes ):
1051
+ with get_session_with_tenant (tenant_id ) as db_session :
1052
+ monitor_ccpair_permissions_taskset (
1053
+ tenant_id , key_bytes , r , db_session
1054
+ )
1032
1055
lock_beat .reacquire ()
1033
1056
1034
1057
timings ["permissions" ] = time .monotonic () - phase_start
1035
1058
timings ["permissions_ttl" ] = r .ttl (OnyxRedisLocks .MONITOR_VESPA_SYNC_BEAT_LOCK )
1036
-
1037
1059
except SoftTimeLimitExceeded :
1038
1060
task_logger .info (
1039
1061
"Soft time limit exceeded, task is being terminated gracefully."
0 commit comments