Skip to content

Commit d8a17a7

Browse files
rkuo-danswerRichard Kuo (Danswer)
and authored
try using a redis replica in some areas (#3748)
* try using a redis replica in some areas * harden up replica usage * comment * slow down cloud dispatch temporarily * add ignored syncing list back * raise multiplier to 8 * comment out per tenant code (no longer used by fanout) --------- Co-authored-by: Richard Kuo (Danswer) <rkuo@onyx.app>
1 parent cbf98c0 commit d8a17a7

File tree

8 files changed

+112
-58
lines changed

8 files changed

+112
-58
lines changed

backend/onyx/background/celery/apps/beat.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
from datetime import timedelta
22
from typing import Any
3-
from typing import cast
43

54
from celery import Celery
65
from celery import signals
76
from celery.beat import PersistentScheduler # type: ignore
87
from celery.signals import beat_init
98

109
import onyx.background.celery.apps.app_base as app_base
11-
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
1210
from onyx.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME
1311
from onyx.db.engine import get_all_tenant_ids
1412
from onyx.db.engine import SqlEngine
@@ -132,21 +130,25 @@ def _try_updating_schedule(self) -> None:
132130
# get current schedule and extract current tenants
133131
current_schedule = self.schedule.items()
134132

135-
current_tenants = set()
136-
for task_name, _ in current_schedule:
137-
task_name = cast(str, task_name)
138-
if task_name.startswith(ONYX_CLOUD_CELERY_TASK_PREFIX):
139-
continue
140-
141-
if "_" in task_name:
142-
# example: "check-for-condition-tenant_12345678-abcd-efgh-ijkl-12345678"
143-
# -> "12345678-abcd-efgh-ijkl-12345678"
144-
current_tenants.add(task_name.split("_")[-1])
145-
logger.info(f"Found {len(current_tenants)} existing items in schedule")
146-
147-
for tenant_id in tenant_ids:
148-
if tenant_id not in current_tenants:
149-
logger.info(f"Processing new tenant: {tenant_id}")
133+
# there are no more per tenant beat tasks, so comment this out
134+
# NOTE: we may not actually need this scheduler any more and should
135+
# test reverting to a regular beat schedule implementation
136+
137+
# current_tenants = set()
138+
# for task_name, _ in current_schedule:
139+
# task_name = cast(str, task_name)
140+
# if task_name.startswith(ONYX_CLOUD_CELERY_TASK_PREFIX):
141+
# continue
142+
143+
# if "_" in task_name:
144+
# # example: "check-for-condition-tenant_12345678-abcd-efgh-ijkl-12345678"
145+
# # -> "12345678-abcd-efgh-ijkl-12345678"
146+
# current_tenants.add(task_name.split("_")[-1])
147+
# logger.info(f"Found {len(current_tenants)} existing items in schedule")
148+
149+
# for tenant_id in tenant_ids:
150+
# if tenant_id not in current_tenants:
151+
# logger.info(f"Processing new tenant: {tenant_id}")
150152

151153
new_schedule = self._generate_schedule(tenant_ids)
152154

backend/onyx/background/celery/tasks/beat_schedule.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
# it's only important that they run relatively regularly
1717
BEAT_EXPIRES_DEFAULT = 15 * 60 # 15 minutes (in seconds)
1818

19+
# hack to slow down task dispatch in the cloud until
20+
# we have a better implementation (backpressure, etc)
21+
CLOUD_BEAT_SCHEDULE_MULTIPLIER = 8
22+
1923
# tasks that only run in the cloud
2024
# the name attribute must start with ONYX_CLOUD_CELERY_TASK_PREFIX = "cloud" to be filtered
2125
# by the DynamicTenantScheduler
@@ -24,7 +28,7 @@
2428
{
2529
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-alembic",
2630
"task": OnyxCeleryTask.CLOUD_CHECK_ALEMBIC,
27-
"schedule": timedelta(hours=1),
31+
"schedule": timedelta(hours=1 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
2832
"options": {
2933
"queue": OnyxCeleryQueues.MONITORING,
3034
"priority": OnyxCeleryPriority.HIGH,
@@ -35,7 +39,7 @@
3539
{
3640
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-indexing",
3741
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
38-
"schedule": timedelta(seconds=15),
42+
"schedule": timedelta(seconds=15 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
3943
"options": {
4044
"priority": OnyxCeleryPriority.HIGHEST,
4145
"expires": BEAT_EXPIRES_DEFAULT,
@@ -47,7 +51,7 @@
4751
{
4852
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-connector-deletion",
4953
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
50-
"schedule": timedelta(seconds=20),
54+
"schedule": timedelta(seconds=20 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
5155
"options": {
5256
"priority": OnyxCeleryPriority.HIGHEST,
5357
"expires": BEAT_EXPIRES_DEFAULT,
@@ -59,7 +63,7 @@
5963
{
6064
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-vespa-sync",
6165
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
62-
"schedule": timedelta(seconds=20),
66+
"schedule": timedelta(seconds=20 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
6367
"options": {
6468
"priority": OnyxCeleryPriority.HIGHEST,
6569
"expires": BEAT_EXPIRES_DEFAULT,
@@ -71,7 +75,7 @@
7175
{
7276
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-prune",
7377
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
74-
"schedule": timedelta(seconds=15),
78+
"schedule": timedelta(seconds=15 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
7579
"options": {
7680
"priority": OnyxCeleryPriority.HIGHEST,
7781
"expires": BEAT_EXPIRES_DEFAULT,
@@ -83,7 +87,7 @@
8387
{
8488
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_monitor-vespa-sync",
8589
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
86-
"schedule": timedelta(seconds=5),
90+
"schedule": timedelta(seconds=15 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
8791
"options": {
8892
"priority": OnyxCeleryPriority.HIGHEST,
8993
"expires": BEAT_EXPIRES_DEFAULT,
@@ -95,7 +99,7 @@
9599
{
96100
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-doc-permissions-sync",
97101
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
98-
"schedule": timedelta(seconds=30),
102+
"schedule": timedelta(seconds=30 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
99103
"options": {
100104
"priority": OnyxCeleryPriority.HIGHEST,
101105
"expires": BEAT_EXPIRES_DEFAULT,
@@ -107,7 +111,7 @@
107111
{
108112
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-external-group-sync",
109113
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
110-
"schedule": timedelta(seconds=20),
114+
"schedule": timedelta(seconds=20 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
111115
"options": {
112116
"priority": OnyxCeleryPriority.HIGHEST,
113117
"expires": BEAT_EXPIRES_DEFAULT,
@@ -119,7 +123,7 @@
119123
{
120124
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_monitor-background-processes",
121125
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
122-
"schedule": timedelta(minutes=5),
126+
"schedule": timedelta(minutes=5 * CLOUD_BEAT_SCHEDULE_MULTIPLIER),
123127
"options": {
124128
"priority": OnyxCeleryPriority.HIGHEST,
125129
"expires": BEAT_EXPIRES_DEFAULT,
@@ -137,7 +141,9 @@
137141
{
138142
"name": f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_check-for-llm-model-update",
139143
"task": OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
140-
"schedule": timedelta(hours=1), # Check every hour
144+
"schedule": timedelta(
145+
hours=1 * CLOUD_BEAT_SCHEDULE_MULTIPLIER
146+
), # Check every hour
141147
"options": {
142148
"priority": OnyxCeleryPriority.HIGHEST,
143149
"expires": BEAT_EXPIRES_DEFAULT,

backend/onyx/background/celery/tasks/indexing/tasks.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
from onyx.natural_language_processing.search_nlp_models import warm_up_bi_encoder
4646
from onyx.redis.redis_connector import RedisConnector
4747
from onyx.redis.redis_pool import get_redis_client
48+
from onyx.redis.redis_pool import get_redis_replica_client
4849
from onyx.redis.redis_pool import redis_lock_dump
4950
from onyx.utils.logger import setup_logger
5051
from onyx.utils.variable_functionality import global_version
@@ -69,6 +70,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
6970
tasks_created = 0
7071
locked = False
7172
redis_client = get_redis_client(tenant_id=tenant_id)
73+
redis_client_replica = get_redis_replica_client(tenant_id=tenant_id)
7274

7375
# we need to use celery's redis client to access its redis data
7476
# (which lives on a different db number)
@@ -227,7 +229,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
227229
# or be currently executing
228230
try:
229231
validate_indexing_fences(
230-
tenant_id, self.app, redis_client, redis_client_celery, lock_beat
232+
tenant_id, redis_client_replica, redis_client_celery, lock_beat
231233
)
232234
except Exception:
233235
task_logger.exception("Exception while validating indexing fences")

backend/onyx/background/celery/tasks/indexing/utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,7 @@ def validate_indexing_fence(
291291

292292
def validate_indexing_fences(
293293
tenant_id: str | None,
294-
celery_app: Celery,
295-
r: Redis,
294+
r_replica: Redis,
296295
r_celery: Redis,
297296
lock_beat: RedisLock,
298297
) -> None:
@@ -301,7 +300,9 @@ def validate_indexing_fences(
301300
)
302301

303302
# validate all existing indexing jobs
304-
for key_bytes in r.scan_iter(
303+
# Use replica for this because the worst thing that happens
304+
# is that we don't run the validation on this pass
305+
for key_bytes in r_replica.scan_iter(
305306
RedisConnectorIndex.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
306307
):
307308
lock_beat.reacquire()

backend/onyx/background/celery/tasks/shared/tasks.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from onyx.redis.redis_pool import get_redis_client
3434
from onyx.redis.redis_pool import redis_lock_dump
3535
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
36+
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
3637

3738
DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES = 3
3839

@@ -247,6 +248,10 @@ def cloud_beat_task_generator(
247248
lock_beat.reacquire()
248249
last_lock_time = current_time
249250

251+
# needed in the cloud
252+
if IGNORED_SYNCING_TENANT_LIST and tenant_id in IGNORED_SYNCING_TENANT_LIST:
253+
continue
254+
250255
self.app.send_task(
251256
task_name,
252257
kwargs=dict(

backend/onyx/background/celery/tasks/vespa/tasks.py

Lines changed: 47 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
from onyx.redis.redis_connector_prune import RedisConnectorPrune
7979
from onyx.redis.redis_document_set import RedisDocumentSet
8080
from onyx.redis.redis_pool import get_redis_client
81+
from onyx.redis.redis_pool import get_redis_replica_client
8182
from onyx.redis.redis_pool import redis_lock_dump
8283
from onyx.redis.redis_pool import SCAN_ITER_COUNT_DEFAULT
8384
from onyx.redis.redis_usergroup import RedisUserGroup
@@ -895,6 +896,17 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
895896

896897
r = get_redis_client(tenant_id=tenant_id)
897898

899+
# Replica usage notes
900+
#
901+
# False negatives are OK. (aka fail to see a key that exists on the master).
902+
# We simply skip the monitoring work and it will be caught on the next pass.
903+
#
904+
# False positives are not OK, and are possible if we clear a fence on the master and
905+
# then read from the replica. In this case, monitoring work could be done on a fence
906+
# that no longer exists. To avoid this, we scan from the replica, but double check
907+
# the result on the master.
908+
r_replica = get_redis_replica_client(tenant_id=tenant_id)
909+
898910
lock_beat: RedisLock = r.lock(
899911
OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK,
900912
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
@@ -954,17 +966,19 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
954966
# scan and monitor activity to completion
955967
phase_start = time.monotonic()
956968
lock_beat.reacquire()
957-
if r.exists(RedisConnectorCredentialPair.get_fence_key()):
958-
monitor_connector_taskset(r)
969+
if r_replica.exists(RedisConnectorCredentialPair.get_fence_key()):
970+
if r.exists(RedisConnectorCredentialPair.get_fence_key()):
971+
monitor_connector_taskset(r)
959972
timings["connector"] = time.monotonic() - phase_start
960973
timings["connector_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
961974

962975
phase_start = time.monotonic()
963976
lock_beat.reacquire()
964-
for key_bytes in r.scan_iter(
977+
for key_bytes in r_replica.scan_iter(
965978
RedisConnectorDelete.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
966979
):
967-
monitor_connector_deletion_taskset(tenant_id, key_bytes, r)
980+
if r.exists(key_bytes):
981+
monitor_connector_deletion_taskset(tenant_id, key_bytes, r)
968982
lock_beat.reacquire()
969983

970984
timings["connector_deletion"] = time.monotonic() - phase_start
@@ -974,66 +988,74 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool | None:
974988

975989
phase_start = time.monotonic()
976990
lock_beat.reacquire()
977-
for key_bytes in r.scan_iter(
991+
for key_bytes in r_replica.scan_iter(
978992
RedisDocumentSet.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
979993
):
980-
with get_session_with_tenant(tenant_id) as db_session:
981-
monitor_document_set_taskset(tenant_id, key_bytes, r, db_session)
994+
if r.exists(key_bytes):
995+
with get_session_with_tenant(tenant_id) as db_session:
996+
monitor_document_set_taskset(tenant_id, key_bytes, r, db_session)
982997
lock_beat.reacquire()
983998
timings["documentset"] = time.monotonic() - phase_start
984999
timings["documentset_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
9851000

9861001
phase_start = time.monotonic()
9871002
lock_beat.reacquire()
988-
for key_bytes in r.scan_iter(
1003+
for key_bytes in r_replica.scan_iter(
9891004
RedisUserGroup.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
9901005
):
991-
monitor_usergroup_taskset = fetch_versioned_implementation_with_fallback(
992-
"onyx.background.celery.tasks.vespa.tasks",
993-
"monitor_usergroup_taskset",
994-
noop_fallback,
995-
)
996-
with get_session_with_tenant(tenant_id) as db_session:
997-
monitor_usergroup_taskset(tenant_id, key_bytes, r, db_session)
1006+
if r.exists(key_bytes):
1007+
monitor_usergroup_taskset = (
1008+
fetch_versioned_implementation_with_fallback(
1009+
"onyx.background.celery.tasks.vespa.tasks",
1010+
"monitor_usergroup_taskset",
1011+
noop_fallback,
1012+
)
1013+
)
1014+
with get_session_with_tenant(tenant_id) as db_session:
1015+
monitor_usergroup_taskset(tenant_id, key_bytes, r, db_session)
9981016
lock_beat.reacquire()
9991017
timings["usergroup"] = time.monotonic() - phase_start
10001018
timings["usergroup_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
10011019

10021020
phase_start = time.monotonic()
10031021
lock_beat.reacquire()
1004-
for key_bytes in r.scan_iter(
1022+
for key_bytes in r_replica.scan_iter(
10051023
RedisConnectorPrune.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
10061024
):
1007-
with get_session_with_tenant(tenant_id) as db_session:
1008-
monitor_ccpair_pruning_taskset(tenant_id, key_bytes, r, db_session)
1025+
if r.exists(key_bytes):
1026+
with get_session_with_tenant(tenant_id) as db_session:
1027+
monitor_ccpair_pruning_taskset(tenant_id, key_bytes, r, db_session)
10091028
lock_beat.reacquire()
10101029
timings["pruning"] = time.monotonic() - phase_start
10111030
timings["pruning_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
10121031

10131032
phase_start = time.monotonic()
10141033
lock_beat.reacquire()
1015-
for key_bytes in r.scan_iter(
1034+
for key_bytes in r_replica.scan_iter(
10161035
RedisConnectorIndex.FENCE_PREFIX + "*", count=SCAN_ITER_COUNT_DEFAULT
10171036
):
1018-
with get_session_with_tenant(tenant_id) as db_session:
1019-
monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session)
1037+
if r.exists(key_bytes):
1038+
with get_session_with_tenant(tenant_id) as db_session:
1039+
monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session)
10201040
lock_beat.reacquire()
10211041
timings["indexing"] = time.monotonic() - phase_start
10221042
timings["indexing_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
10231043

10241044
phase_start = time.monotonic()
10251045
lock_beat.reacquire()
1026-
for key_bytes in r.scan_iter(
1046+
for key_bytes in r_replica.scan_iter(
10271047
RedisConnectorPermissionSync.FENCE_PREFIX + "*",
10281048
count=SCAN_ITER_COUNT_DEFAULT,
10291049
):
1030-
with get_session_with_tenant(tenant_id) as db_session:
1031-
monitor_ccpair_permissions_taskset(tenant_id, key_bytes, r, db_session)
1050+
if r.exists(key_bytes):
1051+
with get_session_with_tenant(tenant_id) as db_session:
1052+
monitor_ccpair_permissions_taskset(
1053+
tenant_id, key_bytes, r, db_session
1054+
)
10321055
lock_beat.reacquire()
10331056

10341057
timings["permissions"] = time.monotonic() - phase_start
10351058
timings["permissions_ttl"] = r.ttl(OnyxRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
1036-
10371059
except SoftTimeLimitExceeded:
10381060
task_logger.info(
10391061
"Soft time limit exceeded, task is being terminated gracefully."

backend/onyx/configs/app_configs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@
200200
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
201201
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD") or ""
202202

203+
# this assumes that other redis settings remain the same as the primary
204+
REDIS_REPLICA_HOST = os.environ.get("REDIS_REPLICA_HOST") or REDIS_HOST
203205

204206
REDIS_AUTH_KEY_PREFIX = "fastapi_users_token:"
205207

0 commit comments

Comments
 (0)