Skip to content

Commit 95701db

Browse files
authored
Add more sync records + fix small bug in monitoring task causing deletion metrics to never be emitted (#3837)
Double check we don't double-emit + fix pruning metric; add log; fix comment; rename
1 parent 2410525 commit 95701db

File tree

7 files changed

+275
-106
lines changed

7 files changed

+275
-106
lines changed

backend/onyx/background/celery/tasks/doc_permission_syncing/tasks.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from celery.exceptions import SoftTimeLimitExceeded
1212
from redis import Redis
1313
from redis.lock import Lock as RedisLock
14+
from sqlalchemy.orm import Session
1415

1516
from ee.onyx.db.connector_credential_pair import get_all_auto_sync_cc_pairs
1617
from ee.onyx.db.document import upsert_document_external_perms
@@ -31,12 +32,17 @@
3132
from onyx.configs.constants import OnyxCeleryQueues
3233
from onyx.configs.constants import OnyxCeleryTask
3334
from onyx.configs.constants import OnyxRedisLocks
35+
from onyx.db.connector import mark_cc_pair_as_permissions_synced
3436
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
3537
from onyx.db.document import upsert_document_by_connector_credential_pair
3638
from onyx.db.engine import get_session_with_tenant
3739
from onyx.db.enums import AccessType
3840
from onyx.db.enums import ConnectorCredentialPairStatus
41+
from onyx.db.enums import SyncStatus
42+
from onyx.db.enums import SyncType
3943
from onyx.db.models import ConnectorCredentialPair
44+
from onyx.db.sync_record import insert_sync_record
45+
from onyx.db.sync_record import update_sync_record_status
4046
from onyx.db.users import batch_add_ext_perm_user_if_not_exists
4147
from onyx.redis.redis_connector import RedisConnector
4248
from onyx.redis.redis_connector_doc_perm_sync import (
@@ -57,6 +63,9 @@
5763
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
5864

5965

66+
"""Jobs / utils for kicking off doc permissions sync tasks."""
67+
68+
6069
def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
6170
"""Returns boolean indicating if external doc permissions sync is due."""
6271

@@ -174,6 +183,15 @@ def try_creating_permissions_sync_task(
174183

175184
custom_task_id = f"{redis_connector.permissions.generator_task_key}_{uuid4()}"
176185

186+
# create before setting fence to avoid race condition where the monitoring
187+
# task updates the sync record before it is created
188+
with get_session_with_tenant(tenant_id) as db_session:
189+
insert_sync_record(
190+
db_session=db_session,
191+
entity_id=cc_pair_id,
192+
sync_type=SyncType.EXTERNAL_PERMISSIONS,
193+
)
194+
177195
# set a basic fence to start
178196
payload = RedisConnectorPermissionSyncPayload(started=None, celery_task_id=None)
179197
redis_connector.permissions.set_fence(payload)
@@ -400,3 +418,53 @@ def update_external_document_permissions_task(
400418
f"Error Syncing Document Permissions: connector_id={connector_id} doc_id={doc_id}"
401419
)
402420
return False
421+
422+
423+
"""Monitoring CCPair permissions utils, called in monitor_vespa_sync"""
424+
425+
426+
def monitor_ccpair_permissions_taskset(
427+
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
428+
) -> None:
429+
fence_key = key_bytes.decode("utf-8")
430+
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
431+
if cc_pair_id_str is None:
432+
task_logger.warning(
433+
f"monitor_ccpair_permissions_taskset: could not parse cc_pair_id from {fence_key}"
434+
)
435+
return
436+
437+
cc_pair_id = int(cc_pair_id_str)
438+
439+
redis_connector = RedisConnector(tenant_id, cc_pair_id)
440+
if not redis_connector.permissions.fenced:
441+
return
442+
443+
initial = redis_connector.permissions.generator_complete
444+
if initial is None:
445+
return
446+
447+
remaining = redis_connector.permissions.get_remaining()
448+
task_logger.info(
449+
f"Permissions sync progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
450+
)
451+
if remaining > 0:
452+
return
453+
454+
payload: RedisConnectorPermissionSyncPayload | None = (
455+
redis_connector.permissions.payload
456+
)
457+
start_time: datetime | None = payload.started if payload else None
458+
459+
mark_cc_pair_as_permissions_synced(db_session, int(cc_pair_id), start_time)
460+
task_logger.info(f"Successfully synced permissions for cc_pair={cc_pair_id}")
461+
462+
update_sync_record_status(
463+
db_session=db_session,
464+
entity_id=cc_pair_id,
465+
sync_type=SyncType.EXTERNAL_PERMISSIONS,
466+
sync_status=SyncStatus.SUCCESS,
467+
num_docs_synced=initial,
468+
)
469+
470+
redis_connector.permissions.reset()

backend/onyx/background/celery/tasks/external_group_syncing/tasks.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,11 @@
3333
from onyx.db.engine import get_session_with_tenant
3434
from onyx.db.enums import AccessType
3535
from onyx.db.enums import ConnectorCredentialPairStatus
36+
from onyx.db.enums import SyncStatus
37+
from onyx.db.enums import SyncType
3638
from onyx.db.models import ConnectorCredentialPair
39+
from onyx.db.sync_record import insert_sync_record
40+
from onyx.db.sync_record import update_sync_record_status
3741
from onyx.redis.redis_connector import RedisConnector
3842
from onyx.redis.redis_connector_ext_group_sync import (
3943
RedisConnectorExternalGroupSyncPayload,
@@ -200,6 +204,15 @@ def try_creating_external_group_sync_task(
200204
celery_task_id=result.id,
201205
)
202206

207+
# create before setting fence to avoid race condition where the monitoring
208+
# task updates the sync record before it is created
209+
with get_session_with_tenant(tenant_id) as db_session:
210+
insert_sync_record(
211+
db_session=db_session,
212+
entity_id=cc_pair_id,
213+
sync_type=SyncType.EXTERNAL_GROUP,
214+
)
215+
203216
redis_connector.external_group_sync.set_fence(payload)
204217

205218
except Exception:
@@ -289,11 +302,26 @@ def connector_external_group_sync_generator_task(
289302
)
290303

291304
mark_cc_pair_as_external_group_synced(db_session, cc_pair.id)
305+
306+
update_sync_record_status(
307+
db_session=db_session,
308+
entity_id=cc_pair_id,
309+
sync_type=SyncType.EXTERNAL_GROUP,
310+
sync_status=SyncStatus.SUCCESS,
311+
)
292312
except Exception as e:
293313
task_logger.exception(
294314
f"Failed to run external group sync: cc_pair={cc_pair_id}"
295315
)
296316

317+
with get_session_with_tenant(tenant_id) as db_session:
318+
update_sync_record_status(
319+
db_session=db_session,
320+
entity_id=cc_pair_id,
321+
sync_type=SyncType.EXTERNAL_GROUP,
322+
sync_status=SyncStatus.FAILED,
323+
)
324+
297325
redis_connector.external_group_sync.generator_clear()
298326
redis_connector.external_group_sync.taskset_clear()
299327
raise e

backend/onyx/background/celery/tasks/monitoring/tasks.py

Lines changed: 106 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@
5858
"sync_start_latency:{sync_type}:{entity_id}:{sync_record_id}"
5959
)
6060

61+
_CONNECTOR_START_TIME_KEY_FMT = "connector_start_time:{cc_pair_id}:{index_attempt_id}"
62+
_CONNECTOR_END_TIME_KEY_FMT = "connector_end_time:{cc_pair_id}:{index_attempt_id}"
63+
_SYNC_START_TIME_KEY_FMT = "sync_start_time:{sync_type}:{entity_id}:{sync_record_id}"
64+
_SYNC_END_TIME_KEY_FMT = "sync_end_time:{sync_type}:{entity_id}:{sync_record_id}"
65+
6166

6267
def _mark_metric_as_emitted(redis_std: Redis, key: str) -> None:
6368
"""Mark a metric as having been emitted by setting a Redis key with expiration"""
@@ -303,8 +308,6 @@ def _build_connector_final_metrics(
303308
)
304309
)
305310

306-
_mark_metric_as_emitted(redis_std, metric_key)
307-
308311
return metrics
309312

310313

@@ -344,6 +347,52 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
344347
if one_hour_ago > most_recent_attempt.time_created:
345348
continue
346349

350+
# Build a job_id for correlation
351+
job_id = build_job_id(
352+
"connector", str(cc_pair.id), str(most_recent_attempt.id)
353+
)
354+
355+
# Add raw start time metric if available
356+
if most_recent_attempt.time_started:
357+
start_time_key = _CONNECTOR_START_TIME_KEY_FMT.format(
358+
cc_pair_id=cc_pair.id,
359+
index_attempt_id=most_recent_attempt.id,
360+
)
361+
metrics.append(
362+
Metric(
363+
key=start_time_key,
364+
name="connector_start_time",
365+
value=most_recent_attempt.time_started.timestamp(),
366+
tags={
367+
"job_id": job_id,
368+
"connector_id": str(cc_pair.connector.id),
369+
"source": str(cc_pair.connector.source),
370+
},
371+
)
372+
)
373+
374+
# Add raw end time metric if available and in terminal state
375+
if (
376+
most_recent_attempt.status.is_terminal()
377+
and most_recent_attempt.time_updated
378+
):
379+
end_time_key = _CONNECTOR_END_TIME_KEY_FMT.format(
380+
cc_pair_id=cc_pair.id,
381+
index_attempt_id=most_recent_attempt.id,
382+
)
383+
metrics.append(
384+
Metric(
385+
key=end_time_key,
386+
name="connector_end_time",
387+
value=most_recent_attempt.time_updated.timestamp(),
388+
tags={
389+
"job_id": job_id,
390+
"connector_id": str(cc_pair.connector.id),
391+
"source": str(cc_pair.connector.source),
392+
},
393+
)
394+
)
395+
347396
# Connector start latency
348397
start_latency_metric = _build_connector_start_latency_metric(
349398
cc_pair, most_recent_attempt, second_most_recent_attempt, redis_std
@@ -365,9 +414,10 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
365414
"""
366415
Collect metrics for document set and group syncing:
367416
- Success/failure status
368-
- Start latency (always)
417+
- Start latency (for doc sets / user groups)
369418
- Duration & doc count (only if success)
370419
- Throughput (docs/min) (only if success)
420+
- Raw start/end times for each sync
371421
"""
372422
one_hour_ago = get_db_current_time(db_session) - timedelta(hours=1)
373423

@@ -389,6 +439,43 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
389439
# Build a job_id for correlation
390440
job_id = build_job_id("sync_record", str(sync_record.id))
391441

442+
# Add raw start time metric
443+
start_time_key = _SYNC_START_TIME_KEY_FMT.format(
444+
sync_type=sync_record.sync_type,
445+
entity_id=sync_record.entity_id,
446+
sync_record_id=sync_record.id,
447+
)
448+
metrics.append(
449+
Metric(
450+
key=start_time_key,
451+
name="sync_start_time",
452+
value=sync_record.sync_start_time.timestamp(),
453+
tags={
454+
"job_id": job_id,
455+
"sync_type": str(sync_record.sync_type),
456+
},
457+
)
458+
)
459+
460+
# Add raw end time metric if available
461+
if sync_record.sync_end_time:
462+
end_time_key = _SYNC_END_TIME_KEY_FMT.format(
463+
sync_type=sync_record.sync_type,
464+
entity_id=sync_record.entity_id,
465+
sync_record_id=sync_record.id,
466+
)
467+
metrics.append(
468+
Metric(
469+
key=end_time_key,
470+
name="sync_end_time",
471+
value=sync_record.sync_end_time.timestamp(),
472+
tags={
473+
"job_id": job_id,
474+
"sync_type": str(sync_record.sync_type),
475+
},
476+
)
477+
)
478+
392479
# Emit a SUCCESS/FAIL boolean metric
393480
# Use a single Redis key to avoid re-emitting final metrics
394481
final_metric_key = _FINAL_METRIC_KEY_FMT.format(
@@ -439,7 +526,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
439526
if duration_seconds is not None:
440527
metrics.append(
441528
Metric(
442-
key=None,
529+
key=final_metric_key,
443530
name="sync_duration_seconds",
444531
value=duration_seconds,
445532
tags={
@@ -455,7 +542,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
455542

456543
metrics.append(
457544
Metric(
458-
key=None,
545+
key=final_metric_key,
459546
name="sync_doc_count",
460547
value=doc_count,
461548
tags={
@@ -468,7 +555,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
468555
if sync_speed is not None:
469556
metrics.append(
470557
Metric(
471-
key=None,
558+
key=final_metric_key,
472559
name="sync_speed_docs_per_min",
473560
value=sync_speed,
474561
tags={
@@ -482,9 +569,6 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
482569
f"Invalid sync record {sync_record.id} with no duration"
483570
)
484571

485-
# Mark final metrics as emitted so we don't re-emit
486-
_mark_metric_as_emitted(redis_std, final_metric_key)
487-
488572
# Emit start latency
489573
start_latency_key = _SYNC_START_LATENCY_KEY_FMT.format(
490574
sync_type=sync_record.sync_type,
@@ -502,22 +586,20 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
502586
entity = db_session.scalar(
503587
select(UserGroup).where(UserGroup.id == sync_record.entity_id)
504588
)
505-
else:
506-
task_logger.info(
507-
f"Skipping sync record {sync_record.id} of type {sync_record.sync_type}."
508-
)
509-
continue
510589

511590
if entity is None:
512591
task_logger.error(
513-
f"Could not find entity for sync record {sync_record.id} "
514-
f"(type={sync_record.sync_type}, id={sync_record.entity_id})."
592+
f"Sync record of type {sync_record.sync_type} doesn't have an entity "
593+
f"associated with it (id={sync_record.entity_id}). Skipping start latency metric."
515594
)
516-
continue
517595

518596
# Calculate start latency in seconds:
519597
# (actual sync start) - (last modified time)
520-
if entity.time_last_modified_by_user and sync_record.sync_start_time:
598+
if (
599+
entity is not None
600+
and entity.time_last_modified_by_user
601+
and sync_record.sync_start_time
602+
):
521603
start_latency = (
522604
sync_record.sync_start_time - entity.time_last_modified_by_user
523605
).total_seconds()
@@ -541,8 +623,6 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
541623
)
542624
)
543625

544-
_mark_metric_as_emitted(redis_std, start_latency_key)
545-
546626
return metrics
547627

548628

@@ -607,9 +687,12 @@ def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
607687
for metric_fn in metric_functions:
608688
metrics = metric_fn()
609689
for metric in metrics:
610-
metric.log()
611-
metric.emit(tenant_id)
612-
if metric.key:
690+
# double check to make sure we aren't double-emitting metrics
691+
if metric.key is not None and not _has_metric_been_emitted(
692+
redis_std, metric.key
693+
):
694+
metric.log()
695+
metric.emit(tenant_id)
613696
_mark_metric_as_emitted(redis_std, metric.key)
614697

615698
task_logger.info("Successfully collected background metrics")

0 commit comments

Comments (0)