58
58
"sync_start_latency:{sync_type}:{entity_id}:{sync_record_id}"
59
59
)
60
60
61
+ _CONNECTOR_START_TIME_KEY_FMT = "connector_start_time:{cc_pair_id}:{index_attempt_id}"
62
+ _CONNECTOR_END_TIME_KEY_FMT = "connector_end_time:{cc_pair_id}:{index_attempt_id}"
63
+ _SYNC_START_TIME_KEY_FMT = "sync_start_time:{sync_type}:{entity_id}:{sync_record_id}"
64
+ _SYNC_END_TIME_KEY_FMT = "sync_end_time:{sync_type}:{entity_id}:{sync_record_id}"
65
+
61
66
62
67
def _mark_metric_as_emitted (redis_std : Redis , key : str ) -> None :
63
68
"""Mark a metric as having been emitted by setting a Redis key with expiration"""
@@ -303,8 +308,6 @@ def _build_connector_final_metrics(
303
308
)
304
309
)
305
310
306
- _mark_metric_as_emitted (redis_std , metric_key )
307
-
308
311
return metrics
309
312
310
313
@@ -344,6 +347,52 @@ def _collect_connector_metrics(db_session: Session, redis_std: Redis) -> list[Me
344
347
if one_hour_ago > most_recent_attempt .time_created :
345
348
continue
346
349
350
+ # Build a job_id for correlation
351
+ job_id = build_job_id (
352
+ "connector" , str (cc_pair .id ), str (most_recent_attempt .id )
353
+ )
354
+
355
+ # Add raw start time metric if available
356
+ if most_recent_attempt .time_started :
357
+ start_time_key = _CONNECTOR_START_TIME_KEY_FMT .format (
358
+ cc_pair_id = cc_pair .id ,
359
+ index_attempt_id = most_recent_attempt .id ,
360
+ )
361
+ metrics .append (
362
+ Metric (
363
+ key = start_time_key ,
364
+ name = "connector_start_time" ,
365
+ value = most_recent_attempt .time_started .timestamp (),
366
+ tags = {
367
+ "job_id" : job_id ,
368
+ "connector_id" : str (cc_pair .connector .id ),
369
+ "source" : str (cc_pair .connector .source ),
370
+ },
371
+ )
372
+ )
373
+
374
+ # Add raw end time metric if available and in terminal state
375
+ if (
376
+ most_recent_attempt .status .is_terminal ()
377
+ and most_recent_attempt .time_updated
378
+ ):
379
+ end_time_key = _CONNECTOR_END_TIME_KEY_FMT .format (
380
+ cc_pair_id = cc_pair .id ,
381
+ index_attempt_id = most_recent_attempt .id ,
382
+ )
383
+ metrics .append (
384
+ Metric (
385
+ key = end_time_key ,
386
+ name = "connector_end_time" ,
387
+ value = most_recent_attempt .time_updated .timestamp (),
388
+ tags = {
389
+ "job_id" : job_id ,
390
+ "connector_id" : str (cc_pair .connector .id ),
391
+ "source" : str (cc_pair .connector .source ),
392
+ },
393
+ )
394
+ )
395
+
347
396
# Connector start latency
348
397
start_latency_metric = _build_connector_start_latency_metric (
349
398
cc_pair , most_recent_attempt , second_most_recent_attempt , redis_std
@@ -365,9 +414,10 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
365
414
"""
366
415
Collect metrics for document set and group syncing:
367
416
- Success/failure status
368
- - Start latency (always )
417
+ - Start latency (for doc sets / user groups )
369
418
- Duration & doc count (only if success)
370
419
- Throughput (docs/min) (only if success)
420
+ - Raw start/end times for each sync
371
421
"""
372
422
one_hour_ago = get_db_current_time (db_session ) - timedelta (hours = 1 )
373
423
@@ -389,6 +439,43 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
389
439
# Build a job_id for correlation
390
440
job_id = build_job_id ("sync_record" , str (sync_record .id ))
391
441
442
+ # Add raw start time metric
443
+ start_time_key = _SYNC_START_TIME_KEY_FMT .format (
444
+ sync_type = sync_record .sync_type ,
445
+ entity_id = sync_record .entity_id ,
446
+ sync_record_id = sync_record .id ,
447
+ )
448
+ metrics .append (
449
+ Metric (
450
+ key = start_time_key ,
451
+ name = "sync_start_time" ,
452
+ value = sync_record .sync_start_time .timestamp (),
453
+ tags = {
454
+ "job_id" : job_id ,
455
+ "sync_type" : str (sync_record .sync_type ),
456
+ },
457
+ )
458
+ )
459
+
460
+ # Add raw end time metric if available
461
+ if sync_record .sync_end_time :
462
+ end_time_key = _SYNC_END_TIME_KEY_FMT .format (
463
+ sync_type = sync_record .sync_type ,
464
+ entity_id = sync_record .entity_id ,
465
+ sync_record_id = sync_record .id ,
466
+ )
467
+ metrics .append (
468
+ Metric (
469
+ key = end_time_key ,
470
+ name = "sync_end_time" ,
471
+ value = sync_record .sync_end_time .timestamp (),
472
+ tags = {
473
+ "job_id" : job_id ,
474
+ "sync_type" : str (sync_record .sync_type ),
475
+ },
476
+ )
477
+ )
478
+
392
479
# Emit a SUCCESS/FAIL boolean metric
393
480
# Use a single Redis key to avoid re-emitting final metrics
394
481
final_metric_key = _FINAL_METRIC_KEY_FMT .format (
@@ -439,7 +526,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
439
526
if duration_seconds is not None :
440
527
metrics .append (
441
528
Metric (
442
- key = None ,
529
+ key = final_metric_key ,
443
530
name = "sync_duration_seconds" ,
444
531
value = duration_seconds ,
445
532
tags = {
@@ -455,7 +542,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
455
542
456
543
metrics .append (
457
544
Metric (
458
- key = None ,
545
+ key = final_metric_key ,
459
546
name = "sync_doc_count" ,
460
547
value = doc_count ,
461
548
tags = {
@@ -468,7 +555,7 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
468
555
if sync_speed is not None :
469
556
metrics .append (
470
557
Metric (
471
- key = None ,
558
+ key = final_metric_key ,
472
559
name = "sync_speed_docs_per_min" ,
473
560
value = sync_speed ,
474
561
tags = {
@@ -482,9 +569,6 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
482
569
f"Invalid sync record { sync_record .id } with no duration"
483
570
)
484
571
485
- # Mark final metrics as emitted so we don't re-emit
486
- _mark_metric_as_emitted (redis_std , final_metric_key )
487
-
488
572
# Emit start latency
489
573
start_latency_key = _SYNC_START_LATENCY_KEY_FMT .format (
490
574
sync_type = sync_record .sync_type ,
@@ -502,22 +586,20 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
502
586
entity = db_session .scalar (
503
587
select (UserGroup ).where (UserGroup .id == sync_record .entity_id )
504
588
)
505
- else :
506
- task_logger .info (
507
- f"Skipping sync record { sync_record .id } of type { sync_record .sync_type } ."
508
- )
509
- continue
510
589
511
590
if entity is None :
512
591
task_logger .error (
513
- f"Could not find entity for sync record { sync_record .id } "
514
- f"(type= { sync_record . sync_type } , id={ sync_record .entity_id } )."
592
+ f"Sync record of type { sync_record .sync_type } doesn't have an entity "
593
+ f"associated with it ( id={ sync_record .entity_id } ). Skipping start latency metric ."
515
594
)
516
- continue
517
595
518
596
# Calculate start latency in seconds:
519
597
# (actual sync start) - (last modified time)
520
- if entity .time_last_modified_by_user and sync_record .sync_start_time :
598
+ if (
599
+ entity is not None
600
+ and entity .time_last_modified_by_user
601
+ and sync_record .sync_start_time
602
+ ):
521
603
start_latency = (
522
604
sync_record .sync_start_time - entity .time_last_modified_by_user
523
605
).total_seconds ()
@@ -541,8 +623,6 @@ def _collect_sync_metrics(db_session: Session, redis_std: Redis) -> list[Metric]
541
623
)
542
624
)
543
625
544
- _mark_metric_as_emitted (redis_std , start_latency_key )
545
-
546
626
return metrics
547
627
548
628
@@ -607,9 +687,12 @@ def monitor_background_processes(self: Task, *, tenant_id: str | None) -> None:
607
687
for metric_fn in metric_functions :
608
688
metrics = metric_fn ()
609
689
for metric in metrics :
610
- metric .log ()
611
- metric .emit (tenant_id )
612
- if metric .key :
690
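+ # Emit each keyed metric at most once: skip any metric whose key is already recorded as emitted.
+ # NOTE(review): metrics with key=None are no longer logged or emitted by this loop, and metrics that share one key (e.g. final_metric_key) will presumably be skipped after the first one is marked; confirm this is intended.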
691
+ if metric .key is not None and not _has_metric_been_emitted (
692
+ redis_std , metric .key
693
+ ):
694
+ metric .log ()
695
+ metric .emit (tenant_id )
613
696
_mark_metric_as_emitted (redis_std , metric .key )
614
697
615
698
task_logger .info ("Successfully collected background metrics" )
0 commit comments