23
23
QueryResponseContentDB ,
24
24
ResponseFeedbackDB ,
25
25
)
26
- from app .urgency_detection .models import UrgencyQueryDB
26
+ from app .urgency_detection .models import UrgencyQueryDB , UrgencyResponseDB
27
27
from app .users .models import UserDB
28
28
from app .utils import get_key_hash
29
29
from litellm import completion
50
50
(ContentFeedbackDB , "feedback_datetime_utc" ),
51
51
(QueryResponseContentDB , "created_datetime_utc" ),
52
52
(UrgencyQueryDB , "message_datetime_utc" ),
53
+ (UrgencyResponseDB , "response_datetime_utc" ),
53
54
]
54
55
55
56
parser = argparse .ArgumentParser (
64
65
--api-key <API_KEY> \
65
66
--nb-workers 8 \
66
67
--start-date 01-08-23
68
+ --end-date 04-09-24
67
69
68
70
""" ,
69
71
)
82
84
help = "Start date for the records in the format dd-mm-yy" ,
83
85
required = False ,
84
86
)
87
+ parser .add_argument (
88
+ "--end-date" ,
89
+ help = "End date for the records in the format dd-mm-yy" ,
90
+ required = False ,
91
+ )
92
+ parser .add_argument (
93
+ "--subset" ,
94
+ help = "Subset of the data to use for testing" ,
95
+ required = False ,
96
+ )
85
97
args = parser .parse_args ()
86
98
87
99
@@ -281,24 +293,60 @@ def process_urgency_detection(_id: int, text: str) -> tuple | None:
281
293
return None
282
294
283
295
284
- def create_random_datetime_from_string (start_date : datetime ) -> datetime :
296
def create_random_datetime(start_date: datetime, end_date: datetime) -> datetime:
    """
    Create a random datetime within the inclusive range [start_date, end_date].

    The offset from `start_date` is drawn uniformly, at one-second
    resolution, over the whole span between the two datetimes, so the
    result can never fall after `end_date`.

    Args:
        start_date: Lower bound of the range.
        end_date: Upper bound of the range.

    Returns:
        A datetime `d` such that `start_date <= d <= end_date`.

    Raises:
        ValueError: If `end_date` is earlier than `start_date`.
    """
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")

    # Sample a single offset over the full span. Picking a day offset and a
    # second-of-day offset independently could overshoot end_date by up to
    # one day whenever the last day of the range was selected.
    span_in_seconds = int((end_date - start_date).total_seconds())
    random_offset = timedelta(seconds=random.randint(0, span_in_seconds))
    return start_date + random_offset
299
309
300
310
301
- def update_date_of_records (models : list , random_dates : list , api_key : str ) -> None :
311
def is_within_time_range(date: datetime) -> bool:
    """
    Tell whether the hour of `date` falls in one of the prioritized windows.

    The windows are 9am-12pm (hours 9, 10, 11) and 8pm-10pm (hours 20, 21).
    Only the hour component of `date` is inspected.
    """
    hour = date.hour
    in_morning_window = 9 <= hour < 12
    in_evening_window = 20 <= hour < 22
    return in_morning_window or in_evening_window
319
+
320
+
321
def generate_distributed_dates(n: int, start: datetime, end: datetime) -> list:
    """
    Generate `n` random datetimes with a specific distribution for the records.

    Candidates are drawn with `create_random_datetime(start, end)` and kept
    or rejected until `n` have been accepted:

    - Weekend candidates (Saturday/Sunday) are kept when they fall inside the
      hours accepted by `is_within_time_range`, otherwise with a 40% chance.
    - Weekday candidates must first pass a 60% gate; survivors are kept when
      they fall inside the accepted hours, otherwise with a 55% chance.

    Each weekend candidate is therefore more likely to be kept than a
    weekday candidate drawn at the same time of day.

    Args:
        n: Number of datetimes to generate; a value <= 0 yields an empty list.
        start: Lower bound passed to `create_random_datetime`.
        end: Upper bound passed to `create_random_datetime`.

    Returns:
        A list of `n` accepted datetimes, in the order they were accepted.
    """
    dates: list[datetime] = []
    while len(dates) < n:
        date = create_random_datetime(start, end)
        is_weekend = date.weekday() >= 5  # Monday is 0, so 5 and 6 are Sat/Sun

        # Weekdays face an extra 60% gate, which is what favors weekends.
        if not is_weekend and random.random() >= 0.6:
            continue

        # Inside the preferred hours a candidate is always kept; outside them
        # it is kept with a 40% (weekend) or 55% (weekday) chance.
        off_peak_keep_probability = 0.4 if is_weekend else 0.55
        if is_within_time_range(date) or random.random() < off_peak_keep_probability:
            dates.append(date)

    return dates
342
+
343
+
344
+ def update_date_of_records (
345
+ models : list ,
346
+ api_key : str ,
347
+ start_date : datetime ,
348
+ end_date : datetime ,
349
+ ) -> None :
302
350
"""
303
351
Update the date of the records in the database
304
352
"""
@@ -308,11 +356,7 @@ def update_date_of_records(models: list, random_dates: list, api_key: str) -> No
308
356
select (UserDB ).where (UserDB .hashed_api_key == hashed_token )
309
357
).scalar_one ()
310
358
queries = [c for c in session .query (QueryDB ).all () if c .user_id == user .user_id ]
311
- if len (queries ) > len (random_dates ):
312
- random_dates = random_dates + [
313
- create_random_datetime_from_string (start_date )
314
- for _ in range (len (queries ) - len (random_dates ))
315
- ]
359
+ random_dates = generate_distributed_dates (len (queries ), start_date , end_date )
316
360
# Create a dictionary to map the query_id to the random date
317
361
date_map_dic = {queries [i ].query_id : random_dates [i ] for i in range (len (queries ))}
318
362
for model in models :
@@ -323,8 +367,8 @@ def update_date_of_records(models: list, random_dates: list, api_key: str) -> No
323
367
324
368
for i , row in enumerate (rows ):
325
369
# Set the date attribute to the random date
326
- if hasattr (row , "query_id" ):
327
- date = date_map_dic [ row .query_id ]
370
+ if hasattr (row , "query_id" ) and model [ 0 ] != UrgencyQueryDB :
371
+ date = date_map_dic . get ( row .query_id , None )
328
372
else :
329
373
date = random_dates [i ]
330
374
@@ -351,17 +395,26 @@ def update_date_of_contents(date: datetime) -> None:
351
395
NB_WORKERS = int (args .nb_workers ) if args .nb_workers else 8
352
396
API_KEY = args .api_key if args .api_key else ADMIN_API_KEY
353
397
354
- date_string = args .start_date if args .start_date else "01-08-23"
398
+ start_date_string = args .start_date if args .start_date else "01-08-23"
399
+ end_date_string = args .end_date if args .end_date else None
355
400
date_format = "%d-%m-%y"
356
- start_date = datetime .strptime (date_string , date_format )
401
+ start_date = datetime .strptime (start_date_string , date_format )
402
+ end_date = (
403
+ datetime .strptime (end_date_string , date_format )
404
+ if end_date_string
405
+ else datetime .now ()
406
+ )
407
+ assert end_date , "Invalid end date. Please provide a valid date. Format is dd-mm-yy"
357
408
assert (
358
- start_date and start_date < datetime . now ()
359
- ), "Invalid start date. Please provide a valid start date."
409
+ start_date and start_date < end_date
410
+ ), "Invalid start date. Please provide a valid start date. Format is dd-mm-yy "
360
411
412
+ subset = int (args .subset ) if args .subset else None
361
413
path = args .csv
362
- df = pd .read_csv (path )
414
+ df = pd .read_csv (path , nrows = subset )
363
415
saved_queries = defaultdict (list )
364
416
print ("Processing search queries..." )
417
+
365
418
# Using multithreading to speed up the process
366
419
with ThreadPoolExecutor (max_workers = NB_WORKERS ) as executor :
367
420
future_to_text = {
@@ -444,11 +497,8 @@ def update_date_of_contents(date: datetime) -> None:
444
497
result = future .result ()
445
498
print ("Urgency Detection successfully processed" )
446
499
447
- random_dates = [
448
- create_random_datetime_from_string (start_date ) for _ in range (len (df ))
449
- ]
450
500
print ("Updating the date of the records..." )
451
- update_date_of_records (MODELS , random_dates , API_KEY )
501
+ update_date_of_records (MODELS , API_KEY , start_date , end_date )
452
502
453
503
print ("Updating the date of the content records..." )
454
504
update_date_of_contents (start_date )
0 commit comments