Skip to content

Commit 5c6ad01

Browse files
authored
Fix timeout issue on data inserting script (#411)
* Fix timeout * Update script * Add more checks
1 parent c2ca998 commit 5c6ad01

File tree

1 file changed

+74
-31
lines changed

1 file changed

+74
-31
lines changed

core_backend/add_new_data_to_db.py

Lines changed: 74 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import argparse
22
import json
33
import random
4+
import time
45
from collections import defaultdict
56
from concurrent.futures import ThreadPoolExecutor, as_completed
67
from datetime import datetime, timedelta
@@ -11,10 +12,17 @@
1112
from app.config import (
1213
LITELLM_API_KEY,
1314
LITELLM_ENDPOINT,
14-
LITELLM_MODEL_DEFAULT,
15+
LITELLM_MODEL_GENERATION,
1516
)
17+
from app.contents.models import ContentDB
1618
from app.database import get_session
17-
from app.question_answer.models import ContentFeedbackDB, QueryDB, ResponseFeedbackDB
19+
from app.llm_call.utils import remove_json_markdown
20+
from app.question_answer.models import (
21+
ContentFeedbackDB,
22+
QueryDB,
23+
QueryResponseContentDB,
24+
ResponseFeedbackDB,
25+
)
1826
from app.urgency_detection.models import UrgencyQueryDB
1927
from app.users.models import UserDB
2028
from app.utils import get_key_hash
@@ -29,6 +37,7 @@
2937

3038
try:
3139
import requests # type: ignore
40+
3241
except ImportError:
3342
print(
3443
"Please install requests library using `pip install requests` "
@@ -39,6 +48,7 @@
3948
(QueryDB, "query_datetime_utc"),
4049
(ResponseFeedbackDB, "feedback_datetime_utc"),
4150
(ContentFeedbackDB, "feedback_datetime_utc"),
51+
(QueryResponseContentDB, "created_datetime_utc"),
4252
(UrgencyQueryDB, "message_datetime_utc"),
4353
]
4454

@@ -106,47 +116,54 @@ def generate_feedback(question_text: str, faq_text: str, sentiment: str) -> dict
106116
"""
107117

108118
response = completion(
109-
model=LITELLM_MODEL_DEFAULT,
119+
model=LITELLM_MODEL_GENERATION,
110120
api_base=LITELLM_ENDPOINT,
111121
api_key=LITELLM_API_KEY,
112122
messages=[{"role": "user", "content": prompt}],
113123
max_tokens=100,
114124
temperature=0.7,
115125
)
116126

117-
# Extract the output from the response
118-
feedback_output = response["choices"][0]["message"]["content"].strip()
119-
feedback_output = feedback_output.replace("json", "")
120-
feedback_output = feedback_output.replace("\n", "").strip()
121-
122127
try:
128+
# Extract the output from the response
129+
feedback_output = response["choices"][0]["message"]["content"].strip()
130+
feedback_output = remove_json_markdown(feedback_output)
123131
feedback_dict = json.loads(feedback_output)
124132
if isinstance(feedback_dict, dict) and "output" in feedback_dict:
125-
126133
return feedback_dict
127134
else:
128135
raise ValueError("Output is not in the correct format.")
129-
except (SyntaxError, ValueError) as e:
136+
except Exception as e:
130137
print(f"Output is not in the correct format.{e}")
131138
return None
132139

133140

134-
def save_single_row(
    endpoint: str, data: dict, retries: int = 2, timeout: float = 120.0
) -> dict | None:
    """
    Save a single row in the database.

    POST `data` as JSON to `endpoint`, authenticating with the module-level
    `API_KEY` as a bearer token, and return the decoded JSON response body.

    Parameters
    ----------
    endpoint
        Full URL of the API endpoint to call.
    data
        JSON-serialisable payload sent as the request body.
    retries
        Number of additional attempts made after the first failure. Values
        below zero are treated as zero (a single attempt).
    timeout
        Seconds to wait for the server before the attempt counts as failed.
        Without a timeout `requests` waits indefinitely, which would block a
        worker thread forever and keep the retry logic from ever running.

    Returns
    -------
    dict | None
        The decoded JSON response, or `None` if every attempt failed (the
        last error is printed).
    """
    total_attempts = max(retries, 0) + 1

    for attempt in range(total_attempts):
        try:
            response = requests.post(
                endpoint,
                headers={
                    "accept": "application/json",
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {API_KEY}",
                },
                json=data,
                # NOTE(review): TLS verification is disabled, as in the
                # original script. Confirm this is only used against
                # local/dev hosts.
                verify=False,
                timeout=timeout,
            )
            # Treat HTTP 4xx/5xx as failures so that they are retried too.
            response.raise_for_status()
            return response.json()

        # Deliberately broad: this is a best-effort bulk loader, so one bad
        # row must not abort the whole run.
        except Exception as e:
            if attempt == total_attempts - 1:
                print(f"Request failed after retries: {e}")
                return None
            # Exponential wait before retrying: 1s, 2s, 4s, ...
            # Derived from the attempt index so the delays stay correct for
            # any `retries` value, not just the default of 2.
            time.sleep(2**attempt)

    return None
150167

151168

152169
def process_search(_id: int, text: str) -> tuple | None:
@@ -161,7 +178,7 @@ def process_search(_id: int, text: str) -> tuple | None:
161178
"generate_tts": False,
162179
}
163180
response = save_single_row(endpoint, data)
164-
if "search_results" in response:
181+
if response and isinstance(response, dict) and "search_results" in response:
165182
return (
166183
_id,
167184
response["query_id"],
@@ -215,7 +232,13 @@ def process_content_feedback(
215232
if is_off_topic and feedback_sentiment == "positive":
216233
return None
217234
# randomly get a content from the search results to provide feedback on
218-
content = search_results[str(random.randint(0, 3))]
235+
content_num = str(random.randint(0, 3))
236+
if not search_results or not isinstance(search_results, dict):
237+
return None
238+
if content_num not in search_results:
239+
return None
240+
241+
content = search_results[content_num]
219242

220243
# Get content text and use to generate feedback text using LLMs
221244
content_text = content["title"] + " " + content["text"]
@@ -253,19 +276,16 @@ def process_urgency_detection(_id: int, text: str) -> tuple | None:
253276
}
254277

255278
response = save_single_row(endpoint, data)
256-
if "is_urgent" in response:
279+
if response and "is_urgent" in response:
257280
return (response["is_urgent"],)
258281
return None
259282

260283

261-
def create_random_datetime_from_string(date_string: str) -> datetime:
284+
def create_random_datetime_from_string(start_date: datetime) -> datetime:
262285
"""
263286
Create a random datetime from a date in the format "%d-%m-%y
264287
to today
265288
"""
266-
date_format = "%d-%m-%y"
267-
268-
start_date = datetime.strptime(date_string, date_format)
269289

270290
time_difference = datetime.now() - start_date
271291
random_number_of_days = random.randint(0, time_difference.days)
@@ -296,6 +316,7 @@ def update_date_of_records(models: list, random_dates: list, api_key: str) -> No
296316
# Create a dictionary to map the query_id to the random date
297317
date_map_dic = {queries[i].query_id: random_dates[i] for i in range(len(queries))}
298318
for model in models:
319+
print(f"Updating the date of the records for {model[0].__name__}...")
299320
session = next(get_session())
300321

301322
rows = [c for c in session.query(model[0]).all() if c.user_id == user.user_id]
@@ -312,12 +333,31 @@ def update_date_of_records(models: list, random_dates: list, api_key: str) -> No
312333
session.commit()
313334

314335

336+
def update_date_of_contents(date: datetime) -> None:
    """
    Update the date of the content records in the database for consistency

    Every `ContentDB` row gets both `created_datetime_utc` and
    `updated_datetime_utc` set to `date`, and the change is committed in a
    single transaction.

    Parameters
    ----------
    date
        The datetime written to both timestamp columns of every content row.
    """
    session = next(get_session())
    try:
        for content in session.query(ContentDB).all():
            # Rows loaded through this session are already tracked by it, so
            # assigning the attributes is enough; no `session.merge()` needed.
            content.created_datetime_utc = date
            content.updated_datetime_utc = date
        session.commit()
    finally:
        # Always release the connection, also when the update fails.
        # Closing discards any uncommitted changes.
        session.close()
347+
348+
315349
if __name__ == "__main__":
316350
HOST = args.host
317351
NB_WORKERS = int(args.nb_workers) if args.nb_workers else 8
318352
API_KEY = args.api_key if args.api_key else ADMIN_API_KEY
319353

320-
start_date = args.start_date if args.start_date else "01-08-23"
354+
date_string = args.start_date if args.start_date else "01-08-23"
355+
date_format = "%d-%m-%y"
356+
start_date = datetime.strptime(date_string, date_format)
357+
assert (
358+
start_date and start_date < datetime.now()
359+
), "Invalid start date. Please provide a valid start date."
360+
321361
path = args.csv
322362
df = pd.read_csv(path)
323363
saved_queries = defaultdict(list)
@@ -409,5 +449,8 @@ def update_date_of_records(models: list, random_dates: list, api_key: str) -> No
409449
]
410450
print("Updating the date of the records...")
411451
update_date_of_records(MODELS, random_dates, API_KEY)
452+
453+
print("Updating the date of the content records...")
454+
update_date_of_contents(start_date)
412455
print("All records dates updated successfully.")
413456
print("All records added successfully.")

0 commit comments

Comments
 (0)