Skip to content

Commit d973363

Browse files
Wenxi Onyx
authored and committed
embedding changes for hackathon
1 parent 013bed3 commit d973363

File tree

6 files changed

+312
-8
lines changed

6 files changed

+312
-8
lines changed

backend/onyx/chat/answer.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,13 @@ def processed_streamed_output(self) -> AnswerStream:
366366

367367
if _HACKATHON_TEST_EXECUTION:
368368

369+
# Enable search-only mode for hackathon test execution
370+
self.graph_config.behavior.skip_gen_ai_answer_generation = True
371+
# Disable reranking for faster processing
372+
self.graph_config.behavior.allow_agent_reranking = False
373+
self.graph_config.behavior.allow_refinement = False
374+
375+
# Disable query rewording for more predictable results
369376
input_data = str(self.graph_config.inputs.prompt_builder.raw_user_query)
370377

371378
if input_data.startswith("["):

backend/onyx/chat/process_message.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,6 +1012,7 @@ def create_response(
10121012
tools=tools,
10131013
db_session=db_session,
10141014
use_agentic_search=new_msg_req.use_agentic_search,
1015+
skip_gen_ai_answer_generation=new_msg_req.skip_gen_ai_answer_generation,
10151016
)
10161017

10171018
info_by_subq: dict[SubQuestionKey, AnswerPostInfo] = defaultdict(

backend/onyx/connectors/fireflies/connector.py

Lines changed: 138 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
import os
12
from collections.abc import Iterator
23
from datetime import datetime
34
from datetime import timezone
45
from typing import cast
56
from typing import List
67

8+
import openai
79
import requests
10+
from pydantic import BaseModel
811

912
from onyx.configs.app_configs import INDEX_BATCH_SIZE
1013
from onyx.configs.constants import DocumentSource
@@ -48,6 +51,133 @@
4851
ONE_MINUTE = 60
4952

5053

54+
class DocumentClassificationResult(BaseModel):
    """Structured output schema for LLM-based document classification."""

    # Up to 5 short (<= 2 word) category labels describing the document.
    categories: list[str]
    # Up to 5 proper-noun entities mentioned in the document.
    entities: list[str]
57+
58+
59+
def _extract_categories_and_entities(
    sections: list[TextSection | ImageSection],
) -> dict[str, list[str]]:
    """Classify a document and extract named entities via the OpenAI API.

    Joins the text of every ``TextSection``, asks the model for up to five
    categories and five entities (structured output), and retries transient
    API failures with exponential backoff. On any unrecoverable condition
    (missing API key, empty document, exhausted retries) an empty result is
    returned so indexing can proceed without this metadata.

    Returns:
        ``{"categories": [...], "entities": [...]}`` — both lists possibly empty.
    """
    import time
    import random

    prompt = """
Analyze this document, classify it with categories, and extract important entities.

CATEGORIES:
Create up to 5 simple categories that best capture what this document is about. Consider categories within:
- Document type (e.g., Manual, Report, Email, Transcript, etc.)
- Content domain (e.g., Technical, Financial, HR, Marketing, etc.)
- Purpose (e.g., Training, Reference, Announcement, Analysis, etc.)
- Industry/Topic area (e.g., Software Development, Sales, Legal, etc.)

Be creative and specific. Use clear, descriptive terms that someone searching for this document might use.
Categories should be up to 2 words each.

ENTITIES:
Extract up to 5 important proper nouns, such as:
- Company names (e.g., Microsoft, Google, Acme Corp)
- Product names (e.g., Office 365, Salesforce, iPhone)
- People's names (e.g. John, Jane, Ahmed, Wenjie, etc.)
- Department names (e.g., Engineering, Marketing, HR)
- Project names (e.g., Project Alpha, Migration 2024)
- Technology names (e.g., PostgreSQL, React, AWS)
- Location names (e.g., New York Office, Building A)
"""

    # Fail fast on conditions retrying cannot fix -- do these checks ONCE,
    # outside the retry loop (the original re-ran them on every attempt).
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        logger.warning("OPENAI_API_KEY not set, skipping metadata extraction")
        return {"categories": [], "entities": []}

    # Combine all section text; ImageSections carry no text and are skipped.
    document_text = "\n\n".join(
        section.text
        for section in sections
        if isinstance(section, TextSection) and section.text.strip()
    )
    if not document_text.strip():
        logger.debug("No text content found, skipping metadata extraction")
        return {"categories": [], "entities": []}

    # Truncate very long documents to avoid token limits.
    max_chars = 50000  # Roughly 12k tokens
    if len(document_text) > max_chars:
        document_text = document_text[:max_chars] + "..."
        logger.debug(f"Truncated document text to {max_chars} characters")

    client = openai.OpenAI(api_key=api_key)

    # Retry configuration
    max_retries = 3
    base_delay = 1.0  # seconds
    backoff_factor = 2.0

    for attempt in range(max_retries + 1):
        try:
            response = client.responses.parse(
                model="o3",
                input=[
                    {
                        "role": "system",
                        "content": "Extract categories and entities from the document.",
                    },
                    {
                        "role": "user",
                        "content": prompt + "\n\nDOCUMENT: " + document_text,
                    },
                ],
                text_format=DocumentClassificationResult,
            )

            classification_result = response.output_parsed
            # output_parsed is None when the model refuses or parsing fails;
            # raise so the retry/backoff path handles it instead of an
            # AttributeError being swallowed below.
            if classification_result is None:
                raise ValueError("Model returned no parsed classification output")

            result = {
                "categories": classification_result.categories,
                "entities": classification_result.entities,
            }
            logger.debug(f"Successfully extracted metadata: {result}")
            return result

        except Exception as e:
            if attempt == max_retries:
                logger.error(
                    f"Failed to extract categories and entities after {max_retries + 1} attempts: {e}"
                )
                return {"categories": [], "entities": []}

            logger.warning(
                f"Attempt {attempt + 1} failed to extract metadata: {e}. Retrying..."
            )
            # Exponential backoff plus a small absolute jitter (0.1-0.3 s,
            # not a percentage) to de-synchronize concurrent retries.
            delay = base_delay * (backoff_factor**attempt)
            total_delay = delay + random.uniform(0.1, 0.3)
            logger.debug(
                f"Waiting {total_delay:.2f} seconds before retry {attempt + 2}"
            )
            time.sleep(total_delay)

    # Unreachable: the loop always returns, but keep a safe fallback.
    return {"categories": [], "entities": []}
179+
180+
51181
def _create_doc_from_transcript(transcript: dict) -> Document | None:
52182
sections: List[TextSection] = []
53183
current_speaker_name = None
@@ -96,12 +226,19 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
96226
if participant != meeting_organizer_email and participant:
97227
meeting_participants_email_list.append(BasicExpertInfo(email=participant))
98228

229+
# Extract categories and entities from transcript and store in metadata
230+
categories_and_entities = _extract_categories_and_entities(sections)
231+
metadata = {
232+
"categories": categories_and_entities.get("categories", []),
233+
"entities": categories_and_entities.get("entities", []),
234+
}
235+
99236
return Document(
100237
id=fireflies_id,
101238
sections=cast(list[TextSection | ImageSection], sections),
102239
source=DocumentSource.FIREFLIES,
103240
semantic_identifier=meeting_title,
104-
metadata={},
241+
metadata=metadata,
105242
doc_updated_at=meeting_date,
106243
primary_owners=organizer_email_user_info,
107244
secondary_owners=meeting_participants_email_list,

backend/onyx/connectors/google_drive/doc_conversion.py

Lines changed: 141 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import io
2+
import os
23
from collections.abc import Callable
34
from datetime import datetime
45
from typing import Any
56
from typing import cast
67

8+
import openai
79
from googleapiclient.errors import HttpError # type: ignore
810
from googleapiclient.http import MediaIoBaseDownload # type: ignore
911
from pydantic import BaseModel
@@ -45,6 +47,12 @@
4547

4648
logger = setup_logger()
4749

50+
51+
class DocumentClassificationResult(BaseModel):
    """Structured output schema for LLM-based document classification."""

    # Up to 5 short (<= 2 word) category labels describing the document.
    categories: list[str]
    # Up to 5 proper-noun entities mentioned in the document.
    entities: list[str]
54+
55+
4856
# This is not a standard valid unicode char, it is used by the docs advanced API to
4957
# represent smart chips (elements like dates and doc links).
5058
SMART_CHIP_CHAR = "\ue907"
@@ -406,6 +414,128 @@ def convert_drive_item_to_document(
406414
return first_error
407415

408416

417+
def _extract_categories_and_entities(
    sections: list[TextSection | ImageSection],
) -> dict[str, list[str]]:
    """Classify a document and extract named entities via the OpenAI API.

    Joins the text of every ``TextSection``, asks the model for up to five
    categories and five entities (structured output), and retries transient
    API failures with exponential backoff. On any unrecoverable condition
    (missing API key, empty document, exhausted retries) an empty result is
    returned so indexing can proceed without this metadata.

    Returns:
        ``{"categories": [...], "entities": [...]}`` — both lists possibly empty.
    """
    import time
    import random

    prompt = """
Analyze this document, classify it with categories, and extract important entities.

CATEGORIES:
Create up to 5 simple categories that best capture what this document is about. Consider categories within:
- Document type (e.g., Manual, Report, Email, Transcript, etc.)
- Content domain (e.g., Technical, Financial, HR, Marketing, etc.)
- Purpose (e.g., Training, Reference, Announcement, Analysis, etc.)
- Industry/Topic area (e.g., Software Development, Sales, Legal, etc.)

Be creative and specific. Use clear, descriptive terms that someone searching for this document might use.
Categories should be up to 2 words each.

ENTITIES:
Extract up to 5 important proper nouns, such as:
- Company names (e.g., Microsoft, Google, Acme Corp)
- Product names (e.g., Office 365, Salesforce, iPhone)
- People's names (e.g. John, Jane, Ahmed, Wenjie, etc.)
- Department names (e.g., Engineering, Marketing, HR)
- Project names (e.g., Project Alpha, Migration 2024)
- Technology names (e.g., PostgreSQL, React, AWS)
- Location names (e.g., New York Office, Building A)
"""

    # Fail fast on conditions retrying cannot fix -- do these checks ONCE,
    # outside the retry loop (the original re-ran them on every attempt).
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        logger.warning("OPENAI_API_KEY not set, skipping metadata extraction")
        return {"categories": [], "entities": []}

    # Combine all section text; ImageSections carry no text and are skipped.
    document_text = "\n\n".join(
        section.text
        for section in sections
        if isinstance(section, TextSection) and section.text.strip()
    )
    if not document_text.strip():
        logger.debug("No text content found, skipping metadata extraction")
        return {"categories": [], "entities": []}

    # Truncate very long documents to avoid token limits.
    max_chars = 50000  # Roughly 12k tokens
    if len(document_text) > max_chars:
        document_text = document_text[:max_chars] + "..."
        logger.debug(f"Truncated document text to {max_chars} characters")

    client = openai.OpenAI(api_key=api_key)

    # Retry configuration
    max_retries = 3
    base_delay = 1.0  # seconds
    backoff_factor = 2.0

    for attempt in range(max_retries + 1):
        try:
            response = client.responses.parse(
                model="o3",
                input=[
                    {
                        "role": "system",
                        "content": "Extract categories and entities from the document.",
                    },
                    {
                        "role": "user",
                        "content": prompt + "\n\nDOCUMENT: " + document_text,
                    },
                ],
                text_format=DocumentClassificationResult,
            )

            classification_result = response.output_parsed
            # output_parsed is None when the model refuses or parsing fails;
            # raise so the retry/backoff path handles it instead of an
            # AttributeError being swallowed below.
            if classification_result is None:
                raise ValueError("Model returned no parsed classification output")

            result = {
                "categories": classification_result.categories,
                "entities": classification_result.entities,
            }
            logger.debug(f"Successfully extracted metadata: {result}")
            return result

        except Exception as e:
            if attempt == max_retries:
                logger.error(
                    f"Failed to extract categories and entities after {max_retries + 1} attempts: {e}"
                )
                return {"categories": [], "entities": []}

            logger.warning(
                f"Attempt {attempt + 1} failed to extract metadata: {e}. Retrying..."
            )
            # Exponential backoff plus a small absolute jitter (0.1-0.3 s,
            # not a percentage) to de-synchronize concurrent retries.
            delay = base_delay * (backoff_factor**attempt)
            total_delay = delay + random.uniform(0.1, 0.3)
            logger.debug(
                f"Waiting {total_delay:.2f} seconds before retry {attempt + 2}"
            )
            time.sleep(total_delay)

    # Unreachable: the loop always returns, but keep a safe fallback.
    return {"categories": [], "entities": []}
537+
538+
409539
def _convert_drive_item_to_document(
410540
creds: Any,
411541
allow_images: bool,
@@ -499,17 +629,23 @@ def _get_docs_service() -> GoogleDocsService:
499629
else None
500630
)
501631

632+
# Extract categories and entities from drive item and store in metadata
633+
categories_and_entities = _extract_categories_and_entities(sections)
634+
metadata = {
635+
"owner_names": ", ".join(
636+
owner.get("displayName", "") for owner in file.get("owners", [])
637+
),
638+
"categories": categories_and_entities.get("categories", []),
639+
"entities": categories_and_entities.get("entities", []),
640+
}
641+
502642
# Create the document
503643
return Document(
504644
id=doc_id,
505645
sections=sections,
506646
source=DocumentSource.GOOGLE_DRIVE,
507647
semantic_identifier=file.get("name", ""),
508-
metadata={
509-
"owner_names": ", ".join(
510-
owner.get("displayName", "") for owner in file.get("owners", [])
511-
),
512-
},
648+
metadata=metadata,
513649
doc_updated_at=datetime.fromisoformat(
514650
file.get("modifiedTime", "").replace("Z", "+00:00")
515651
),

0 commit comments

Comments
 (0)