Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions backend/onyx/kg/extractions/extraction_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@
from onyx.kg.models import KGEntityTypeInstructions
from onyx.kg.models import KGExtractionInstructions
from onyx.kg.utils.extraction_utils import EntityTypeMetadataTracker
from onyx.kg.utils.extraction_utils import is_email
from onyx.kg.utils.extraction_utils import (
kg_document_entities_relationships_attribute_generation,
)
Expand All @@ -54,6 +53,7 @@
from onyx.kg.utils.extraction_utils import prepare_llm_document_content
from onyx.kg.utils.extraction_utils import trackinfo_to_str
from onyx.kg.utils.formatting_utils import aggregate_kg_extractions
from onyx.kg.utils.formatting_utils import extract_email
from onyx.kg.utils.formatting_utils import extract_relationship_type_id
from onyx.kg.utils.formatting_utils import generalize_entities
from onyx.kg.utils.formatting_utils import get_entity_type
Expand Down Expand Up @@ -1045,14 +1045,14 @@ def process_single_chunk(
if chunk.metadata:
for attribute, value in chunk.metadata.items():
if isinstance(value, str):
if is_email(value):
if email := extract_email(value):
(
implied_attribute_entities,
implied_attribute_relationships,
attribute_company_participant_emails,
attribute_account_participant_emails,
) = kg_process_person(
person=value,
person=email,
core_document_id_name=kg_document_extractions.kg_core_document_id_name,
implied_entities=implied_attribute_entities,
implied_relationships=implied_attribute_relationships,
Expand All @@ -1069,14 +1069,14 @@ def process_single_chunk(
elif isinstance(value, list):
email_attribute = False
for item in value:
if is_email(item):
if email := extract_email(item):
(
implied_attribute_entities,
implied_attribute_relationships,
attribute_company_participant_emails,
attribute_account_participant_emails,
) = kg_process_person(
person=item,
person=email,
core_document_id_name=kg_document_extractions.kg_core_document_id_name,
implied_entities=implied_attribute_entities,
implied_relationships=implied_attribute_relationships,
Expand Down
11 changes: 2 additions & 9 deletions backend/onyx/kg/utils/extraction_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re
from collections import defaultdict

from onyx.configs.constants import OnyxCallTypes
Expand All @@ -18,6 +17,7 @@
from onyx.kg.models import KGDocumentEntitiesRelationshipsAttributes
from onyx.kg.models import KGEnhancedDocumentMetadata
from onyx.kg.models import KGEntityTypeClassificationInfo
from onyx.kg.utils.formatting_utils import extract_email
from onyx.kg.utils.formatting_utils import generalize_entities
from onyx.kg.utils.formatting_utils import kg_email_processing
from onyx.kg.utils.formatting_utils import make_entity_id
Expand All @@ -41,7 +41,7 @@ def _update_implied_entities_relationships(

for owner in owner_list or []:

if not is_email(owner):
if extract_email(owner) is None:
converted_relationships_to_attributes[relationship_type].append(owner)
continue

Expand Down Expand Up @@ -433,13 +433,6 @@ def prepare_llm_document_content(
)


def is_email(email: str) -> bool:
    """
    Check whether the whole string has the loose shape of an email address.

    The string must consist of a non-empty local part, exactly one "@", and a
    domain part that contains at least one "." with characters on both sides.
    This is a shape check only; it does not validate the address further.

    Args:
        email: the string to test.

    Returns:
        True if the entire string matches the pattern, False otherwise.
    """
    # fullmatch (rather than match) so that a valid-looking prefix followed by
    # more "@..." text, e.g. "a@b.com@evil", is rejected instead of accepted.
    return re.fullmatch(r"[^@]+@[^@]+\.[^@]+", email) is not None


def trackinfo_to_str(trackinfo: KGAttributeTrackInfo | None) -> str:
"""Convert trackinfo to an LLM friendly string"""
if trackinfo is None:
Expand Down
13 changes: 12 additions & 1 deletion backend/onyx/kg/utils/formatting_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,15 @@ def aggregate_kg_extractions(
return aggregated_kg_extractions


def extract_email(email: str) -> str | None:
"""
Extract an email from an arbitrary string (if any).
Only the first email is returned.
"""
match = re.search(r"([A-Za-z0-9._+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+)", email)
return match.group(0) if match else None


def kg_email_processing(email: str, kg_config_settings: KGConfigSettings) -> KGPerson:
"""
Process the email.
Expand All @@ -173,7 +182,9 @@ def kg_email_processing(email: str, kg_config_settings: KGConfigSettings) -> KGP
if employee:
company = kg_config_settings.KG_VENDOR
else:
company = company_domain.capitalize()
# TODO: maybe store a list of domains for each account and use that to match
# right now, gmail and other random domains are being converted into accounts
company = company_domain.title()

return KGPerson(name=name, company=company, employee=employee)

Expand Down