# Patch summary (free text ahead of the first file header; not part of any hunk).
#
# Purpose: replace the boolean helper is_email() with extract_email(), which
# returns the first email address found inside a string (or None), and pass
# that extracted address downstream instead of the raw metadata value.
#
# backend/onyx/kg/extractions/extraction_processing.py
#   * Drops the import of is_email from extraction_utils and imports
#     extract_email from formatting_utils.
#   * process_single_chunk: for string metadata values and for items of list
#     metadata values, the check becomes `if email := extract_email(...)` and
#     kg_process_person() now receives `person=email` (the extracted address)
#     rather than the original value/item.
#
# backend/onyx/kg/utils/extraction_utils.py
#   * Removes `import re` and the is_email() helper, which tested
#     re.match(r"[^@]+@[^@]+\.[^@]+", email).
#   * _update_implied_entities_relationships: the owner check becomes
#     `extract_email(owner) is None`.
#
# backend/onyx/kg/utils/formatting_utils.py
#   * Adds extract_email(), which runs re.search() with
#     r"([A-Za-z0-9._+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+)" and returns
#     match.group(0), or None when nothing matches.
#   * kg_email_processing: for non-employees the company name is now
#     company_domain.title() instead of company_domain.capitalize(); a TODO
#     notes that gmail and other unrelated domains are currently turned into
#     accounts.
#
# Review notes:
#   * NOTE(review): is_email() used re.match (anchored at the start of the
#     string) while extract_email() uses re.search with a stricter character
#     set, so the set of accepted strings changes - values that merely contain
#     an address now count. Confirm this is intended for every call site.
#   * NOTE(review): no hunk adds `import re` to formatting_utils.py; presumably
#     it is already imported there - TODO confirm.
#   * NOTE(review): the owner hunk only tests for None; the code after the
#     `continue` is not visible here, so verify whether it should use the
#     extracted address rather than the raw `owner` string.
#   * NOTE(review): extract_email() hands its argument straight to re.search;
#     the list branch calls it on every `item`, so this assumes list metadata
#     items are always strings - TODO confirm.
diff --git a/backend/onyx/kg/extractions/extraction_processing.py b/backend/onyx/kg/extractions/extraction_processing.py index 2c2afbaa5f8..9f1efc919d4 100644 --- a/backend/onyx/kg/extractions/extraction_processing.py +++ b/backend/onyx/kg/extractions/extraction_processing.py @@ -45,7 +45,6 @@ from onyx.kg.models import KGEntityTypeInstructions from onyx.kg.models import KGExtractionInstructions from onyx.kg.utils.extraction_utils import EntityTypeMetadataTracker -from onyx.kg.utils.extraction_utils import is_email from onyx.kg.utils.extraction_utils import ( kg_document_entities_relationships_attribute_generation, ) @@ -54,6 +53,7 @@ from onyx.kg.utils.extraction_utils import prepare_llm_document_content from onyx.kg.utils.extraction_utils import trackinfo_to_str from onyx.kg.utils.formatting_utils import aggregate_kg_extractions +from onyx.kg.utils.formatting_utils import extract_email from onyx.kg.utils.formatting_utils import extract_relationship_type_id from onyx.kg.utils.formatting_utils import generalize_entities from onyx.kg.utils.formatting_utils import get_entity_type @@ -1045,14 +1045,14 @@ def process_single_chunk( if chunk.metadata: for attribute, value in chunk.metadata.items(): if isinstance(value, str): - if is_email(value): + if email := extract_email(value): ( implied_attribute_entities, implied_attribute_relationships, attribute_company_participant_emails, attribute_account_participant_emails, ) = kg_process_person( - person=value, + person=email, core_document_id_name=kg_document_extractions.kg_core_document_id_name, implied_entities=implied_attribute_entities, implied_relationships=implied_attribute_relationships, @@ -1069,14 +1069,14 @@ def process_single_chunk( elif isinstance(value, list): email_attribute = False for item in value: - if is_email(item): + if email := extract_email(item): ( implied_attribute_entities, implied_attribute_relationships, attribute_company_participant_emails, attribute_account_participant_emails, ) = 
kg_process_person( - person=item, + person=email, core_document_id_name=kg_document_extractions.kg_core_document_id_name, implied_entities=implied_attribute_entities, implied_relationships=implied_attribute_relationships, diff --git a/backend/onyx/kg/utils/extraction_utils.py b/backend/onyx/kg/utils/extraction_utils.py index 5479de9ea1b..5240e48dc79 100644 --- a/backend/onyx/kg/utils/extraction_utils.py +++ b/backend/onyx/kg/utils/extraction_utils.py @@ -1,4 +1,3 @@ -import re from collections import defaultdict from onyx.configs.constants import OnyxCallTypes @@ -18,6 +17,7 @@ from onyx.kg.models import KGDocumentEntitiesRelationshipsAttributes from onyx.kg.models import KGEnhancedDocumentMetadata from onyx.kg.models import KGEntityTypeClassificationInfo +from onyx.kg.utils.formatting_utils import extract_email from onyx.kg.utils.formatting_utils import generalize_entities from onyx.kg.utils.formatting_utils import kg_email_processing from onyx.kg.utils.formatting_utils import make_entity_id @@ -41,7 +41,7 @@ def _update_implied_entities_relationships( for owner in owner_list or []: - if not is_email(owner): + if extract_email(owner) is None: converted_relationships_to_attributes[relationship_type].append(owner) continue @@ -433,13 +433,6 @@ def prepare_llm_document_content( ) -def is_email(email: str) -> bool: - """ - Check if a string is a valid email address. 
- """ - return re.match(r"[^@]+@[^@]+\.[^@]+", email) is not None - - def trackinfo_to_str(trackinfo: KGAttributeTrackInfo | None) -> str: """Convert trackinfo to an LLM friendly string""" if trackinfo is None: diff --git a/backend/onyx/kg/utils/formatting_utils.py b/backend/onyx/kg/utils/formatting_utils.py index 8284e7c4677..e0c48f7acc3 100644 --- a/backend/onyx/kg/utils/formatting_utils.py +++ b/backend/onyx/kg/utils/formatting_utils.py @@ -158,6 +158,15 @@ def aggregate_kg_extractions( return aggregated_kg_extractions +def extract_email(email: str) -> str | None: + """ + Extract an email from an arbitrary string (if any). + Only the first email is returned. + """ + match = re.search(r"([A-Za-z0-9._+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+)", email) + return match.group(0) if match else None + + def kg_email_processing(email: str, kg_config_settings: KGConfigSettings) -> KGPerson: """ Process the email. @@ -173,7 +182,9 @@ def kg_email_processing(email: str, kg_config_settings: KGConfigSettings) -> KGP if employee: company = kg_config_settings.KG_VENDOR else: - company = company_domain.capitalize() + # TODO: maybe store a list of domains for each account and use that to match + # right now, gmail and other random domains are being converted into accounts + company = company_domain.title() return KGPerson(name=name, company=company, employee=employee)