|
7 | 7 | import numpy as np
|
8 | 8 | from rapidfuzz.fuzz import ratio
|
9 | 9 | from sklearn.cluster import SpectralClustering # type: ignore
|
10 |
| -from sqlalchemy import text |
11 |
| -from thefuzz import fuzz # type: ignore |
12 | 10 |
|
13 | 11 | from onyx.db.document import update_document_kg_info
|
14 | 12 | from onyx.db.engine import get_session_with_current_tenant
|
15 | 13 | from onyx.db.entities import add_entity
|
| 14 | +from onyx.db.entities import delete_entities_by_id_names |
16 | 15 | from onyx.db.entities import get_entities_by_grounding
|
17 |
| -from onyx.db.entities import KGEntityExtractionStaging |
18 | 16 | from onyx.db.entity_type import get_determined_grounded_entity_types
|
19 | 17 | from onyx.db.relationships import add_relationship
|
20 | 18 | from onyx.db.relationships import add_relationship_type
|
@@ -357,17 +355,15 @@ def _match_ungrounded_ge_entities(
|
357 | 355 |
|
358 | 356 | # Try fuzzy matching with grounded entities
|
359 | 357 | for grounded_entity in grounded_list:
|
360 |
| - score = fuzz.ratio( |
361 |
| - ungrounded_entity.lower(), grounded_entity.lower() |
362 |
| - ) |
| 358 | + score = ratio(ungrounded_entity.lower(), grounded_entity.lower()) |
363 | 359 | if score > fuzzy_match_threshold and score > best_score:
|
364 | 360 | best_match = grounded_entity
|
365 | 361 | best_score = score
|
366 | 362 |
|
367 | 363 | # Try fuzzy matching with previously processed ungrounded entities
|
368 | 364 | if not best_match:
|
369 | 365 | for processed_entity in processed_entities[entity_type]:
|
370 |
| - score = fuzz.ratio( |
| 366 | + score = ratio( |
371 | 367 | ungrounded_entity.lower(), processed_entity.lower()
|
372 | 368 | )
|
373 | 369 | if score > fuzzy_match_threshold and score > best_score:
|
@@ -440,9 +436,7 @@ def _match_determined_ge_entities(
|
440 | 436 |
|
441 | 437 | # Try fuzzy matching with grounded entities
|
442 | 438 | for grounded_entity in determined_entities_list:
|
443 |
| - score = fuzz.ratio( |
444 |
| - ungrounded_entity.lower(), grounded_entity.lower() |
445 |
| - ) |
| 439 | + score = ratio(ungrounded_entity.lower(), grounded_entity.lower()) |
446 | 440 | if score > fuzzy_match_threshold and score > best_score:
|
447 | 441 | best_match = grounded_entity
|
448 | 442 | best_score = score
|
@@ -504,86 +498,42 @@ def kg_clustering(
|
504 | 498 |
|
505 | 499 | relationships = get_all_relationships(db_session, kg_stage=KGStage.EXTRACTED)
|
506 | 500 |
|
507 |
| - grounded_entities: set[KGEntityExtractionStaging] = set( |
508 |
| - get_entities_by_grounding( |
509 |
| - db_session, KGStage.EXTRACTED, KGGroundingType.GROUNDED |
510 |
| - ) |
| 501 | + grounded_entities = get_entities_by_grounding( |
| 502 | + db_session, KGStage.EXTRACTED, KGGroundingType.GROUNDED |
511 | 503 | )
|
512 | 504 |
|
513 | 505 | ## Clustering
|
514 | 506 |
|
515 | 507 | # TODO: re-implement clustering of ungrounded entities as well as
|
516 |
| - # grounded entities that do not have a source document with deep extraction enabled! |
517 |
| - # For now we would just dedupe grounded entities that have very similar names |
| 508 | + # grounded entities that do not have a source document with deep extraction |
| 509 | + # enabled! |
| 510 | + # For now we just create a trivial entity mapping from the |
| 511 | + # 'unclustered' entities to the 'clustered' entities, so we can simply |
| 512 | + # transfer the entity information from the Staging to the Normalized |
| 513 | + # tables. |
518 | 514 | # This will be reimplemented when deep extraction is enabled.
|
519 | 515 |
|
520 |
| - THRESHOLD = 96 |
521 |
| - while grounded_entities: |
522 |
| - clustered: list[str] = [] |
523 |
| - |
524 |
| - entity = grounded_entities.pop() |
525 |
| - clustered.append(entity.id_name) |
| 516 | + ## Database operations |
526 | 517 |
|
527 |
| - primary_entity = entity |
528 |
| - occurrences = entity.occurrences or 1 |
529 |
| - names: set[str] = {entity.name} |
| 518 | + # create the clustered objects - entities |
530 | 519 |
|
531 |
| - # find a list of entities with very similar names |
| 520 | + transferred_entities: list[str] = [] |
| 521 | + for grounded_entity in grounded_entities: |
532 | 522 | with get_session_with_current_tenant() as db_session:
|
533 |
| - # uses GIN index, very efficient |
534 |
| - db_session.execute(text("SET pg_trgm.similarity_threshold = 0.6")) |
535 |
| - similar_entities = ( |
536 |
| - db_session.query(KGEntityExtractionStaging) |
537 |
| - .filter( |
538 |
| - KGEntityExtractionStaging.entity_type_id_name |
539 |
| - == entity.entity_type_id_name, |
540 |
| - ~KGEntityExtractionStaging.clustered, |
541 |
| - KGEntityExtractionStaging.id_name != entity.id_name, |
542 |
| - KGEntityExtractionStaging.clustering_name.op("%")( |
543 |
| - entity.clustering_name |
544 |
| - ), |
545 |
| - ) |
546 |
| - .all() |
547 |
| - ) |
548 |
| - for similar in similar_entities: |
549 |
| - # skip those with number so we don't cluster version1 and version2 |
550 |
| - if any(char.isdigit() for char in similar.clustering_name): |
551 |
| - continue |
552 |
| - if ratio(similar.clustering_name, entity.clustering_name) > THRESHOLD: |
553 |
| - if similar in grounded_entities: |
554 |
| - grounded_entities.remove(similar) |
555 |
| - clustered.append(similar.id_name) |
556 |
| - names.add(similar.name) |
557 |
| - occurrences += similar.occurrences or 1 |
558 |
| - |
559 |
| - if ( |
560 |
| - primary_entity.document_id is None |
561 |
| - and similar.document_id is not None |
562 |
| - ): |
563 |
| - primary_entity = similar |
564 |
| - |
565 |
| - # only keep the primary entity |
566 |
| - names.remove(entity.name) |
567 | 523 | added_entity = add_entity(
|
568 | 524 | db_session,
|
569 | 525 | KGStage.NORMALIZED,
|
570 |
| - entity_type=primary_entity.entity_type_id_name, |
571 |
| - name=primary_entity.name, |
572 |
| - occurrences=occurrences, |
573 |
| - document_id=primary_entity.document_id, |
574 |
| - attributes=primary_entity.attributes or None, |
575 |
| - alternative_names=list(names), |
| 526 | + entity_type=grounded_entity.entity_type_id_name, |
| 527 | + name=grounded_entity.name, |
| 528 | + occurrences=grounded_entity.occurrences or 1, |
| 529 | + document_id=grounded_entity.document_id or None, |
| 530 | + attributes=grounded_entity.attributes or None, |
576 | 531 | )
|
577 |
| - if added_entity: |
578 |
| - db_session.query(KGEntityExtractionStaging).filter( |
579 |
| - KGEntityExtractionStaging.id_name.in_(clustered) |
580 |
| - ).update({"clustered": True}) |
581 |
| - db_session.commit() |
582 | 532 |
|
583 |
| - 0 / 0 |
584 |
| - # TODO: delete all the clustered ones |
| 533 | + db_session.commit() |
585 | 534 |
|
586 |
| - ## Database operations |
| 535 | + if added_entity: |
| 536 | + transferred_entities.append(added_entity.id_name) |
587 | 537 |
|
588 | 538 | transferred_relationship_types: list[str] = []
|
589 | 539 | for relationship_type in relationship_types:
|
@@ -656,14 +606,14 @@ def kg_clustering(
|
656 | 606 | except Exception as e:
|
657 | 607 | logger.error(f"Error deleting relationship types: {e}")
|
658 | 608 |
|
659 |
| - # try: |
660 |
| - # with get_session_with_current_tenant() as db_session: |
661 |
| - # delete_entities_by_id_names( |
662 |
| - # db_session, transferred_entities, kg_stage=KGStage.EXTRACTED |
663 |
| - # ) |
664 |
| - # db_session.commit() |
665 |
| - # except Exception as e: |
666 |
| - # logger.error(f"Error deleting entities: {e}") |
| 609 | + try: |
| 610 | + with get_session_with_current_tenant() as db_session: |
| 611 | + delete_entities_by_id_names( |
| 612 | + db_session, transferred_entities, kg_stage=KGStage.EXTRACTED |
| 613 | + ) |
| 614 | + db_session.commit() |
| 615 | + except Exception as e: |
| 616 | + logger.error(f"Error deleting entities: {e}") |
667 | 617 |
|
668 | 618 | # Update document kg info
|
669 | 619 |
|
|
0 commit comments