Skip to content

Commit ade3c80

Browse files
committed
test and bugfix
1 parent 91e0621 commit ade3c80

File tree

6 files changed

+379
-18
lines changed

6 files changed

+379
-18
lines changed

backend/onyx/db/document.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -937,9 +937,14 @@ def get_unprocessed_kg_document_batch_for_connector(
937937
.where(
938938
and_(
939939
DocumentByConnectorCredentialPair.connector_id == connector_id,
940-
DbDocument.doc_updated_at >= kg_coverage_start,
941-
DbDocument.doc_updated_at
942-
>= datetime.now() - timedelta(days=kg_max_coverage_days),
940+
or_(
941+
DbDocument.doc_updated_at.is_(None),
942+
DbDocument.doc_updated_at
943+
>= max(
944+
kg_coverage_start,
945+
datetime.now() - timedelta(days=kg_max_coverage_days),
946+
),
947+
),
943948
or_(
944949
DbDocument.kg_stage.is_(None),
945950
DbDocument.kg_stage == KGStage.NOT_STARTED,
@@ -1185,14 +1190,17 @@ def check_for_documents_needing_kg_processing(
11851190
.where(
11861191
and_(
11871192
Connector.kg_processing_enabled.is_(True),
1188-
DbDocument.doc_updated_at >= kg_coverage_start,
1189-
DbDocument.doc_updated_at
1190-
>= datetime.now() - timedelta(days=kg_max_coverage_days),
11911193
or_(
1192-
or_(
1193-
DbDocument.kg_stage.is_(None),
1194-
DbDocument.kg_stage == KGStage.NOT_STARTED,
1194+
DbDocument.doc_updated_at.is_(None),
1195+
DbDocument.doc_updated_at
1196+
>= max(
1197+
kg_coverage_start,
1198+
datetime.now() - timedelta(days=kg_max_coverage_days),
11951199
),
1200+
),
1201+
or_(
1202+
DbDocument.kg_stage.is_(None),
1203+
DbDocument.kg_stage == KGStage.NOT_STARTED,
11961204
DbDocument.doc_updated_at > DbDocument.kg_processing_time,
11971205
),
11981206
)

backend/onyx/db/entities.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,8 @@ def transfer_entity(
130130
set_=dict(
131131
occurrences=KGEntity.occurrences + entity.occurrences,
132132
attributes=entity.attributes, # attribute can get updated after re-indexing
133+
entity_key=entity.entity_key,
134+
parent_key=entity.parent_key,
133135
event_time=entity.event_time,
134136
time_updated=datetime.now(),
135137
),
@@ -196,6 +198,9 @@ def merge_entities(
196198
document_id=document_id,
197199
alternative_names=list(alternative_names),
198200
occurrences=parent.occurrences + child.occurrences,
201+
attributes=parent.attributes | child.attributes,
202+
entity_key=child.entity_key,
203+
parent_key=child.parent_key,
199204
)
200205
.returning(KGEntity)
201206
)

backend/onyx/kg/utils/extraction_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -451,16 +451,16 @@ def track_metadata(
451451
)
452452
self.entity_attr_info[entity_type][attribute] = trackinfo
453453

454-
# if we see to many different values, we stop tracking
455-
if (
456-
trackinfo.values is None
457-
or len(trackinfo.values) > KG_METADATA_TRACKING_THRESHOLD
458-
):
459-
trackinfo.values = None
454+
# None means marked as don't track
455+
if trackinfo.values is None:
460456
continue
461457

462458
# track the value
463459
if isinstance(value, str):
464460
trackinfo.values.add(value)
465461
else:
466462
trackinfo.values.update(value)
463+
464+
# if we see too many different values, we stop tracking
465+
if len(trackinfo.values) > KG_METADATA_TRACKING_THRESHOLD:
466+
trackinfo.values = None

backend/tests/integration/common_utils/managers/document.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,14 @@ def _generate_dummy_document(
6666
document_id: str,
6767
cc_pair_id: int,
6868
content: str | None = None,
69+
extra_metadata: dict | None = None,
6970
) -> dict:
7071
text = content if content else f"This is test document {document_id}"
72+
73+
metadata: dict = {"document_id": document_id}
74+
if extra_metadata:
75+
metadata.update(extra_metadata)
76+
7177
return {
7278
"document": {
7379
"id": document_id,
@@ -78,8 +84,7 @@ def _generate_dummy_document(
7884
}
7985
],
8086
"source": DocumentSource.NOT_APPLICABLE,
81-
# just for testing metadata
82-
"metadata": {"document_id": document_id},
87+
"metadata": metadata,
8388
"semantic_identifier": f"Test Document {document_id}",
8489
"from_ingestion_api": True,
8590
},
@@ -128,12 +133,18 @@ def seed_doc_with_content(
128133
content: str,
129134
document_id: str | None = None,
130135
api_key: DATestAPIKey | None = None,
136+
metadata: dict | None = None,
131137
) -> SimpleTestDocument:
132138
# Use provided document_ids if available, otherwise generate random UUIDs
133139
if document_id is None:
134140
document_id = f"test-doc-{uuid4()}"
135141
# Create and ingest some documents
136-
document: dict = _generate_dummy_document(document_id, cc_pair.id, content)
142+
document: dict = _generate_dummy_document(
143+
document_id,
144+
cc_pair.id,
145+
content,
146+
extra_metadata=metadata,
147+
)
137148
response = requests.post(
138149
f"{API_SERVER_URL}/onyx-api/ingestion",
139150
json=document,
File renamed without changes.

0 commit comments

Comments
 (0)