Skip to content

Commit 18ecc46

Browse files
evan-onyx authored and aronszanto committed
address getting attachments forever (onyx-dot-app#4562)
* address getting attachments forever
* fix unit tests
1 parent 22ba502 commit 18ecc46

File tree

2 files changed

+18
-17
lines changed

2 files changed

+18
-17
lines changed

backend/onyx/connectors/confluence/connector.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,7 @@ def _convert_page_to_document(
265265
# Extract basic page information
266266
page_id = page["id"]
267267
page_title = page["title"]
268+
logger.info(f"Converting page {page_title} to document")
268269
page_url = build_confluence_document_id(
269270
self.wiki_base, page["_links"]["webui"], self.is_cloud
270271
)
@@ -396,7 +397,9 @@ def _fetch_page_attachments(
396397
)
397398
continue
398399

399-
logger.info(f"Processing attachment: {attachment['title']}")
400+
logger.info(
401+
f"Processing attachment: {attachment['title']} attached to page {page['title']}"
402+
)
400403

401404
# Attempt to get textual content or image summarization:
402405
object_url = build_confluence_document_id(
@@ -458,7 +461,9 @@ def _fetch_document_batches(
458461
- Attempt to convert it with convert_attachment_to_content(...)
459462
- If successful, create a new Section with the extracted text or summary.
460463
"""
461-
doc_count = 0
464+
465+
# number of documents/errors yielded
466+
yield_count = 0
462467

463468
checkpoint = copy.deepcopy(checkpoint)
464469
prev_doc_ids = checkpoint.last_seen_doc_ids
@@ -474,12 +479,17 @@ def _fetch_document_batches(
474479
expand=",".join(_PAGE_EXPANSION_FIELDS),
475480
limit=2 * self.batch_size,
476481
):
482+
# create checkpoint after enough documents have been processed
483+
if yield_count >= self.batch_size:
484+
return checkpoint
485+
477486
if page["id"] in prev_doc_ids:
478487
# There are a few seconds of fuzziness in the request,
479488
# so we skip if we saw this page on the last run
480489
continue
481490
# Build doc from page
482491
doc_or_failure = self._convert_page_to_document(page)
492+
yield_count += 1
483493

484494
if isinstance(doc_or_failure, ConnectorFailure):
485495
yield doc_or_failure
@@ -497,14 +507,10 @@ def _fetch_document_batches(
497507
continue
498508

499509
# yield completed document
500-
doc_count += 1
510+
501511
checkpoint.last_seen_doc_ids.append(page["id"])
502512
yield doc_or_failure
503513

504-
# create checkpoint after enough documents have been processed
505-
if doc_count >= self.batch_size:
506-
return checkpoint
507-
508514
checkpoint.has_more = False
509515
return checkpoint
510516

backend/tests/unit/onyx/connectors/confluence/test_confluence_checkpointing.py

Lines changed: 5 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ def test_load_from_checkpoint_happy_path(
148148
assert confluence_client is not None, "bad test setup"
149149
paginated_cql_mock = cast(MagicMock, confluence_client.paginated_cql_retrieval)
150150
paginated_cql_mock.side_effect = [
151-
[mock_page1, mock_page2],
151+
[mock_page1, mock_page2, mock_page3],
152152
[], # comments
153153
[], # attachments
154154
[], # comments
@@ -366,22 +366,16 @@ def test_checkpoint_progress(
366366
confluence_connector, 0, end_time
367367
)
368368

369-
assert len(outputs) == 2
369+
assert len(outputs) == 1
370370

371371
first_checkpoint = outputs[0].next_checkpoint
372-
last_checkpoint = outputs[-1].next_checkpoint
373372

374373
assert first_checkpoint == ConfluenceCheckpoint(
375374
last_updated=later_timestamp.timestamp(),
376-
has_more=True,
375+
has_more=False,
377376
last_seen_doc_ids=["1", "2"],
378377
)
379378

380-
# Verify checkpoint contains both document IDs and latest timestamp
381-
assert last_checkpoint == ConfluenceCheckpoint(
382-
last_updated=later_timestamp.timestamp(), has_more=False, last_seen_doc_ids=[]
383-
)
384-
385379
assert len(outputs[0].items) == 2
386380
assert isinstance(outputs[0].items[0], Document)
387381
assert outputs[0].items[0].semantic_identifier == "Page 1"
@@ -404,11 +398,12 @@ def test_checkpoint_progress(
404398
]
405399

406400
# Use the checkpoint from first run
401+
first_checkpoint.has_more = True
407402
outputs_with_checkpoint = load_everything_from_checkpoint_connector_from_checkpoint(
408403
confluence_connector, 0, end_time, first_checkpoint
409404
)
410405

411-
# Verify no documents were processed since they were in last_seen_doc_ids
406+
# Verify only the new page was processed since the others were in last_seen_doc_ids
412407
assert len(outputs_with_checkpoint) == 1
413408
assert len(outputs_with_checkpoint[0].items) == 1
414409
assert isinstance(outputs_with_checkpoint[0].items[0], Document)

0 commit comments

Comments
 (0)