Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions backend/onyx/connectors/confluence/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ def _convert_page_to_document(
# Extract basic page information
page_id = page["id"]
page_title = page["title"]
logger.info(f"Converting page {page_title} to document")
page_url = build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
Expand Down Expand Up @@ -458,7 +459,9 @@ def _fetch_document_batches(
- Attempt to convert it with convert_attachment_to_content(...)
- If successful, create a new Section with the extracted text or summary.
"""
doc_count = 0

# number of documents/errors yielded
yield_count = 0

checkpoint = copy.deepcopy(checkpoint)
prev_doc_ids = checkpoint.last_seen_doc_ids
Expand All @@ -474,12 +477,17 @@ def _fetch_document_batches(
expand=",".join(_PAGE_EXPANSION_FIELDS),
limit=2 * self.batch_size,
):
# create checkpoint after enough documents have been processed
if yield_count >= self.batch_size:
return checkpoint

if page["id"] in prev_doc_ids:
# There are a few seconds of fuzziness in the request,
# so we skip if we saw this page on the last run
continue
# Build doc from page
doc_or_failure = self._convert_page_to_document(page)
yield_count += 1

if isinstance(doc_or_failure, ConnectorFailure):
yield doc_or_failure
Expand All @@ -497,14 +505,10 @@ def _fetch_document_batches(
continue

# yield completed document
doc_count += 1

checkpoint.last_seen_doc_ids.append(page["id"])
yield doc_or_failure

# create checkpoint after enough documents have been processed
if doc_count >= self.batch_size:
return checkpoint

checkpoint.has_more = False
return checkpoint

Expand Down
Loading