onyx-dot-app · Weves · Apr 19, 2025 · Apr 18, 2025 · Apr 18, 2025
@@ -265,6 +265,7 @@ def _convert_page_to_document(
             # Extract basic page information
             page_id = page["id"]
             page_title = page["title"]
+            logger.info(f"Converting page {page_title} to document")
             page_url = build_confluence_document_id(
                 self.wiki_base, page["_links"]["webui"], self.is_cloud
             )
@@ -458,7 +459,9 @@ def _fetch_document_batches(
              - Attempt to convert it with convert_attachment_to_content(...)
              - If successful, create a new Section with the extracted text or summary.
         """
-        doc_count = 0
+
+        # number of documents/errors yielded
+        yield_count = 0
 
         checkpoint = copy.deepcopy(checkpoint)
         prev_doc_ids = checkpoint.last_seen_doc_ids
@@ -474,12 +477,17 @@ def _fetch_document_batches(
             expand=",".join(_PAGE_EXPANSION_FIELDS),
             limit=2 * self.batch_size,
         ):
+            # return the checkpoint once a full batch of results (documents or failures) has been counted
+            if yield_count >= self.batch_size:
+                return checkpoint
+
             if page["id"] in prev_doc_ids:
                 # There are a few seconds of fuzziness in the request,
                 # so we skip if we saw this page on the last run
                 continue
             # Build doc from page
             doc_or_failure = self._convert_page_to_document(page)
+            yield_count += 1
 
             if isinstance(doc_or_failure, ConnectorFailure):
                 yield doc_or_failure
@@ -497,14 +505,10 @@ def _fetch_document_batches(
                 continue
 
             # yield completed document
-            doc_count += 1
+
             checkpoint.last_seen_doc_ids.append(page["id"])
             yield doc_or_failure
 
-            # create checkpoint after enough documents have been processed
-            if doc_count >= self.batch_size:
-                return checkpoint
-
         checkpoint.has_more = False
         return checkpoint