
Commit b7febb3

sarahyurick and ayushdg authored
Fuse document iterate and extract stages (#1458)
* Fuse document iterate and extract stages
* ruff
* fix bug
* update docs and tutorial
* save progress
* update more tests
* ruff
* fix tests
* ruff
* update benchmark
* move class
* add missing import
* update comment

Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
Co-authored-by: Ayush Dattagupta <ayushdg95@gmail.com>
1 parent c2c626c commit b7febb3

14 files changed: +402 additions, −634 deletions

benchmarking/scripts/arxiv_e2e_pipeline_benchmark.py

Lines changed: 5 additions & 11 deletions
```diff
@@ -47,8 +47,7 @@
 from nemo_curator.stages.text.download.arxiv.extract import ArxivExtractor
 from nemo_curator.stages.text.download.arxiv.iterator import ArxivIterator
 from nemo_curator.stages.text.download.base import URLGenerator
-from nemo_curator.stages.text.download.base.extract import DocumentExtractStage
-from nemo_curator.stages.text.download.base.iterator import DocumentIterateStage
+from nemo_curator.stages.text.download.base.iterator import DocumentIterateExtractStage
 from nemo_curator.stages.text.download.base.url_generation import URLGenerationStage
 from nemo_curator.stages.text.filters import (
     FastTextLangId,
@@ -119,20 +118,15 @@ def __post_init__(self) -> None:
             limit=self.url_limit,
         )
 
-        # Iterate stage (extracts records from tar files)
-        iterate_stage = DocumentIterateStage(
+        # Iterate-extract stage (extracts records from tar files and cleans LaTeX to text)
+        iterate_extract_stage = DocumentIterateExtractStage(
             iterator=ArxivIterator(log_frequency=self.log_frequency),
-            record_limit=self.record_limit,
-            add_filename_column=self.add_filename_column,
-        )
-
-        # Extract stage (cleans LaTeX to text)
-        extract_stage = DocumentExtractStage(
             extractor=ArxivExtractor(),
+            record_limit=self.record_limit,
             add_filename_column=self.add_filename_column,
         )
 
-        self.stages = [url_stage, iterate_stage, extract_stage]
+        self.stages = [url_stage, iterate_extract_stage]
         self.name = "local_arxiv_extract"
         super().__init__()
 
```

docs/about/concepts/text/data-acquisition-concepts.md

Lines changed: 2 additions & 3 deletions
```diff
@@ -16,12 +16,11 @@ This guide covers the core concepts for acquiring and processing text data from
 
 ## Overview
 
-Data acquisition in NeMo Curator follows a four-stage architecture:
+Data acquisition in NeMo Curator follows a three-stage architecture:
 
 1. **Generate URLs**: Discover and generate download URLs from minimal input
 2. **Download**: Retrieve raw data files from remote sources
-3. **Iterate**: Extract individual records from downloaded containers
-4. **Extract**: Convert raw content to clean, structured text
+3. **Iterate** and **Extract**: Extract individual records from downloaded containers and convert raw content to clean, structured text
 
 This process transforms diverse remote data sources into a standardized `DocumentBatch` that can be used throughout the text curation pipeline.
 
```

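The three-stage flow described in the docs change above can be sketched end-to-end with plain-Python stand-ins. All names here (`generate_urls`, `download`, `iterate_and_extract`) are illustrative, not Curator's API; the point is that iteration and extraction now happen in a single pass per file instead of two separate stages:

```python
def generate_urls(seed: str) -> list[str]:
    # Stage 1: expand minimal input into concrete download URLs.
    return [f"{seed}/part-{i}.tar" for i in range(2)]

def download(url: str) -> str:
    # Stage 2: pretend to fetch the file; return a local path.
    return url.split("/")[-1]

def iterate_and_extract(path: str) -> list[dict]:
    # Stage 3: yield raw records from the container and clean them
    # in the same pass, rather than materializing raw records for a
    # separate extract stage.
    raw_records = [{"raw": f"{path}:{n}"} for n in range(2)]
    return [{"text": r["raw"].upper()} for r in raw_records]

batch = [
    rec
    for url in generate_urls("https://example.com/dump")
    for rec in iterate_and_extract(download(url))
]
```

In the real pipeline each stage exchanges tasks (`FileGroupTask`, `DocumentBatch`) instead of bare lists, but the dataflow is the same.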
docs/curate-text/load-data/custom.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -16,7 +16,7 @@ Create custom data loading pipelines using Curator. This guide shows how to buil
 
 ## How It Works
 
-Curator uses the same **4-step pipeline pattern** described in {ref}`Data Acquisition Concepts <about-concepts-text-data-acquisition>` for custom data loading. Each step uses an abstract base class with corresponding processing stages that compose into pipelines.
+Curator uses the same **3-step pipeline pattern** described in {ref}`Data Acquisition Concepts <about-concepts-text-data-acquisition>` for custom data loading. Each step uses an abstract base class with corresponding processing stages that compose into pipelines.
 
 ---
 
```

nemo_curator/stages/text/download/README.md

Lines changed: 2 additions & 3 deletions
````diff
@@ -2,13 +2,12 @@
 
 ## 📁 Structure Overview
 
-The framework follows a **4-step pipeline pattern** where each step is implemented as an abstract base class with corresponding stages:
+The framework follows a **3-step pipeline pattern** where each step is implemented as an abstract base class with corresponding stages:
 
 ```
 1. URLGenerator → URLGenerationStage (URLs from config/input)
 2. DocumentDownloader → DocumentDownloadStage (local files from URLs)
-3. DocumentIterator → DocumentIterateStage (raw records from files)
-4. DocumentExtractor → DocumentExtractStage (structured data from records)
+3. DocumentIterator and DocumentExtractor → DocumentIterateExtractStage (structured data from files)
 ```
 
 ## 🛠️ Implementation Steps
````
nemo_curator/stages/text/download/base/extract.py

Lines changed: 1 addition & 73 deletions
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,16 +13,8 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import Any
 
-import pandas as pd
-from loguru import logger
-
-from nemo_curator.stages.base import ProcessingStage
-from nemo_curator.tasks import DocumentBatch
-from nemo_curator.utils.column_utils import resolve_filename_column
-
 
 class DocumentExtractor(ABC):
     """Abstract base class for document extractors.
@@ -45,67 +37,3 @@ def input_columns(self) -> list[str]:
     def output_columns(self) -> list[str]:
         """Define output columns - produces DocumentBatch with records."""
         ...
-
-
-@dataclass
-class DocumentExtractStage(ProcessingStage[DocumentBatch, DocumentBatch]):
-    """Stage that extracts structured content from raw records.
-
-    Takes DocumentBatch with raw content and produces DocumentBatch with extracted content.
-    This is for cases where iteration and extraction are separate steps.
-    """
-
-    extractor: DocumentExtractor
-    add_filename_column: bool | str = True
-
-    def __post_init__(self):
-        """Initialize the stage."""
-        self.filename_col = resolve_filename_column(self.add_filename_column)
-        self.name = f"extract_{self.extractor.__class__.__name__.lower()}"
-
-    def inputs(self) -> tuple[list[str], list[str]]:
-        """Define input requirements - expects DocumentBatch with dict records."""
-        return (["data"], self.extractor.input_columns() + ([self.filename_col] if self.add_filename_column else []))  # type: ignore[reportReturnType]
-
-    def outputs(self) -> tuple[list[str], list[str]]:
-        """Define output - produces DocumentBatch with processed records."""
-        return (["data"], self.extractor.output_columns() + ([self.filename_col] if self.add_filename_column else []))  # type: ignore[reportReturnType]
-
-    def process(self, task: DocumentBatch) -> DocumentBatch:
-        """Extract structured content from raw records.
-
-        Args:
-            task (DocumentBatch): Batch containing records
-
-        Returns:
-            DocumentBatch: Batch containing extracted records
-        """
-        extracted_records = []
-
-        for _, row in task.data.iterrows():
-            # Convert pandas Series to dict
-            record_dict = row.to_dict()
-
-            # Extract structured content
-            extracted = self.extractor.extract(record_dict)
-            if extracted is not None:
-                if self.add_filename_column:
-                    if self.filename_col in extracted:
-                        msg = f"Since add_filename_col is specified, we'll overwrite ({self.filename_col}) from the input data."
-                        logger.warning(msg)
-
-                    extracted[self.filename_col] = record_dict[self.filename_col]  # type: ignore[reportReturnType]
-                extracted_records.append(extracted)
-
-        # Convert to DataFrame
-        df = pd.DataFrame(extracted_records)
-
-        return DocumentBatch(
-            task_id=task.task_id,
-            dataset_name=task.dataset_name,
-            data=df,
-            _metadata={
-                **task._metadata,
-            },
-            _stage_perf=task._stage_perf,
-        )
```

nemo_curator/stages/text/download/base/iterator.py

Lines changed: 44 additions & 19 deletions
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 from nemo_curator.tasks import DocumentBatch, FileGroupTask
 from nemo_curator.utils.column_utils import resolve_filename_column
 
+from .extract import DocumentExtractor
+
 
 class DocumentIterator(ABC):
     """Abstract base class for document iterators.
@@ -45,54 +47,77 @@ def output_columns(self) -> list[str]:
 
 
 @dataclass
-class DocumentIterateStage(ProcessingStage[FileGroupTask, DocumentBatch]):
-    """Stage that iterates through downloaded files and extracts records.
+class DocumentIterateExtractStage(ProcessingStage[FileGroupTask, DocumentBatch]):
+    """Stage that iterates through downloaded files with DocumentIterator,
+    then extracts structured content from raw records with DocumentExtractor.
 
-    Takes local file paths and produces a DocumentBatch with records.
-    All iterators yield dict[str, str] records uniformly.
+    Takes local file paths and produces a DocumentBatch with extracted content.
+    If DocumentIterator produces the final format, then DocumentExtractor is not needed.
     """
 
     iterator: DocumentIterator
+    extractor: DocumentExtractor | None = None
     record_limit: int | None = None
     add_filename_column: bool | str = True
 
     def __post_init__(self):
        """Initialize the stage."""
        self.filename_col = resolve_filename_column(self.add_filename_column)
-        self.name = f"iterate_{self.iterator.__class__.__name__.lower()}"
+        if self.extractor:
+            self.name = f"iterate_extract_{self.iterator.__class__.__name__.lower()}_{self.extractor.__class__.__name__.lower()}"
+        else:
+            self.name = f"iterate_{self.iterator.__class__.__name__.lower()}"
 
     def inputs(self) -> tuple[list[str], list[str]]:
         """Define input requirements - expects FileGroupTask with local file paths."""
         return (["data"], [])
 
     def outputs(self) -> tuple[list[str], list[str]]:
-        """Define output - produces DocumentBatch with records."""
-        return (["data"], self.iterator.output_columns() + ([self.filename_col] if self.add_filename_column else []))  # type: ignore[reportReturnType]
+        """Define output - produces DocumentBatch with processed records."""
+        if self.extractor:
+            return (["data"], self.extractor.output_columns() + ([self.filename_col] if self.add_filename_column else []))
+        else:
+            return (["data"], self.iterator.output_columns() + ([self.filename_col] if self.add_filename_column else []))
 
     def process(self, task: FileGroupTask) -> DocumentBatch:
-        """Iterate through files and extract records.
+        """Iterate through files and extract structured content.
 
         Args:
             task (FileGroupTask): Task containing local file paths
 
         Returns:
-            DocumentBatch: Batch containing records
+            DocumentBatch: Batch containing extracted records
         """
         records = []
 
         for file_path in task.data:
             try:
                 record_count = 0
                 iterator_result = self.iterator.iterate(file_path)
-                if iterator_result is not None:
-                    for record_dict in iterator_result:
-                        if self.record_limit and record_count >= self.record_limit:
-                            break
-                        if self.add_filename_column:
-                            # TODO: Support cloud storage https://github.com/NVIDIA-NeMo/Curator/issues/779
-                            record_dict[self.filename_col] = os.path.basename(file_path)  # type: ignore[reportReturnType]
-                        records.append(record_dict)
-                        record_count += 1
+
+                if iterator_result is None:
+                    continue
+
+                for record_dict in iterator_result:
+                    if self.record_limit and record_count >= self.record_limit:
+                        break
+
+                    # Add filename early
+                    if self.add_filename_column:
+                        record_dict[self.filename_col] = os.path.basename(file_path)
+
+                    # Extract structured content
+                    extracted = self.extractor.extract(record_dict) if self.extractor else record_dict
+
+                    if extracted is None:
+                        continue
+
+                    # Ensure filename is preserved
+                    if self.add_filename_column:
+                        extracted[self.filename_col] = record_dict[self.filename_col]
+
+                    records.append(extracted)
+                    record_count += 1
 
             except Exception as e:  # noqa: BLE001
                 logger.error(f"Error iterating {file_path}: {e}")
```
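Stripped of the library's task types, the fused loop in `process` behaves like the sketch below. The stub iterator and extractor stand in for `DocumentIterator`/`DocumentExtractor` implementations, and the helper name `iterate_extract` is illustrative only; it mirrors the diff's behavior, including the optional extractor, `record_limit`, and an extractor that drops records by returning `None`:

```python
import os

def iterate_extract(file_paths, iterate, extract=None, record_limit=None,
                    filename_col="file_name"):
    """Fused pass: iterate raw records and extract each one immediately."""
    records = []
    for file_path in file_paths:
        record_count = 0
        result = iterate(file_path)
        if result is None:
            continue
        for record in result:
            if record_limit and record_count >= record_limit:
                break
            # Add filename early so the extractor can see it.
            record[filename_col] = os.path.basename(file_path)
            extracted = extract(record) if extract else record
            if extracted is None:
                continue  # extractor may drop a record entirely
            # Ensure filename survives extractors that build a new dict.
            extracted[filename_col] = record[filename_col]
            records.append(extracted)
            record_count += 1
    return records

# Stubs standing in for real iterator/extractor implementations.
def stub_iterate(path):
    return [{"raw": f"{path}-{i}"} for i in range(3)]

def stub_extract(rec):
    # Drop the record ending in "2"; uppercase the rest.
    return None if rec["raw"].endswith("2") else {"text": rec["raw"].upper()}

out = iterate_extract(["/tmp/a.tar"], stub_iterate, stub_extract)
```

Note that dropped records do not count toward `record_limit`, matching the new code path where `continue` runs before `record_count += 1`.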

nemo_curator/stages/text/download/base/stage.py

Lines changed: 10 additions & 18 deletions
```diff
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,20 +18,19 @@
 from nemo_curator.tasks import DocumentBatch, _EmptyTask
 
 from .download import DocumentDownloader, DocumentDownloadStage
-from .extract import DocumentExtractor, DocumentExtractStage
-from .iterator import DocumentIterateStage, DocumentIterator
+from .extract import DocumentExtractor
+from .iterator import DocumentIterateExtractStage, DocumentIterator
 from .url_generation import URLGenerationStage, URLGenerator
 
 
 @dataclass
 class DocumentDownloadExtractStage(CompositeStage[_EmptyTask, DocumentBatch]):
-    """Composite stage that combines URL generation, download, iterate, and extract stages.
+    """Composite stage that combines URL generation, download, and iterate-extract stages.
 
-    This supports the full 4-step pipeline pattern like Common Crawl:
+    This supports the full 3-step pipeline pattern like Common Crawl:
     1. Generate URLs from minimal input
     2. Download files from URLs
-    3. Iterate through files to extract raw records
-    4. Extract structured content from raw records
+    3. Iterate through files to extract structured content
 
     """
 
@@ -56,22 +55,15 @@ def __post_init__(self):
             downloader=self.downloader,
         )
 
-        # Iterate stage
-        iterate_stage = DocumentIterateStage(
+        # Iterate-extract stage
+        iterate_extract_stage = DocumentIterateExtractStage(
             iterator=self.iterator,
+            extractor=self.extractor,
             record_limit=self.record_limit,
             add_filename_column=self.add_filename_column,
         )
 
-        # Extract stage (if extractor provided)
-        stages = [url_stage, download_stage, iterate_stage]
-        if self.extractor:
-            extract_stage = DocumentExtractStage(
-                extractor=self.extractor,
-                add_filename_column=self.add_filename_column,
-            )
-            stages.append(extract_stage)
-
+        stages = [url_stage, download_stage, iterate_extract_stage]
         self.stages = stages
 
         url_generator_name = self.url_generator.__class__.__name__.lower()
```
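Before this commit the composite emitted three or four stages depending on whether an extractor was provided; now it always emits exactly three, with the extractor simply optional inside the fused stage. A toy stand-in for the new `__post_init__` logic (`MockStage` and `build_stages` are illustrative, not Curator's API):

```python
from dataclasses import dataclass

@dataclass
class MockStage:
    """Minimal stand-in for a Curator processing stage."""
    name: str

def build_stages(extractor=None):
    # Mirrors the composite after the change: iterate and extract
    # always share one stage, whether or not an extractor is given.
    iterate_extract = MockStage("iterate_extract" if extractor else "iterate")
    return [MockStage("url_generation"), MockStage("download"), iterate_extract]

with_extractor = build_stages(extractor=object())
without_extractor = build_stages()
```

Callers are unaffected: `DocumentDownloadExtractStage` still takes the same four components (URL generator, downloader, iterator, optional extractor); only the internal stage list shrinks.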
