Skip to content

Commit 8921afa

Browse files
authored
Make generator configurable for stream executor (#59)
This makes it possible to feed some data to the first tasks instead of monotonic batches. This also means the iterable might terminate prior to max_batches.
1 parent 185df12 commit 8921afa

File tree

3 files changed

+38
-4
lines changed

3 files changed

+38
-4
lines changed

dplutils/pipeline/ray.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ class RayStreamGraphExecutor(StreamingGraphExecutor):
153153
all pending tasks for ray_poll_timeout seconds. The timeout gives
154154
opportunity to re-evaluate cluster resources in case it has expanded
155155
since last scheduling loop
156+
\*args, \*\*kwargs: These are passed to
157+
:py:class:`StreamingGraphExecutor<dplutils.pipeline.stream.StreamingGraphExecutor>`
156158
"""
157159
def __init__(self, *args, ray_poll_timeout: int = 20, **kwargs):
158160
super().__init__(*args, **kwargs)

dplutils/pipeline/stream.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
import networkx as nx
44
from abc import ABC, abstractmethod
55
from collections import deque
6+
from collections.abc import Generator
67
from dataclasses import dataclass, field
7-
from typing import Any
8+
from typing import Any, Callable
89
from dplutils.pipeline import PipelineTask, PipelineExecutor
910
from dplutils.pipeline.utils import deque_extract
1011

@@ -56,6 +57,15 @@ class StreamingGraphExecutor(PipelineExecutor, ABC):
5657
default, for each run, it generates an indefinite stream of input dataframes
5758
tagged with a monotonically incrementing batch id.
5859
60+
Args:
61+
max_batches: maximum number of batches from the source generator to feed
62+
to the input task(s). Default is None, which means either exhaust the
63+
source generator or run indefinitely.
64+
generator: A callable that when called returns a generator which yields
65+
dataframes. The yielded dataframes are assumed to be a single row, in
66+
which case input task batching will be honored.
67+
68+
5969
Implementations must override abstract methods for (remote) task submission
6070
and polling. The following must be overriden, see their docs for more:
6171
@@ -66,17 +76,18 @@ class StreamingGraphExecutor(PipelineExecutor, ABC):
6676
- :meth:`task_submit`
6777
- :meth:`task_submittable`
6878
"""
69-
def __init__(self, graph, max_batches=None):
79+
def __init__(self, graph, max_batches: int=None, generator: Callable[[], Generator[pd.DataFrame, None, None]]=None):
7080
super().__init__(graph)
7181
self.max_batches = max_batches
7282
# make a local copy of the graph with each node wrapped in a tracker
7383
# object
7484
self.stream_graph = nx.relabel_nodes(self.graph, StreamTask)
85+
self.generator_fun = generator or self.source_generator_fun
7586

7687
def execute(self):
7788
self.n_sourced = 0
7889
self.source_exhausted = False
79-
self.source_generator = self.source_generator_fun()
90+
self.source_generator = self.generator_fun()
8091
while True:
8192
batch = self.execute_until_output()
8293
if batch is None:
@@ -120,13 +131,18 @@ def resolve_completed(self):
120131
def process_source(self, source):
121132
source_batch = []
122133
for _ in range(source.task.batch_size or 1):
123-
source_batch.append(next(self.source_generator))
134+
try:
135+
source_batch.append(next(self.source_generator))
136+
except StopIteration:
137+
self.source_exhausted = True
138+
return
124139
self.n_sourced += 1
125140
if self.n_sourced == self.max_batches:
126141
self.source_exhausted = True
127142
break
128143
source.pending.appendleft(self.task_submit(source.task, source_batch))
129144
source.counter += 1
145+
return
130146

131147
def enqueue_tasks(self):
132148
# Work through the graph in reverse order, submitting any tasks as

tests/pipeline/test_stream_executor.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import pytest
2+
import pandas as pd
13
from dplutils.pipeline import PipelineTask
24
from dplutils.pipeline.stream import LocalSerialExecutor, StreamTask
35
from test_suite import PipelineExecutorTestSuite
@@ -24,3 +26,17 @@ def test_stream_exhausted_indicator_considers_splits(dummy_steps):
2426
assert pl.task_exhausted(a_task)
2527
a_task.split_pending.append(1)
2628
assert not pl.task_exhausted(a_task)
29+
30+
31+
@pytest.mark.parametrize('max_batches', [1,10,None])
32+
def test_stream_executor_generator_override(max_batches):
33+
st = PipelineTask('task_name', lambda x: x)
34+
def generator():
35+
n = 12
36+
for i in range(n):
37+
yield pd.DataFrame({'customgen': [i]})
38+
pl = LocalSerialExecutor([st], max_batches=max_batches, generator=generator)
39+
res = list(pl.run())
40+
expected_rows = max_batches if max_batches else 12
41+
assert len(res) == expected_rows
42+
assert pd.concat(res).customgen.to_list() == list(range(expected_rows))

0 commit comments

Comments
 (0)