Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/+e27ee18a.improvement.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:py:meth:`StructureCollection.select <opencosmo.StructureCollection.select>`, :py:meth:`StructureCollection.drop <opencosmo.StructureCollection.drop>`, and :py:meth:`StructureCollection.evaluate <opencosmo.StructureCollection.evaluate>` now support specifying columns in nested collections.
34 changes: 34 additions & 0 deletions docs/source/evaluating.rst
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,40 @@ There are two clear differences between this example and the one with a single d

You will also notice that we set :code:`format = "numpy"` in the call to :py:meth:`evaluate <opencosmo.StructureCollection.evaluate>`. With this option set, the data will be provided to our function as a dictionary of scalars (for halo_properties) and a dictionary of numpy arrays (for dm_particles). If we had chosen instead :code:`format = "astropy"` (the default), the data would have been provided as a dictionary of astropy quantities and a dictionary of quantity arrays, respectively.

If you have a nested :py:class:`StructureCollection <opencosmo.StructureCollection>`, it will be passed to your function directly. You can still select specific columns from these datasets though:


.. code-block:: python

collection = oc.open("haloproperties.hdf5", "haloparticles.hdf5", "galaxyproperties.hdf5", "galaxyparticles.hdf5")

def my_cool_function(halo_properties, dm_particles, galaxies):
# the "galaxies" argument will be a StructureCollection
# You can use its data directly, iterate through its galaxies
# or further filter.

# do fun stuff here.


collection = collection.evaluate(
my_cool_function,
insert=True,
format="numpy",
dm_particles=["x","y","z"],
halo_properties=[
"fof_halo_center_x",
"fof_halo_center_y",
"fof_halo_center_z",
"sod_halo_com_x",
"sod_halo_com_y",
"sod_halo_com_z"
],
galaxies={
"galaxy_properties": ["gal_mass_bar", "gal_mass_star"],
"star_particles": ["x", "y", "z"]
}
)

Evaluating on a Single Dataset in a Structure Collection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
45 changes: 22 additions & 23 deletions src/opencosmo/collection/structure/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from astropy.units import Quantity # type: ignore

from opencosmo import dataset as ds
from opencosmo.dataset.evaluate import visit_dataset
from opencosmo.evaluate import insert, make_output_from_first_values, prepare_kwargs

if TYPE_CHECKING:
Expand All @@ -18,7 +19,7 @@
def visit_structure_collection(
function: Callable,
spec: Mapping[str, Optional[list[str]]],
collection: "StructureCollection",
collection: StructureCollection,
format: str = "astropy",
dtype: Optional[DTypeLike] = None,
evaluator_kwargs: dict[str, Any] = {},
Expand All @@ -31,10 +32,6 @@ def visit_structure_collection(
dtype = np.float64

storage = __make_output(function, to_visit, format, kwargs, iterable_kwargs)

if isinstance(to_visit, ds.Dataset):
raise NotImplementedError()

for i, structure in enumerate(to_visit.objects()):
if i == 0:
continue
Expand Down Expand Up @@ -65,7 +62,7 @@ def __make_input(structure: dict, format: str = "astropy"):

def __make_output(
function: Callable,
collection: "StructureCollection",
collection: StructureCollection,
format: str = "astropy",
kwargs: dict[str, Any] = {},
iterable_kwargs: dict[str, Sequence] = {},
Expand All @@ -87,21 +84,9 @@ def __make_output(


def __prepare_collection(
spec: dict[str, Optional[list[str]]], collection: "StructureCollection"
):
if len(spec.keys()) == 1:
ds_name = next(iter(spec.keys()))
dataset = collection[ds_name]
if isinstance(dataset, ds.Dataset):
columns = spec[ds_name]
if columns is not None:
return dataset.select(columns)
return dataset
else:
raise NotImplementedError
else:
collection = collection.with_datasets(list(spec.keys()))

spec: dict[str, Optional[list[str]]], collection: StructureCollection
) -> StructureCollection:
collection = collection.with_datasets(list(spec.keys()))
selections = {ds_name: cols for ds_name, cols in spec.items() if cols is not None}
collection = collection.select(**selections)
return collection
Expand All @@ -110,7 +95,7 @@ def __prepare_collection(
def __verify(
function: Callable,
spec: dict[str, Optional[list[str]]],
collection: "StructureCollection",
collection: StructureCollection,
kwarg_keys: Iterable[str],
):
datasets_in_collection = set(collection.keys())
Expand All @@ -134,7 +119,21 @@ def __verify(
continue
dataset = collection[ds_name]
if not isinstance(dataset, ds.Dataset):
raise NotImplementedError
if not isinstance(columns_in_spec, dict):
raise ValueError(
"When passing columns to a nested structure collection, the argument should be a dictionary"
)
for key, value in columns_in_spec.items():
if key not in dataset.keys():
raise ValueError(
f"No dataset {key} found in structure collection"
)
elif set(value).difference(dataset[key].columns):
raise ValueError(
"Missing some requested columns in this dataset!"
)
continue

columns_to_check = set(columns_in_spec)
columns_in_dataset = set(dataset.columns)
if not columns_to_check.issubset(columns_in_dataset):
Expand Down
65 changes: 47 additions & 18 deletions src/opencosmo/collection/structure/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,6 @@ def open(
) -> StructureCollection:
return sio.build_structure_collection(targets, ignore_empty)

@classmethod
def read(cls, *args, **kwargs) -> StructureCollection:
raise NotImplementedError

@property
def header(self):
return self.__header
Expand Down Expand Up @@ -251,7 +247,8 @@ def evaluate(
You can substantially improve the performance of this method by specifying
which data is actually needed to do the computation. This method will
automatically select the requested data, avoiding reading unneeded data
from disk.
from disk. The semantics for specifying the columns is identical to
:py:meth:`select <opencosmo.StructureCollection.select>`.

The function passed to this method must take arguments that match the names
of datasets that are stored in this collection. You can specify specific
Expand Down Expand Up @@ -287,7 +284,12 @@ def computation(halo_properties, dm_particles):
way will not respond to changes in unit convention.

It is not required to pass a list of column names for a given dataset. If a list
is not provided, all columns will be passed to the computation function.
is not provided, all columns will be passed to the computation function. Data will
be passed into the function as numpy arrays or astropy tables, depending on the
value of the "format" argument. However, if the evaluation involves a nested
structure collection (e.g. a galaxy collection inside a structure collection)
in addition to other datasets, the nested collection will be passed to your
function as a StructureCollection.

For more details and advanced usage see :ref:`Evaluating on Structure Collections`

Expand Down Expand Up @@ -429,7 +431,9 @@ def filter(self, *masks, on_galaxies: bool = False) -> StructureCollection:
filtered, self.__header, self.__datasets, self.__links, self.__hide_source
)

def select(self, **column_selections: str | Iterable[str]) -> StructureCollection:
def select(
self, **column_selections: str | Iterable[str] | dict
) -> StructureCollection:
"""
Update a dataset in the collection collection to only include the
columns specified. The name of the arguments to this function should be
Expand All @@ -446,12 +450,27 @@ def select(self, **column_selections: str | Iterable[str]) -> StructureCollectio
remove entire datasets from the collection with
:py:meth:`with_datasets <opencosmo.StructureCollection.with_datasets>`

For nested structure collections, such as galaxies within halos, you can pass
a nested dictionary:

.. code-block:: python

collection = oc.open("haloproperties.hdf5", "haloparticles.hdf5", "galaxyproperties.hdf5", "galaxyparticles.hdf5")

collection = collection.select(
halo_properties = ["fof_halo_mass", "sod_halo_mass", "sod_halo_cdelta"],
dm_particles = ["x", "y", "z"],
galaxies = {
"galaxy_properties": ["gal_mass_bar", "gal_mass_star"],
"star_particles": ["x", "y", "z"]
}
)


Parameters
----------
**column_selections : str | Iterable[str]
The columns to select from a given dataset
**column_selections : str | Iterable[str] | dict[str, Iterable[str]]
The columns to select from a given dataset or sub-collection

dataset : str
The dataset to select from.
Expand All @@ -478,13 +497,18 @@ def select(self, **column_selections: str | Iterable[str]) -> StructureCollectio
elif dataset not in self.__datasets:
raise ValueError(f"Dataset {dataset} not found in collection.")

output_ds = self.__datasets[dataset]
new_ds = self.__datasets[dataset]

if not isinstance(output_ds, oc.Dataset):
raise NotImplementedError
if not isinstance(new_ds, oc.Dataset):
if not isinstance(columns, dict):
raise ValueError(
"When working with nested structure collections, the argument should be a dictionary!"
)
new_ds = new_ds.select(**columns)
else:
new_ds = new_ds.select(columns)

new_dataset = output_ds.select(columns)
new_datasets[dataset] = new_dataset
new_datasets[dataset] = new_ds

return StructureCollection(
new_source,
Expand All @@ -497,10 +521,10 @@ def select(self, **column_selections: str | Iterable[str]) -> StructureCollectio
def drop(self, **columns_to_drop):
"""
Update the linked collection by dropping the specified columns
in the specified datasets. Follows the exact same semantics as
in the specified datasets. This method follows the exact same semantics as
:py:meth:`StructureCollection.select <opencosmo.StructureCollection.select>`.
Argument names should be datasets in this collection, and the argument
values should be a string or list of strings.
values should be a string, list of strings, or dictionary.

Datasets that are not included will not be modified. You can drop
entire datasets with :py:meth:`with_datasets <opencosmo.StructureCollection.with_datasets>`
Expand Down Expand Up @@ -535,7 +559,12 @@ def drop(self, **columns_to_drop):

elif dataset_name not in self.__datasets:
raise ValueError(f"Dataset {dataset_name} not found in collection.")
new_ds = self.__datasets[dataset_name].drop(columns)
new_ds = self.__datasets[dataset_name]
if isinstance(new_ds, oc.Dataset):
new_ds = new_ds.drop(columns)
elif isinstance(new_ds, StructureCollection):
new_ds = new_ds.drop(**columns)

new_datasets[dataset_name] = new_ds

return StructureCollection(
Expand Down Expand Up @@ -878,7 +907,7 @@ def with_datasets(self, datasets: list[str]):
"""

if not isinstance(datasets, list):
raise ValueError("Expected a list with at least one entries")
raise ValueError("Expected a list with at least one entry")

known_datasets = set(self.keys())
requested_datasets = set(datasets)
Expand Down
2 changes: 1 addition & 1 deletion src/opencosmo/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ def sort_by(self, column: str, invert: bool = False) -> Dataset:


"""
new_state = self.__state.sort_by(column, self.__handler, invert)
new_state = self.__state.sort_by(column, invert)
return Dataset(
self.__handler,
self.__header,
Expand Down
4 changes: 2 additions & 2 deletions src/opencosmo/dataset/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def __prepare(function: Callable, dataset: "Dataset", evaluator_kwargs: Iterable
function_arguments = set(signature(function).parameters.keys())

input_columns = function_arguments.intersection(dataset.columns)
if len(input_columns) == 0 and dataset.dtype in function_arguments:
if len(input_columns) == 0 and len(function_arguments) == 1:
return dataset
return dataset.select(input_columns)

Expand All @@ -122,7 +122,7 @@ def __verify(function: Callable, dataset: "Dataset", kwarg_names: Iterable[str])
missing = required_parameters - dataset_columns - kwarg_names
if not missing:
return
elif len(missing) > 1 or next(iter(missing)) != dataset.dtype:
elif len(missing) > 1:
raise ValueError(
f"All inputs to the function must either be column names or passed as keyword arguments! Found unknown input(s) {','.join(missing)}"
)
2 changes: 1 addition & 1 deletion src/opencosmo/dataset/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def select(self, columns: str | Iterable[str]):
new_derived,
)

def sort_by(self, column_name: str, handler: "DatasetHandler", invert: bool):
def sort_by(self, column_name: str, invert: bool):
if column_name not in self.columns:
raise ValueError(f"This dataset has no column {column_name}")
return DatasetState(
Expand Down
Loading
Loading