Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/+e27ee18a.improvement.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:py:meth:`StructureCollection.select <opencosmo.StructureCollection.select>`, :py:meth:`StructureCollection.drop <opencosmo.StructureCollection.drop>`, and :py:meth:`StructureCollection.evaluate <opencosmo.StructureCollection.evaluate>` now support specifying columns in nested collections.
34 changes: 34 additions & 0 deletions docs/source/evaluating.rst
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,40 @@ There are two clear differences between this example and the one with a single d

You will also notice that we set :code:`format = "numpy"` in the call to :py:meth:`evaluate <opencosmo.StructureCollection.evaluate>`. With this option set, the data will be provided to our function as a dictionary of scalars (for halo_properties) and a dictionary of numpy arrays (for dm_particles). If we had chosen instead :code:`format = "astropy"` (the default), the data would have been provided as a dictionary of astropy quantities and a dictionary of quantity arrays, respectively.

If you have a nested :py:class:`StructureCollection <opencosmo.StructureCollection>`, it will be passed to your function directly. You can still select specific columns from these datasets though:


.. code-block:: python

collection = oc.open("haloproperties.hdf5", "haloparticles.hdf5", "galaxyproperties.hdf5", "galaxyparticles.hdf5")

def my_cool_function(halo_properties, dm_particles, galaxies):
# the "galaxies" argument will be a StructureCollection
# You can use its data directly, iterate through its galaxies
# or further filter.

# do fun stuff here.


collection = collection.evaluate(
my_cool_function,
insert=True,
format="numpy",
dm_particles=["x","y","z"],
halo_properties=[
"fof_halo_center_x",
"fof_halo_center_y",
"fof_halo_center_z",
"sod_halo_com_x",
"sod_halo_com_y",
"sod_halo_com_z"
],
galaxies={
"galaxy_properties": ["gal_mass_bar", "gal_mass_star"],
"star_particles": ["x", "y", "z"]
}
)

Evaluating on a Single Dataset in a Structure Collection
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
45 changes: 22 additions & 23 deletions src/opencosmo/collection/structure/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from astropy.units import Quantity # type: ignore

from opencosmo import dataset as ds
from opencosmo.dataset.evaluate import visit_dataset
from opencosmo.evaluate import insert, make_output_from_first_values, prepare_kwargs

if TYPE_CHECKING:
Expand All @@ -18,7 +19,7 @@
def visit_structure_collection(
function: Callable,
spec: Mapping[str, Optional[list[str]]],
collection: "StructureCollection",
collection: StructureCollection,
format: str = "astropy",
dtype: Optional[DTypeLike] = None,
evaluator_kwargs: dict[str, Any] = {},
Expand All @@ -31,10 +32,6 @@ def visit_structure_collection(
dtype = np.float64

storage = __make_output(function, to_visit, format, kwargs, iterable_kwargs)

if isinstance(to_visit, ds.Dataset):
raise NotImplementedError()

for i, structure in enumerate(to_visit.objects()):
if i == 0:
continue
Expand Down Expand Up @@ -65,7 +62,7 @@ def __make_input(structure: dict, format: str = "astropy"):

def __make_output(
function: Callable,
collection: "StructureCollection",
collection: StructureCollection,
format: str = "astropy",
kwargs: dict[str, Any] = {},
iterable_kwargs: dict[str, Sequence] = {},
Expand All @@ -87,21 +84,9 @@ def __make_output(


def __prepare_collection(
spec: dict[str, Optional[list[str]]], collection: "StructureCollection"
):
if len(spec.keys()) == 1:
ds_name = next(iter(spec.keys()))
dataset = collection[ds_name]
if isinstance(dataset, ds.Dataset):
columns = spec[ds_name]
if columns is not None:
return dataset.select(columns)
return dataset
else:
raise NotImplementedError
else:
collection = collection.with_datasets(list(spec.keys()))

spec: dict[str, Optional[list[str]]], collection: StructureCollection
) -> StructureCollection:
collection = collection.with_datasets(list(spec.keys()))
selections = {ds_name: cols for ds_name, cols in spec.items() if cols is not None}
collection = collection.select(**selections)
return collection
Expand All @@ -110,7 +95,7 @@ def __prepare_collection(
def __verify(
function: Callable,
spec: dict[str, Optional[list[str]]],
collection: "StructureCollection",
collection: StructureCollection,
kwarg_keys: Iterable[str],
):
datasets_in_collection = set(collection.keys())
Expand All @@ -134,7 +119,21 @@ def __verify(
continue
dataset = collection[ds_name]
if not isinstance(dataset, ds.Dataset):
raise NotImplementedError
if not isinstance(columns_in_spec, dict):
raise ValueError(
"When passing columns to a nested structure collection, the argument should be a dictionary"
)
for key, value in columns_in_spec.items():
if key not in dataset.keys():
raise ValueError(
f"No dataset {key} found in structure collection"
)
elif set(value).difference(dataset[key].columns):
raise ValueError(
"Missing some requested columns in this dataset!"
)
continue

columns_to_check = set(columns_in_spec)
columns_in_dataset = set(dataset.columns)
if not columns_to_check.issubset(columns_in_dataset):
Expand Down
65 changes: 47 additions & 18 deletions src/opencosmo/collection/structure/structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,6 @@ def open(
) -> StructureCollection:
return sio.build_structure_collection(targets, ignore_empty)

@classmethod
def read(cls, *args, **kwargs) -> StructureCollection:
raise NotImplementedError

@property
def header(self):
return self.__header
Expand Down Expand Up @@ -251,7 +247,8 @@ def evaluate(
You can substantially improve the performance of this method by specifying
which data is actually needed to do the computation. This method will
automatically select the requested data, avoiding reading unneeded data
from disk.
from disk. The semantics for specifying the columns is identical to
:py:meth:`select <opencosmo.StructureCollection.select>`.

The function passed to this method must take arguments that match the names
of datasets that are stored in this collection. You can specify specific
Expand Down Expand Up @@ -287,7 +284,12 @@ def computation(halo_properties, dm_particles):
way will not respond to changes in unit convention.

It is not required to pass a list of column names for a given dataset. If a list
is not provided, all columns will be passed to the computation function.
is not provided, all columns will be passed to the computation function. Data will
be passed into the function as numpy arrays or astropy tables, depending on the
value of the "format" argument. However, if the evaluation involves a nested
structure collection (e.g. a galaxy collection inside a structure collection)
in addition to other datasets, the nested collection will be passed to your
function as a StructureCollection.

For more details and advanced usage see :ref:`Evaluating on Structure Collections`

Expand Down Expand Up @@ -429,7 +431,9 @@ def filter(self, *masks, on_galaxies: bool = False) -> StructureCollection:
filtered, self.__header, self.__datasets, self.__links, self.__hide_source
)

def select(self, **column_selections: str | Iterable[str]) -> StructureCollection:
def select(
self, **column_selections: str | Iterable[str] | dict
) -> StructureCollection:
"""
Update a dataset in the collection collection to only include the
columns specified. The name of the arguments to this function should be
Expand All @@ -446,12 +450,27 @@ def select(self, **column_selections: str | Iterable[str]) -> StructureCollectio
remove entire datasets from the collection with
:py:meth:`with_datasets <opencosmo.StructureCollection.with_datasets>`

For nested structure collections, such as galaxies within halos, you can pass
a nested dictionary:

.. code-block:: python

collection = oc.open("haloproperties.hdf5", "haloparticles.hdf5", "galaxyproperties.hdf5", "galaxyparticles.hdf5")

collection = collection.select(
halo_properties = ["fof_halo_mass", "sod_halo_mass", "sod_halo_cdelta"],
dm_particles = ["x", "y", "z"],
galaxies = {
"galaxy_properties": ["gal_mass_bar", "gal_mass_star"],
"star_particles": ["x", "y", "z"]
}
)


Parameters
----------
**column_selections : str | Iterable[str]
The columns to select from a given dataset
**column_selections : str | Iterable[str] | dict[str, Iterable[str]]
The columns to select from a given dataset or sub-collection

dataset : str
The dataset to select from.
Expand All @@ -478,13 +497,18 @@ def select(self, **column_selections: str | Iterable[str]) -> StructureCollectio
elif dataset not in self.__datasets:
raise ValueError(f"Dataset {dataset} not found in collection.")

output_ds = self.__datasets[dataset]
new_ds = self.__datasets[dataset]

if not isinstance(output_ds, oc.Dataset):
raise NotImplementedError
if not isinstance(new_ds, oc.Dataset):
if not isinstance(columns, dict):
raise ValueError(
"When working with nested structure collections, the argument should be a dictionary!"
)
new_ds = new_ds.select(**columns)
else:
new_ds = new_ds.select(columns)

new_dataset = output_ds.select(columns)
new_datasets[dataset] = new_dataset
new_datasets[dataset] = new_ds

return StructureCollection(
new_source,
Expand All @@ -497,10 +521,10 @@ def select(self, **column_selections: str | Iterable[str]) -> StructureCollectio
def drop(self, **columns_to_drop):
"""
Update the linked collection by dropping the specified columns
in the specified datasets. Follows the exact same semantics as
in the specified datasets. This method follows the exact same semantics as
:py:meth:`StructureCollection.select <opencosmo.StructureCollection.select>`.
Argument names should be datasets in this collection, and the argument
values should be a string or list of strings.
values should be a string, list of strings, or dictionary.

Datasets that are not included will not be modified. You can drop
entire datasets with :py:meth:`with_datasets <opencosmo.StructureCollection.with_datasets>`
Expand Down Expand Up @@ -535,7 +559,12 @@ def drop(self, **columns_to_drop):

elif dataset_name not in self.__datasets:
raise ValueError(f"Dataset {dataset_name} not found in collection.")
new_ds = self.__datasets[dataset_name].drop(columns)
new_ds = self.__datasets[dataset_name]
if isinstance(new_ds, oc.Dataset):
new_ds = new_ds.drop(columns)
elif isinstance(new_ds, StructureCollection):
new_ds = new_ds.drop(**columns)

new_datasets[dataset_name] = new_ds

return StructureCollection(
Expand Down Expand Up @@ -878,7 +907,7 @@ def with_datasets(self, datasets: list[str]):
"""

if not isinstance(datasets, list):
raise ValueError("Expected a list with at least one entries")
raise ValueError("Expected a list with at least one entry")

known_datasets = set(self.keys())
requested_datasets = set(datasets)
Expand Down
2 changes: 1 addition & 1 deletion src/opencosmo/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -621,7 +621,7 @@ def sort_by(self, column: str, invert: bool = False) -> Dataset:


"""
new_state = self.__state.sort_by(column, self.__handler, invert)
new_state = self.__state.sort_by(column, invert)
return Dataset(
self.__handler,
self.__header,
Expand Down
4 changes: 2 additions & 2 deletions src/opencosmo/dataset/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def __prepare(function: Callable, dataset: "Dataset", evaluator_kwargs: Iterable
function_arguments = set(signature(function).parameters.keys())

input_columns = function_arguments.intersection(dataset.columns)
if len(input_columns) == 0 and dataset.dtype in function_arguments:
if len(input_columns) == 0 and len(function_arguments) == 1:
return dataset
return dataset.select(input_columns)

Expand All @@ -122,7 +122,7 @@ def __verify(function: Callable, dataset: "Dataset", kwarg_names: Iterable[str])
missing = required_parameters - dataset_columns - kwarg_names
if not missing:
return
elif len(missing) > 1 or next(iter(missing)) != dataset.dtype:
elif len(missing) > 1:
raise ValueError(
f"All inputs to the function must either be column names or passed as keyword arguments! Found unknown input(s) {','.join(missing)}"
)
2 changes: 1 addition & 1 deletion src/opencosmo/dataset/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def select(self, columns: str | Iterable[str]):
new_derived,
)

def sort_by(self, column_name: str, handler: "DatasetHandler", invert: bool):
def sort_by(self, column_name: str, invert: bool):
if column_name not in self.columns:
raise ValueError(f"This dataset has no column {column_name}")
return DatasetState(
Expand Down
Loading
Loading