Skip to content

Commit 4c32820

Browse files
rly and mavaylon1 authored
Write dimension labels to DatasetBuilder on build (#1081)
Co-authored-by: Matthew Avaylon <mavaylon1@berkeley.edu>
1 parent 77be0cc commit 4c32820

File tree

5 files changed

+404
-11
lines changed

5 files changed

+404
-11
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# HDMF Changelog
22

3+
## HDMF 3.14.3 (Upcoming)
4+
5+
### Enhancements
6+
- Added new attribute "dimension_labels" on `DatasetBuilder` which specifies the names of the dimensions used in the
7+
dataset based on the shape of the dataset data and the dimension names in the spec for the data type. This attribute
8+
is available on build (during the write process), but not on read of a dataset from a file. @rly [#1081](https://github.com/hdmf-dev/hdmf/pull/1081)
9+
310
## HDMF 3.14.2 (July 7, 2024)
411

512
### Enhancements

src/hdmf/build/builders.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,18 +330,25 @@ class DatasetBuilder(BaseBuilder):
330330
'doc': 'The datatype of this dataset.', 'default': None},
331331
{'name': 'attributes', 'type': dict,
332332
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
333+
{'name': 'dimension_labels', 'type': tuple,
334+
'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
335+
'supplied only on build.'),
336+
'default': None},
333337
{'name': 'maxshape', 'type': (int, tuple),
334338
'doc': 'The shape of this dataset. Use None for scalars.', 'default': None},
335339
{'name': 'chunks', 'type': bool, 'doc': 'Whether or not to chunk this dataset.', 'default': False},
336340
{'name': 'parent', 'type': GroupBuilder, 'doc': 'The parent builder of this builder.', 'default': None},
337341
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
338342
def __init__(self, **kwargs):
339343
""" Create a Builder object for a dataset """
340-
name, data, dtype, attributes, maxshape, chunks, parent, source = getargs(
341-
'name', 'data', 'dtype', 'attributes', 'maxshape', 'chunks', 'parent', 'source', kwargs)
344+
name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
345+
'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
346+
kwargs
347+
)
342348
super().__init__(name, attributes, parent, source)
343349
self['data'] = data
344350
self['attributes'] = _copy.copy(attributes)
351+
self.__dimension_labels = dimension_labels
345352
self.__chunks = chunks
346353
self.__maxshape = maxshape
347354
if isinstance(data, BaseBuilder):
@@ -361,6 +368,11 @@ def data(self, val):
361368
raise AttributeError("Cannot overwrite data.")
362369
self['data'] = val
363370

371+
@property
def dimension_labels(self):
    """The tuple of spec-defined names for the dimensions of this dataset, or None if none was given."""
    labels = self.__dimension_labels
    return labels
375+
364376
@property
365377
def chunks(self):
366378
"""Whether or not this dataset is chunked."""

src/hdmf/build/objectmapper.py

Lines changed: 91 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,15 @@
1010
from .errors import (BuildError, OrphanContainerBuildError, ReferenceTargetNotBuiltError, ContainerConfigurationError,
1111
ConstructError)
1212
from .manager import Proxy, BuildManager
13-
from .warnings import MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning
13+
from .warnings import (MissingRequiredBuildWarning, DtypeConversionWarning, IncorrectQuantityBuildWarning,
14+
IncorrectDatasetShapeBuildWarning)
1415
from ..container import AbstractContainer, Data, DataRegion
1516
from ..term_set import TermSetWrapper
1617
from ..data_utils import DataIO, AbstractDataChunkIterator
1718
from ..query import ReferenceResolver
1819
from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec
1920
from ..spec.spec import BaseStorageSpec
20-
from ..utils import docval, getargs, ExtenderMeta, get_docval
21+
from ..utils import docval, getargs, ExtenderMeta, get_docval, get_data_shape
2122

2223
_const_arg = '__constructor_arg'
2324

@@ -721,19 +722,34 @@ def build(self, **kwargs):
721722
if not isinstance(container, Data):
722723
msg = "'container' must be of type Data with DatasetSpec"
723724
raise ValueError(msg)
724-
spec_dtype, spec_shape, spec = self.__check_dset_spec(self.spec, spec_ext)
725+
spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
726+
dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
725727
if isinstance(spec_dtype, RefSpec):
726728
self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
727729
% (container.__class__.__name__, container.name, repr(source)))
728730
# create dataset builder with data=None as a placeholder. fill in with refs later
729-
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype.reftype)
731+
builder = DatasetBuilder(
732+
name,
733+
data=None,
734+
parent=parent,
735+
source=source,
736+
dtype=spec_dtype.reftype,
737+
dimension_labels=dimension_labels,
738+
)
730739
manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
731740
elif isinstance(spec_dtype, list):
732741
# a compound dataset
733742
self.logger.debug("Building %s '%s' as a dataset of compound dtypes (source: %s)"
734743
% (container.__class__.__name__, container.name, repr(source)))
735744
# create dataset builder with data=None, dtype=None as a placeholder. fill in with refs later
736-
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype=spec_dtype)
745+
builder = DatasetBuilder(
746+
name,
747+
data=None,
748+
parent=parent,
749+
source=source,
750+
dtype=spec_dtype,
751+
dimension_labels=dimension_labels,
752+
)
737753
manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
738754
manager))
739755
else:
@@ -744,7 +760,14 @@ def build(self, **kwargs):
744760
% (container.__class__.__name__, container.name, repr(source)))
745761
# an unspecified dtype and we were given references
746762
# create dataset builder with data=None as a placeholder. fill in with refs later
747-
builder = DatasetBuilder(name, data=None, parent=parent, source=source, dtype='object')
763+
builder = DatasetBuilder(
764+
name,
765+
data=None,
766+
parent=parent,
767+
source=source,
768+
dtype="object",
769+
dimension_labels=dimension_labels,
770+
)
748771
manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
749772
else:
750773
# a dataset that has no references, pass the conversion off to the convert_dtype method
@@ -760,7 +783,14 @@ def build(self, **kwargs):
760783
except Exception as ex:
761784
msg = 'could not resolve dtype for %s \'%s\'' % (type(container).__name__, container.name)
762785
raise Exception(msg) from ex
763-
builder = DatasetBuilder(name, bldr_data, parent=parent, source=source, dtype=dtype)
786+
builder = DatasetBuilder(
787+
name,
788+
data=bldr_data,
789+
parent=parent,
790+
source=source,
791+
dtype=dtype,
792+
dimension_labels=dimension_labels,
793+
)
764794

765795
# Add attributes from the specification extension to the list of attributes
766796
all_attrs = self.__spec.attributes + getattr(spec_ext, 'attributes', tuple())
@@ -779,14 +809,67 @@ def __check_dset_spec(self, orig, ext):
779809
"""
780810
dtype = orig.dtype
781811
shape = orig.shape
812+
dims = orig.dims
782813
spec = orig
783814
if ext is not None:
784815
if ext.dtype is not None:
785816
dtype = ext.dtype
786817
if ext.shape is not None:
787818
shape = ext.shape
819+
dims = ext.dims
788820
spec = ext
789-
return dtype, shape, spec
821+
return dtype, shape, dims, spec
822+
823+
def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple:
    """Select the dimension labels from the spec that apply to the given data.

    The shape of the data is compared against the shape(s) allowed by the spec. If the spec allows
    several shapes, the matching shape with the fewest unconstrained (None) dimensions is used; ties are
    resolved in favor of the shape listed first in the spec.

    :param data: the dataset data whose shape is compared against the spec
    :param spec_shape: the shape from the spec -- either a single sequence of dimension lengths or a
                       sequence of allowed shapes. None in place of a length means any length is allowed.
    :param spec_dims: the dimension names from the spec, structured in parallel with ``spec_shape``
    :return: a tuple of dimension labels, or None if the spec has no shape or dims, if the shape of the
             data cannot be determined, or if the data does not match the spec (in the last case an
             IncorrectDatasetShapeBuildWarning is issued)
    """
    if spec_shape is None or spec_dims is None:
        return None

    data_shape = get_data_shape(data)
    if data_shape is None:
        # NOTE(review): get_data_shape is understood to return None when it cannot determine a shape
        # (e.g., scalar data) -- confirm against hdmf.utils. Without this guard, len(data_shape) below
        # would raise a TypeError instead of simply leaving the dataset unlabeled.
        return None

    def shape_matches(allowed_shape):
        """Return True if data_shape has the same rank as allowed_shape and fits each fixed length."""
        if len(allowed_shape) != len(data_shape):
            return False
        # None in the spec means any length is allowed for that dimension
        return all(allowed is None or allowed == actual
                   for allowed, actual in zip(allowed_shape, data_shape))

    # an empty spec_shape is handled as a single (zero-dimensional) shape rather than indexing into it.
    # entries of a single shape are ints or None, so a list/tuple entry means a list of allowed shapes
    if len(spec_shape) > 0 and isinstance(spec_shape[0], (list, tuple)):
        match_shape_inds = [i for i, allowed_shape in enumerate(spec_shape) if shape_matches(allowed_shape)]
        if not match_shape_inds:
            msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
            warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
            return None
        # use the most specific match -- the one with the fewest Nones. min() returns the first minimal
        # element, so ties go to the shape that appears first in the spec
        best_match_ind = min(match_shape_inds, key=lambda k: sum(x is None for x in spec_shape[k]))
        return tuple(spec_dims[best_match_ind])

    # shape is a single list of allowed dimension lengths
    if not shape_matches(spec_shape):
        msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
        warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
        return None
    return tuple(spec_dims)
790873

791874
def __is_reftype(self, data):
792875
if (isinstance(data, AbstractDataChunkIterator) or

src/hdmf/build/warnings.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ class IncorrectQuantityBuildWarning(BuildWarning):
1515
pass
1616

1717

18+
class IncorrectDatasetShapeBuildWarning(BuildWarning):
    """Build warning issued when the shape of a dataset is not one that the spec allows."""
23+
24+
1825
class MissingRequiredBuildWarning(BuildWarning):
1926
"""
2027
Raised when a required field is missing.

0 commit comments

Comments
 (0)