Skip to content

Commit b5b0de4

Browse files
authored
Merge pull request #13 from bioscan-ml/enh_arbitrary-modalities
ENH: Add support for arbitrary modalities
2 parents 4c2fc4a + cc7ea29 commit b5b0de4

File tree

3 files changed

+53
-11
lines changed

3 files changed

+53
-11
lines changed

README.rst

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,27 @@ or ``"dna"``:
157157
158158
dataset = BIOSCAN5M(root="~/Datasets/bioscan/", modality="dna")
159159
160+
Additionally, any column name from the metadata can be used as an input modality.
161+
For example, to load the latitude and longitude coordinates as inputs:
162+
163+
.. code-block:: python
164+
165+
dataset = BIOSCAN5M(root="~/Datasets/bioscan/", modality=("coord-lat", "coord-lon"))
166+
167+
or to load the size of the insect (in pixels) in addition to the DNA barcode:
168+
169+
.. code-block:: python
170+
171+
dataset = BIOSCAN5M(
172+
root="~/Datasets/bioscan/", modality=("dna", "image_measurement_value")
173+
)
174+
175+
Multiple modalities can be selected by passing a list or tuple of modality names (``"image"``, ``"dna"``, or metadata column names).
176+
Each item in the dataset will have the inputs in the same order as specified in the ``modality`` argument.
177+
178+
All samples have an image and a DNA barcode, but other fields may be incomplete.
179+
Any missing values will be replaced with NaN.
180+
160181

161182
Target selection
162183
~~~~~~~~~~~~~~~~
@@ -239,14 +260,6 @@ The dataset class supports the use of data transforms for the image and DNA barc
239260
)
240261
241262
242-
Size and geolocation metadata
243-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
244-
245-
The BIOSCAN-5M dataset also contains insect size and geolocation metadata.
246-
Loading this metadata is not yet supported by the `BIOSCAN5M <BS5M-class_>`_ pytorch dataset class.
247-
In the meantime, users of the dataset are welcome to explore this metadata themselves.
248-
249-
250263
Other resources
251264
---------------
252265

bioscan_dataset/bioscan1m.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,10 @@ class BIOSCAN1M(VisionDataset):
237237
238238
modality : str or Iterable[str], default=("image", "dna")
239239
Which data modalities to use. One of, or a list of:
240-
``"image"``, ``"dna"``.
240+
``"image"``, ``"dna"``, or any column name in the metadata TSV file.
241+
242+
.. versionchanged:: 1.1.0
243+
Added support for arbitrary modalities.
241244
242245
reduce_repeated_barcodes : bool, default=False
243246
Whether to reduce the dataset to only one sample per barcode.
@@ -426,6 +429,13 @@ def __getitem__(self, index: int) -> Tuple[Any, ...]:
426429
The DNA barcode, if the ``"dna"`` modality is requested, optionally
427430
transformed by the ``dna_transform`` pipeline.
428431
432+
*modalities : Any
433+
Any other modalities requested, as specified in the ``modality`` parameter.
434+
The data is extracted from the appropriate column in the metadata TSV file,
435+
without any transformations.
436+
437+
.. versionadded:: 1.1.0
438+
429439
target : int or Tuple[int, ...] or str or Tuple[str, ...] or None
430440
The target(s), optionally transformed by the ``target_transform`` pipeline.
431441
If ``target_format="index"``, the target(s) will be returned as integer
@@ -446,6 +456,8 @@ def __getitem__(self, index: int) -> Tuple[Any, ...]:
446456
X = sample["nucraw"]
447457
if self.dna_transform is not None:
448458
X = self.dna_transform(X)
459+
elif modality in self.metadata.columns:
460+
X = sample[modality]
449461
else:
450462
raise ValueError(f"Unfamiliar modality: {modality}")
451463
values.append(X)
@@ -499,7 +511,7 @@ def _check_exists(self, verbose=0) -> bool:
499511

500512
def _load_metadata(self) -> pandas.DataFrame:
501513
r"""
502-
Load metadata from CSV file and prepare it for training.
514+
Load metadata from TSV file and prepare it for training.
503515
"""
504516
self.metadata = load_metadata(
505517
self.metadata_path,

bioscan_dataset/bioscan5m.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@
6262
"species",
6363
"dna_bin",
6464
"dna_barcode",
65+
"country",
66+
"province_state",
67+
"coord-lat",
68+
"coord-lon",
69+
"image_measurement_value",
6570
"split",
6671
]
6772

@@ -225,7 +230,10 @@ class BIOSCAN5M(VisionDataset):
225230
226231
modality : str or Iterable[str], default=("image", "dna")
227232
Which data modalities to use. One of, or a list of:
228-
``"image"``, ``"dna"``.
233+
``"image"``, ``"dna"``, or any column name in the metadata CSV file.
234+
235+
.. versionchanged:: 1.1.0
236+
Added support for arbitrary modalities.
229237
230238
image_package : str, default="cropped_256"
231239
The package to load images from. One of:
@@ -507,6 +515,13 @@ def __getitem__(self, index: int) -> Tuple[Any, ...]:
507515
The DNA barcode, if the ``"dna"`` modality is requested, optionally
508516
transformed by the ``dna_transform`` pipeline.
509517
518+
*modalities : Any
519+
Any other modalities requested, as specified in the ``modality`` parameter.
520+
The data is extracted from the appropriate column in the metadata CSV file,
521+
without any transformations.
522+
523+
.. versionadded:: 1.1.0
524+
510525
target : int or Tuple[int, ...] or str or Tuple[str, ...] or None
511526
The target(s), optionally transformed by the ``target_transform`` pipeline.
512527
If ``target_format="index"``, the target(s) will be returned as integer
@@ -527,6 +542,8 @@ def __getitem__(self, index: int) -> Tuple[Any, ...]:
527542
X = sample["dna_barcode"]
528543
if self.dna_transform is not None:
529544
X = self.dna_transform(X)
545+
elif modality in self.metadata.columns:
546+
X = sample[modality]
530547
else:
531548
raise ValueError(f"Unfamiliar modality: {modality}")
532549
values.append(X)

0 commit comments

Comments
 (0)