Merge pull request #12 from bioscan-ml/enh_index2label

scottclowe · web-flow · commit 4c2fc4a35d69 · 2025-03-20T11:54:00.000-04:00
ENH: Add index2label and label2index methods
diff --git a/README.rst b/README.rst
@@ -200,6 +200,9 @@ If this is set to ``target_format="text"``, the output will instead be the raw l
 The default setting is ``target_format="index"``.
 Note that if multiple target types are given, each label will be returned in the same format.
 
+To map target indices back to text labels, the dataset class provides the ``index2label`` method.
+Similarly, the ``label2index`` method can be used to map text labels to indices.
+
 
 Data transforms
 ~~~~~~~~~~~~~~~
diff --git a/bioscan_dataset/bioscan1m.py b/bioscan_dataset/bioscan1m.py
@@ -10,8 +10,10 @@
 
 import os
 from enum import Enum
-from typing import Any, Tuple
+from typing import Any, Iterable, Tuple, Union
 
+import numpy as np
+import numpy.typing as npt
 import pandas
 import PIL
 import torch
@@ -339,11 +341,74 @@ def __init__(
 
         self._load_metadata()
 
+    def index2label(self, column: str, index: Union[int, Iterable[int]]) -> Union[str, npt.NDArray[np.str_]]:
+        r"""
+        Convert target's integer index to text label.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        column : str
+            The dataset column name to map. This is the same as the ``target_type``.
+        index : int or Iterable[int]
+            The integer index or indices to map to labels.
+
+        Returns
+        -------
+        str or numpy.ndarray[str]
+            The text label or labels corresponding to the integer index or indices
+            in the specified column.
+            Entries containing missing values, indicated by negative indices, are mapped
+            to an empty string.
+        """
+        if not hasattr(index, "__len__"):
+            # Single index
+            if index < 0:
+                return ""
+            return self.metadata[column].cat.categories[index]
+        index = np.asarray(index)
+        out = self.metadata[column].cat.categories[index]
+        out = np.asarray(out)
+        out[index < 0] = ""
+        return out
+
+    def label2index(self, column: str, label: Union[str, Iterable[str]]) -> Union[int, npt.NDArray[np.int_]]:
+        r"""
+        Convert target's text label to integer index.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        column : str
+            The dataset column name to map. This is the same as the ``target_type``.
+        label : str or Iterable[str]
+            The text label or labels to map to integer indices.
+
+        Returns
+        -------
+        int or numpy.ndarray[int]
+            The integer index or indices corresponding to the text label or labels
+            in the specified column.
+            Entries containing missing values, indicated by empty strings, are mapped
+            to ``-1``.
+        """
+        if isinstance(label, str):
+            # Single label
+            if label == "":
+                return -1
+            return self.metadata[column].cat.categories.get_loc(label)
+        labels = label
+        out = [-1 if lab == "" else self.metadata[column].cat.categories.get_loc(lab) for lab in labels]
+        out = np.asarray(out)
+        return out
+
     def __len__(self):
         return len(self.metadata)
 
     def __getitem__(self, index: int) -> Tuple[Any, ...]:
-        """
+        r"""
         Get a sample from the dataset.
 
         Parameters
diff --git a/bioscan_dataset/bioscan5m.py b/bioscan_dataset/bioscan5m.py
@@ -10,8 +10,10 @@
 
 import os
 from enum import Enum
-from typing import Any, Tuple
+from typing import Any, Iterable, Tuple, Union
 
+import numpy as np
+import numpy.typing as npt
 import pandas
 import PIL
 import torch
@@ -406,11 +408,88 @@ def __init__(
 
         self._load_metadata()
 
+    def index2label(self, column: str, index: Union[int, Iterable[int]]) -> Union[str, npt.NDArray[np.str_]]:
+        r"""
+        Convert target's integer index to text label.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        column : str
+            The dataset column name to map. This is the same as the ``target_type``.
+        index : int or Iterable[int]
+            The integer index or indices to map to labels.
+
+        Returns
+        -------
+        str or numpy.ndarray[str]
+            The text label or labels corresponding to the integer index or indices
+            in the specified column.
+            Entries containing missing values, indicated by negative indices, are mapped
+            to an empty string.
+
+        Examples
+        --------
+        >>> dataset.index2label("order", 4)
+        'Diptera'
+        >>> dataset.index2label("order", [4, 9, -1, 4])
+        array(['Diptera', 'Lepidoptera', '', 'Diptera'], dtype=object)
+        """
+        if not hasattr(index, "__len__"):
+            # Single index
+            if index < 0:
+                return ""
+            return self.metadata[column].cat.categories[index]
+        index = np.asarray(index)
+        out = self.metadata[column].cat.categories[index]
+        out = np.asarray(out)
+        out[index < 0] = ""
+        return out
+
+    def label2index(self, column: str, label: Union[str, Iterable[str]]) -> Union[int, npt.NDArray[np.int_]]:
+        r"""
+        Convert target's text label to integer index.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        column : str
+            The dataset column name to map. This is the same as the ``target_type``.
+        label : str or Iterable[str]
+            The text label or labels to map to integer indices.
+
+        Returns
+        -------
+        int or numpy.ndarray[int]
+            The integer index or indices corresponding to the text label or labels
+            in the specified column.
+            Entries containing missing values, indicated by empty strings, are mapped
+            to ``-1``.
+
+        Examples
+        --------
+        >>> dataset.label2index("order", "Diptera")
+        4
+        >>> dataset.label2index("order", ["Diptera", "Lepidoptera", "", "Diptera"])
+        array([4, 9, -1, 4])
+        """
+        if isinstance(label, str):
+            # Single label
+            if label == "":
+                return -1
+            return self.metadata[column].cat.categories.get_loc(label)
+        labels = label
+        out = [-1 if lab == "" else self.metadata[column].cat.categories.get_loc(lab) for lab in labels]
+        out = np.asarray(out)
+        return out
+
     def __len__(self):
         return len(self.metadata)
 
     def __getitem__(self, index: int) -> Tuple[Any, ...]:
-        """
+        r"""
         Get a sample from the dataset.
 
         Parameters
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -68,6 +68,8 @@ def auto_convert_readme(_):
         # Change hard API URLs to dynamically generated class links
         readme_rst = readme_rst.replace("`BIOSCAN1M <BS1M-class_>`_", ":class:`~.bioscan_dataset.BIOSCAN1M`")
         readme_rst = readme_rst.replace("`BIOSCAN5M <BS5M-class_>`_", ":class:`~.bioscan_dataset.BIOSCAN5M`")
+        readme_rst = readme_rst.replace("``index2label``", ":meth:`~.bioscan_dataset.BIOSCAN5M.index2label`")
+        readme_rst = readme_rst.replace("``label2index``", ":meth:`~.bioscan_dataset.BIOSCAN5M.label2index`")
         print(f"Writing {readme_path_output}")
         with open(readme_path_output, "w") as f:
             f.write(readme_rst)
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,4 @@
+numpy>=1.21.0
 pandas>=1.0.0
 Pillow>=4.1.1
 torch>=1.4.0

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+numpy>=1.21.0`
`1`	`2`	`pandas>=1.0.0`
`2`	`3`	`Pillow>=4.1.1`
`3`	`4`	`torch>=1.4.0`