Minor bug fixes

Old-Shatterhand · Old-Shatterhand · commit 813478bce22b · 2025-12-14T10:29:23.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -19,4 +19,10 @@ test.py
 tests/data/genomes/out/*
 tests/*.log
 tests/*.fasta
+
+# Exclude build artifacts
 dist/*
+*64/*
+noarch/*
+channeldata/*
+index.html
diff --git a/datasail/cluster/clustering.py b/datasail/cluster/clustering.py
@@ -19,7 +19,7 @@
 from datasail.cluster.wlk import run_wlk
 from datasail.reader.utils import DataSet
 from datasail.report import whatever
-from datasail.settings import LOGGER, KW_THREADS, KW_LOGDIR, KW_OUTDIR, WLK, MMSEQS, MMSEQS2, MMSEQSPP, \
+from datasail.settings import DIST_OPTIONS, FP_OPTIONS, LOGGER, KW_THREADS, KW_LOGDIR, KW_OUTDIR, SIM_OPTIONS, WLK, MMSEQS, MMSEQS2, MMSEQSPP, \
     FOLDSEEK, CDHIT, CDHIT_EST, ECFP, DIAMOND,TANIMOTO, KW_LINKAGE
 
 
@@ -112,8 +112,11 @@ def similarity_clustering(dataset: DataSet, threads: int = 1, log_dir: Optional[
         run_mmseqs(dataset, threads, log_dir)
     elif dataset.similarity.lower() == MMSEQSPP:
         run_mmseqspp(dataset, threads, log_dir)
-    elif dataset.similarity.lower() == TANIMOTO:
-        run_vector(dataset)
+    elif dataset.similarity.lower() in SIM_OPTIONS:
+        if isinstance(dataset.data[dataset.names[0]], str):
+            run_ecfp(dataset, method=dataset.similarity.lower())
+        else:
+            run_vector(dataset, dataset.similarity.lower())
     else:
         raise ValueError(f"Unknown cluster method: {dataset.similarity}")
 
@@ -139,6 +142,11 @@ def distance_clustering(dataset: DataSet, threads: int = 1, log_dir: Optional[st
     """
     if dataset.distance.lower() == "mash":
         run_mash(dataset, threads, log_dir)
+    elif dataset.distance.lower() in DIST_OPTIONS:
+        if isinstance(dataset.data[dataset.names[0]], str):
+            run_ecfp(dataset, method=dataset.distance.lower())
+        else:
+            run_vector(dataset, dataset.distance.lower())
     else:
         raise ValueError(f"Unknown cluster method: {dataset.distance}")
 
diff --git a/datasail/cluster/ecfp.py b/datasail/cluster/ecfp.py
@@ -6,7 +6,7 @@
 from datasail.settings import LOGGER
 
 
-def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
+def run_ecfp(dataset: DataSet, method: str = "tanimoto") -> None:
     """
     Compute 1024Bit-ECPFs for every molecule in the dataset and then compute pairwise Tanimoto-Scores of them.
 
diff --git a/datasail/cluster/vectors.py b/datasail/cluster/vectors.py
@@ -6,23 +6,10 @@
 from rdkit import DataStructs
 
 from datasail.reader.utils import DataSet
-from datasail.settings import LOGGER
+from datasail.settings import DIST_OPTIONS, LOGGER, SIM_OPTIONS
 
-SIM_OPTIONS = Literal[
-    "allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "onbit", "rogotgoldberg",
-    "russel", "sokal", "tanimoto"
-]
 
-# unbounded: chebyshev, cityblock, euclidean, mahalanobis, manhattan, mcconnaughey, minkowski, sqeuclidean
-# produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
-# boolean only: dice, kulczynski1, russelrao, sokalsneath
-# matching == hamming, manhattan == cityblock (inofficial)
-DIST_OPTIONS = Literal[
-    "canberra", "hamming", "jaccard", "matching", "rogerstanimoto", "sokalmichener", "yule"
-]
-
-
-def get_rdkit_fct(method: SIM_OPTIONS) -> Callable[[Any, Any], np.ndarray]:
+def get_rdkit_fct(method: str) -> Callable[[Any, Any], np.ndarray]:
     """
     Get the RDKit function for the given similarity measure.
 
@@ -57,7 +44,7 @@ def get_rdkit_fct(method: SIM_OPTIONS) -> Callable[[Any, Any], np.ndarray]:
     raise ValueError(f"Unknown method {method}")
 
 
-def rdkit_sim(fps, method: SIM_OPTIONS) -> np.ndarray:
+def rdkit_sim(fps, method: str) -> np.ndarray:
     """
     Compute the similarity between elements of a list of rdkit vectors.
 
@@ -108,7 +95,7 @@ def iterable2bitvect(it) -> DataStructs.ExplicitBitVect:
     return output
 
 
-def run_vector(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
+def run_vector(dataset: DataSet, method: str = "tanimoto") -> None:
     """
     Compute pairwise Tanimoto-Scores of the given dataset.
 
@@ -120,7 +107,7 @@ def run_vector(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
     method = method.lower()
 
     embed = dataset.data[dataset.names[0]]
-    if method in get_args(SIM_OPTIONS):
+    if method in SIM_OPTIONS:
         if isinstance(embed, (list, tuple, np.ndarray)):
             if isinstance(embed[0], int) or np.issubdtype(embed[0].dtype, int):
                 if method in ["allbit", "asymmetric", "braunblanquet", "cosine", "kulczynski", "onbit",
@@ -137,7 +124,7 @@ def run_vector(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
             raise ValueError(
                 f"Unsupported embedding type {type(embed)}. Please use either RDKit datastructures, lists, "
                 f"tuples or one-dimensional numpy arrays.")
-    elif method in get_args(DIST_OPTIONS):
+    elif method in DIST_OPTIONS:
         dtype = np.bool_ if ["jaccard", "rogerstanimoto", "sokalmichener", "yule"] else np.float64
         if isinstance(embed, (
                 list, tuple, DataStructs.ExplicitBitVect, DataStructs.LongSparseIntVect, DataStructs.IntSparseIntVect)):
@@ -159,7 +146,7 @@ def run(
         dataset: DataSet,
         fps: Union[np.ndarray, DataStructs.ExplicitBitVect, DataStructs.LongSparseIntVect,
         DataStructs.IntSparseIntVect],
-        method: Union[SIM_OPTIONS, DIST_OPTIONS],
+        method: str,
 ) -> None:
     """
     Compute pairwise similarities of the given fingerprints.
@@ -169,11 +156,11 @@ def run(
         fps: The fingerprints to compute pairwise similarities for.
         method: The similarity measure to use.
     """
-    if method in get_args(SIM_OPTIONS):
+    if method in SIM_OPTIONS:
         dataset.cluster_similarity = rdkit_sim(fps, method)
         if method == "mcconnaughey":
             dataset.cluster_similarity = dataset.cluster_similarity + 1 / 2
-    elif method in get_args(DIST_OPTIONS):
+    elif method in DIST_OPTIONS:
         if method == "mahalanobis" and len(fps) <= len(fps[0]):
             raise ValueError(
                 f"For clustering with the Mahalanobis method, you have to have more observations that dimensions in "
@@ -185,6 +172,8 @@ def run(
             dataset.cluster_distance = dataset.cluster_distance / len(fps[0])
         elif method == "yule":
             dataset.cluster_distance /= 2
+    else:
+        raise ValueError(f"Unknown method to compare fingerprints. Found: {method}")
 
 
 if __name__ == '__main__':
diff --git a/datasail/eval.py b/datasail/eval.py
@@ -71,7 +71,15 @@ def eval_single_split(datatype, data: Optional[Union[dict[str, Any], str, Path]]
     metric *= weight_matrix
 
     metric_total = np.sum(metric)
-    leakage = np.sum(in_split_mask * metric)
+    # leakage = np.sum(in_split_mask * metric)
+    if mode == "sim":
+        leakage = metric_total - np.sum(in_split_mask * metric)
+        # return leakage / metric_total, leakage, metric_total
     if mode == "dist":
-        leakage = metric_total - leakage
-    return 1 - (leakage / metric_total), leakage, metric_total
+        leakage = np.sum(in_split_mask * metric)
+    return leakage / metric_total, leakage, metric_total
+
+    
+    # if mode == "dist":
+    #     leakage = metric_total - leakage
+    # return 1 - (leakage / metric_total), metric_total - leakage, metric_total
diff --git a/datasail/reader/utils.py b/datasail/reader/utils.py
@@ -12,7 +12,7 @@
 from rdkit import Chem
 
 from datasail.reader.validate import validate_user_args
-from datasail.settings import get_default, SIM_ALGOS, DIST_ALGOS, UNK_LOCATION, format2ending, FASTA_FORMATS
+from datasail.settings import DIST_OPTIONS, SIM_OPTIONS, get_default, SIM_ALGOS, DIST_ALGOS, UNK_LOCATION, format2ending, FASTA_FORMATS
 
 DATA_INPUT = Optional[Union[str, Path, Dict[str, Union[str, np.ndarray]],
     Callable[..., Dict[str, Union[str, np.ndarray]]], Generator[Tuple[str, Union[str, np.ndarray]], None, None]]]
@@ -304,9 +304,9 @@ def read_data(
     if sim is None and dist is None:
         dataset.similarity, dataset.distance = get_default(dataset.type, dataset.format)
         dataset.names = list(dataset.data.keys())
-    elif sim is not None and not (isinstance(sim, str) and sim.lower() in SIM_ALGOS):
+    elif sim is not None and not (isinstance(sim, str) and sim.lower() in SIM_ALGOS + SIM_OPTIONS):
         dataset.names, dataset.similarity = read_matrix_input(sim)
-    elif dist is not None and not (isinstance(dist, str) and dist.lower() in DIST_ALGOS):
+    elif dist is not None and not (isinstance(dist, str) and dist.lower() in DIST_ALGOS + DIST_OPTIONS):
         dataset.names, dataset.distance = read_matrix_input(dist)
     else:
         if sim is not None:
diff --git a/datasail/sail.py b/datasail/sail.py
@@ -119,11 +119,11 @@ def validate_args(**kwargs) -> Dict[str, object]:
         error("The filepath to the weights of the E-data is invalid.", 8, kwargs[KW_CLI])
     if kwargs[KW_E_STRAT] is not None and isinstance(kwargs[KW_E_STRAT], Path) and not kwargs[KW_E_STRAT].is_file():
         error("The filepath to the stratification of the E-data is invalid.", 11, kwargs[KW_CLI])
-    if kwargs[KW_E_SIM] is not None and (isinstance(kwargs[KW_E_SIM], Path) or kwargs[KW_E_SIM].lower() not in SIM_ALGOS):
+    if kwargs[KW_E_SIM] is not None and (isinstance(kwargs[KW_E_SIM], Path) or kwargs[KW_E_SIM].lower() not in SIM_ALGOS + SIM_OPTIONS):
         if not kwargs[KW_E_SIM].is_file():
             error(f"The similarity metric for the E-data seems to be a file-input but the filepath is invalid.",
                   9, kwargs[KW_CLI])
-    if kwargs[KW_E_DIST] is not None and (isinstance(kwargs[KW_E_DIST], Path) or kwargs[KW_E_DIST].lower() not in DIST_ALGOS):
+    if kwargs[KW_E_DIST] is not None and (isinstance(kwargs[KW_E_DIST], Path) or kwargs[KW_E_DIST].lower() not in DIST_ALGOS + DIST_OPTIONS):
         if not kwargs[KW_E_DIST].is_file():
             error(f"The distance metric for the E-data seems to be a file-input but the filepath is invalid.",
                   10, kwargs[KW_CLI])
@@ -139,11 +139,11 @@ def validate_args(**kwargs) -> Dict[str, object]:
         error("The filepath to the weights of the F-data is invalid.", 14, kwargs[KW_CLI])
     if kwargs[KW_E_STRAT] is not None and isinstance(kwargs[KW_E_STRAT], Path) and not kwargs[KW_E_STRAT].is_file():
         error("The filepath to the stratification of the E-data is invalid.", 20, kwargs[KW_CLI])
-    if kwargs[KW_F_SIM] is not None and (isinstance(kwargs[KW_F_SIM], Path) or kwargs[KW_F_SIM].lower() not in SIM_ALGOS):
+    if kwargs[KW_F_SIM] is not None and (isinstance(kwargs[KW_F_SIM], Path) or kwargs[KW_F_SIM].lower() not in SIM_ALGOS + SIM_OPTIONS):
         if not kwargs[KW_F_SIM].is_file():
             error(f"The similarity metric for the F-data seems to be a file-input but the filepath is invalid.",
                   15, kwargs[KW_CLI])
-    if kwargs[KW_F_DIST] is not None and (isinstance(kwargs[KW_F_DIST], Path) or kwargs[KW_F_DIST].lower() not in DIST_ALGOS):
+    if kwargs[KW_F_DIST] is not None and (isinstance(kwargs[KW_F_DIST], Path) or kwargs[KW_F_DIST].lower() not in DIST_ALGOS + DIST_OPTIONS):
         if not kwargs[KW_F_DIST].is_file():
             error(f"The distance metric for the F-data seems to be a file-input but the filepath is invalid.",
                   16, kwargs[KW_CLI])
@@ -155,7 +155,7 @@ def validate_args(**kwargs) -> Dict[str, object]:
 
 
 def to_path(x):
-    return Path(x) if isinstance(x, str) and x not in ALGOS else x
+    return Path(x) if isinstance(x, str) and x not in ALGOS + FP_OPTIONS else x
 
 
 def datasail(
diff --git a/datasail/settings.py b/datasail/settings.py
@@ -83,6 +83,15 @@ def get_default(data_type: str, data_format: str) -> Tuple[Optional[str], Option
 DIST_ALGOS = [MASH, ]
 ALGOS = SIM_ALGOS + DIST_ALGOS
 
+SIM_OPTIONS = ["allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "onbit", "rogotgoldberg", "russel", "sokal", "tanimoto"]
+
+# unbounded: chebyshev, cityblock, euclidean, mahalanobis, manhattan, mcconnaughey, minkowski, sqeuclidean
+# produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
+# boolean only: dice, kulczynski1, russelrao, sokalsneath
+# matching == hamming, manhattan == cityblock (unofficial)
+DIST_OPTIONS = ["canberra", "hamming", "jaccard", "matching", "rogerstanimoto", "sokalmichener", "yule"]
+FP_OPTIONS = SIM_OPTIONS + DIST_OPTIONS
+
 # Check if the tools are installed
 INSTALLED = {
     CDHIT: os.system("cd-hit -h > /dev/null") == 256,