Skip to content

Commit 813478b

Browse files
Minor bug fixes
1 parent 11d5cdf commit 813478b

File tree

8 files changed

+57
-37
lines changed

8 files changed

+57
-37
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,10 @@ test.py
1919
tests/data/genomes/out/*
2020
tests/*.log
2121
tests/*.fasta
22+
23+
# Exclude build artifacts
2224
dist/*
25+
*64/*
26+
noarch/*
27+
channeldata/*
28+
index.html

datasail/cluster/clustering.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from datasail.cluster.wlk import run_wlk
2020
from datasail.reader.utils import DataSet
2121
from datasail.report import whatever
22-
from datasail.settings import LOGGER, KW_THREADS, KW_LOGDIR, KW_OUTDIR, WLK, MMSEQS, MMSEQS2, MMSEQSPP, \
22+
from datasail.settings import DIST_OPTIONS, FP_OPTIONS, LOGGER, KW_THREADS, KW_LOGDIR, KW_OUTDIR, SIM_OPTIONS, WLK, MMSEQS, MMSEQS2, MMSEQSPP, \
2323
FOLDSEEK, CDHIT, CDHIT_EST, ECFP, DIAMOND,TANIMOTO, KW_LINKAGE
2424

2525

@@ -112,8 +112,11 @@ def similarity_clustering(dataset: DataSet, threads: int = 1, log_dir: Optional[
112112
run_mmseqs(dataset, threads, log_dir)
113113
elif dataset.similarity.lower() == MMSEQSPP:
114114
run_mmseqspp(dataset, threads, log_dir)
115-
elif dataset.similarity.lower() == TANIMOTO:
116-
run_vector(dataset)
115+
elif dataset.similarity.lower() in SIM_OPTIONS:
116+
if isinstance(dataset.data[dataset.names[0]], str):
117+
run_ecfp(dataset, method=dataset.similarity.lower())
118+
else:
119+
run_vector(dataset, dataset.similarity.lower())
117120
else:
118121
raise ValueError(f"Unknown cluster method: {dataset.similarity}")
119122

@@ -139,6 +142,11 @@ def distance_clustering(dataset: DataSet, threads: int = 1, log_dir: Optional[st
139142
"""
140143
if dataset.distance.lower() == "mash":
141144
run_mash(dataset, threads, log_dir)
145+
elif dataset.distance.lower() in DIST_OPTIONS:
146+
if isinstance(dataset.data[dataset.names[0]], str):
147+
run_ecfp(dataset, method=dataset.distance.lower())
148+
else:
149+
run_vector(dataset, dataset.distance.lower())
142150
else:
143151
raise ValueError(f"Unknown cluster method: {dataset.distance}")
144152

datasail/cluster/ecfp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from datasail.settings import LOGGER
77

88

9-
def run_ecfp(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
9+
def run_ecfp(dataset: DataSet, method: str = "tanimoto") -> None:
1010
"""
1111
Compute 1024Bit-ECFPs for every molecule in the dataset and then compute pairwise Tanimoto-Scores of them.
1212

datasail/cluster/vectors.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,23 +6,10 @@
66
from rdkit import DataStructs
77

88
from datasail.reader.utils import DataSet
9-
from datasail.settings import LOGGER
9+
from datasail.settings import DIST_OPTIONS, LOGGER, SIM_OPTIONS
1010

11-
SIM_OPTIONS = Literal[
12-
"allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "onbit", "rogotgoldberg",
13-
"russel", "sokal", "tanimoto"
14-
]
1511

16-
# unbounded: chebyshev, cityblock, euclidean, mahalanobis, manhattan, mcconnaughey, minkowski, sqeuclidean
17-
# produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
18-
# boolean only: dice, kulczynski1, russelrao, sokalsneath
19-
# matching == hamming, manhattan == cityblock (unofficial)
20-
DIST_OPTIONS = Literal[
21-
"canberra", "hamming", "jaccard", "matching", "rogerstanimoto", "sokalmichener", "yule"
22-
]
23-
24-
25-
def get_rdkit_fct(method: SIM_OPTIONS) -> Callable[[Any, Any], np.ndarray]:
12+
def get_rdkit_fct(method: str) -> Callable[[Any, Any], np.ndarray]:
2613
"""
2714
Get the RDKit function for the given similarity measure.
2815
@@ -57,7 +44,7 @@ def get_rdkit_fct(method: SIM_OPTIONS) -> Callable[[Any, Any], np.ndarray]:
5744
raise ValueError(f"Unknown method {method}")
5845

5946

60-
def rdkit_sim(fps, method: SIM_OPTIONS) -> np.ndarray:
47+
def rdkit_sim(fps, method: str) -> np.ndarray:
6148
"""
6249
Compute the similarity between elements of a list of rdkit vectors.
6350
@@ -108,7 +95,7 @@ def iterable2bitvect(it) -> DataStructs.ExplicitBitVect:
10895
return output
10996

11097

111-
def run_vector(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
98+
def run_vector(dataset: DataSet, method: str = "tanimoto") -> None:
11299
"""
113100
Compute pairwise Tanimoto-Scores of the given dataset.
114101
@@ -120,7 +107,7 @@ def run_vector(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
120107
method = method.lower()
121108

122109
embed = dataset.data[dataset.names[0]]
123-
if method in get_args(SIM_OPTIONS):
110+
if method in SIM_OPTIONS:
124111
if isinstance(embed, (list, tuple, np.ndarray)):
125112
if isinstance(embed[0], int) or np.issubdtype(embed[0].dtype, int):
126113
if method in ["allbit", "asymmetric", "braunblanquet", "cosine", "kulczynski", "onbit",
@@ -137,7 +124,7 @@ def run_vector(dataset: DataSet, method: SIM_OPTIONS = "tanimoto") -> None:
137124
raise ValueError(
138125
f"Unsupported embedding type {type(embed)}. Please use either RDKit datastructures, lists, "
139126
f"tuples or one-dimensional numpy arrays.")
140-
elif method in get_args(DIST_OPTIONS):
127+
elif method in DIST_OPTIONS:
141128
dtype = np.bool_ if ["jaccard", "rogerstanimoto", "sokalmichener", "yule"] else np.float64
142129
if isinstance(embed, (
143130
list, tuple, DataStructs.ExplicitBitVect, DataStructs.LongSparseIntVect, DataStructs.IntSparseIntVect)):
@@ -159,7 +146,7 @@ def run(
159146
dataset: DataSet,
160147
fps: Union[np.ndarray, DataStructs.ExplicitBitVect, DataStructs.LongSparseIntVect,
161148
DataStructs.IntSparseIntVect],
162-
method: Union[SIM_OPTIONS, DIST_OPTIONS],
149+
method: str,
163150
) -> None:
164151
"""
165152
Compute pairwise similarities of the given fingerprints.
@@ -169,11 +156,11 @@ def run(
169156
fps: The fingerprints to compute pairwise similarities for.
170157
method: The similarity measure to use.
171158
"""
172-
if method in get_args(SIM_OPTIONS):
159+
if method in SIM_OPTIONS:
173160
dataset.cluster_similarity = rdkit_sim(fps, method)
174161
if method == "mcconnaughey":
175162
dataset.cluster_similarity = dataset.cluster_similarity + 1 / 2
176-
elif method in get_args(DIST_OPTIONS):
163+
elif method in DIST_OPTIONS:
177164
if method == "mahalanobis" and len(fps) <= len(fps[0]):
178165
raise ValueError(
179166
f"For clustering with the Mahalanobis method, you have to have more observations that dimensions in "
@@ -185,6 +172,8 @@ def run(
185172
dataset.cluster_distance = dataset.cluster_distance / len(fps[0])
186173
elif method == "yule":
187174
dataset.cluster_distance /= 2
175+
else:
176+
raise ValueError(f"Unknown method to compare fingerprints. Found: {method}")
188177

189178

190179
if __name__ == '__main__':

datasail/eval.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,15 @@ def eval_single_split(datatype, data: Optional[Union[dict[str, Any], str, Path]]
7171
metric *= weight_matrix
7272

7373
metric_total = np.sum(metric)
74-
leakage = np.sum(in_split_mask * metric)
74+
# leakage = np.sum(in_split_mask * metric)
75+
if mode == "sim":
76+
leakage = metric_total - np.sum(in_split_mask * metric)
77+
# return leakage / metric_total, leakage, metric_total
7578
if mode == "dist":
76-
leakage = metric_total - leakage
77-
return 1 - (leakage / metric_total), leakage, metric_total
79+
leakage = np.sum(in_split_mask * metric)
80+
return leakage / metric_total, leakage, metric_total
81+
82+
83+
# if mode == "dist":
84+
# leakage = metric_total - leakage
85+
# return 1 - (leakage / metric_total), metric_total - leakage, metric_total

datasail/reader/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from rdkit import Chem
1313

1414
from datasail.reader.validate import validate_user_args
15-
from datasail.settings import get_default, SIM_ALGOS, DIST_ALGOS, UNK_LOCATION, format2ending, FASTA_FORMATS
15+
from datasail.settings import DIST_OPTIONS, SIM_OPTIONS, get_default, SIM_ALGOS, DIST_ALGOS, UNK_LOCATION, format2ending, FASTA_FORMATS
1616

1717
DATA_INPUT = Optional[Union[str, Path, Dict[str, Union[str, np.ndarray]],
1818
Callable[..., Dict[str, Union[str, np.ndarray]]], Generator[Tuple[str, Union[str, np.ndarray]], None, None]]]
@@ -304,9 +304,9 @@ def read_data(
304304
if sim is None and dist is None:
305305
dataset.similarity, dataset.distance = get_default(dataset.type, dataset.format)
306306
dataset.names = list(dataset.data.keys())
307-
elif sim is not None and not (isinstance(sim, str) and sim.lower() in SIM_ALGOS):
307+
elif sim is not None and not (isinstance(sim, str) and sim.lower() in SIM_ALGOS + SIM_OPTIONS):
308308
dataset.names, dataset.similarity = read_matrix_input(sim)
309-
elif dist is not None and not (isinstance(dist, str) and dist.lower() in DIST_ALGOS):
309+
elif dist is not None and not (isinstance(dist, str) and dist.lower() in DIST_ALGOS + DIST_OPTIONS):
310310
dataset.names, dataset.distance = read_matrix_input(dist)
311311
else:
312312
if sim is not None:

datasail/sail.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,11 +119,11 @@ def validate_args(**kwargs) -> Dict[str, object]:
119119
error("The filepath to the weights of the E-data is invalid.", 8, kwargs[KW_CLI])
120120
if kwargs[KW_E_STRAT] is not None and isinstance(kwargs[KW_E_STRAT], Path) and not kwargs[KW_E_STRAT].is_file():
121121
error("The filepath to the stratification of the E-data is invalid.", 11, kwargs[KW_CLI])
122-
if kwargs[KW_E_SIM] is not None and (isinstance(kwargs[KW_E_SIM], Path) or kwargs[KW_E_SIM].lower() not in SIM_ALGOS):
122+
if kwargs[KW_E_SIM] is not None and (isinstance(kwargs[KW_E_SIM], Path) or kwargs[KW_E_SIM].lower() not in SIM_ALGOS + SIM_OPTIONS):
123123
if not kwargs[KW_E_SIM].is_file():
124124
error(f"The similarity metric for the E-data seems to be a file-input but the filepath is invalid.",
125125
9, kwargs[KW_CLI])
126-
if kwargs[KW_E_DIST] is not None and (isinstance(kwargs[KW_E_DIST], Path) or kwargs[KW_E_DIST].lower() not in DIST_ALGOS):
126+
if kwargs[KW_E_DIST] is not None and (isinstance(kwargs[KW_E_DIST], Path) or kwargs[KW_E_DIST].lower() not in DIST_ALGOS + DIST_OPTIONS):
127127
if not kwargs[KW_E_DIST].is_file():
128128
error(f"The distance metric for the E-data seems to be a file-input but the filepath is invalid.",
129129
10, kwargs[KW_CLI])
@@ -139,11 +139,11 @@ def validate_args(**kwargs) -> Dict[str, object]:
139139
error("The filepath to the weights of the F-data is invalid.", 14, kwargs[KW_CLI])
140140
if kwargs[KW_E_STRAT] is not None and isinstance(kwargs[KW_E_STRAT], Path) and not kwargs[KW_E_STRAT].is_file():
141141
error("The filepath to the stratification of the E-data is invalid.", 20, kwargs[KW_CLI])
142-
if kwargs[KW_F_SIM] is not None and (isinstance(kwargs[KW_F_SIM], Path) or kwargs[KW_F_SIM].lower() not in SIM_ALGOS):
142+
if kwargs[KW_F_SIM] is not None and (isinstance(kwargs[KW_F_SIM], Path) or kwargs[KW_F_SIM].lower() not in SIM_ALGOS + SIM_OPTIONS):
143143
if not kwargs[KW_F_SIM].is_file():
144144
error(f"The similarity metric for the F-data seems to be a file-input but the filepath is invalid.",
145145
15, kwargs[KW_CLI])
146-
if kwargs[KW_F_DIST] is not None and (isinstance(kwargs[KW_F_DIST], Path) or kwargs[KW_F_DIST].lower() not in DIST_ALGOS):
146+
if kwargs[KW_F_DIST] is not None and (isinstance(kwargs[KW_F_DIST], Path) or kwargs[KW_F_DIST].lower() not in DIST_ALGOS + DIST_OPTIONS):
147147
if not kwargs[KW_F_DIST].is_file():
148148
error(f"The distance metric for the F-data seems to be a file-input but the filepath is invalid.",
149149
16, kwargs[KW_CLI])
@@ -155,7 +155,7 @@ def validate_args(**kwargs) -> Dict[str, object]:
155155

156156

157157
def to_path(x):
158-
return Path(x) if isinstance(x, str) and x not in ALGOS else x
158+
return Path(x) if isinstance(x, str) and x not in ALGOS + FP_OPTIONS else x
159159

160160

161161
def datasail(

datasail/settings.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,15 @@ def get_default(data_type: str, data_format: str) -> Tuple[Optional[str], Option
8383
DIST_ALGOS = [MASH, ]
8484
ALGOS = SIM_ALGOS + DIST_ALGOS
8585

86+
SIM_OPTIONS = ["allbit", "asymmetric", "braunblanquet", "cosine", "dice", "kulczynski", "onbit", "rogotgoldberg", "russel", "sokal", "tanimoto"]
87+
88+
# unbounded: chebyshev, cityblock, euclidean, mahalanobis, manhattan, mcconnaughey, minkowski, sqeuclidean
89+
# produces inf or nan: correlation, cosine, jensenshannon, seuclidean, braycurtis
90+
# boolean only: dice, kulczynski1, russelrao, sokalsneath
91+
# matching == hamming, manhattan == cityblock (unofficial)
92+
DIST_OPTIONS = ["canberra", "hamming", "jaccard", "matching", "rogerstanimoto", "sokalmichener", "yule"]
93+
FP_OPTIONS = SIM_OPTIONS + DIST_OPTIONS
94+
8695
# Check if the tools are installed
8796
INSTALLED = {
8897
CDHIT: os.system("cd-hit -h > /dev/null") == 256,

0 commit comments

Comments
 (0)