Skip to content

Commit 81fb2fe

Browse files
committed
add documentation to some functions
1 parent 7fdb35a commit 81fb2fe

File tree

2 files changed

+146
-28
lines changed

2 files changed

+146
-28
lines changed

pydyno/seqanalysis.py

Lines changed: 138 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,8 @@ def lcs_dist_same_length(seq1, seq2):
6767
6868
Returns
6969
-------
70-
70+
int
71+
Longest common subsequence distance between seq1 and seq2
7172
"""
7273
seq_len = len(seq1)
7374
d_1_2 = 2 * seq_len - 2 * lcs.lcs_std(seq1, seq2)[0]
@@ -99,11 +100,11 @@ def multiprocessing_distance(data, metric_function, num_processors):
99100

100101
class SeqAnalysis:
101102
"""
102-
Analyze and visualize discretized trajectories
103+
Analyze and visualize discretized simulations
103104
104105
Parameters
105106
----------
106-
sequences : str, pd.DataFrame, np.ndarray, list
107+
sequences : str or pd.DataFrame or np.ndarray or list
107108
Sequence data from the discretization of a PySB model. If str it must be a csv file
108109
with the sequences as rows and the first row must have the time points of the simulation.
109110
target : str
@@ -188,7 +189,8 @@ def unique_sequences(self):
188189
189190
Returns
190191
-------
191-
pd.DataFrame with the unique sequences
192+
pd.DataFrame
193+
Pandas dataframe with unique sequences
192194
193195
"""
194196
data_seqs = self._sequences.groupby(self._sequences.columns.tolist(),
@@ -209,7 +211,8 @@ def truncate_sequences(self, idx):
209211
210212
Returns
211213
-------
212-
Sequences truncated at the idx indicated
214+
pydyno.seqanalysis.SeqAnalysis
215+
Sequences truncated at the idx indicated
213216
"""
214217
data_seqs = self._sequences[self._sequences.columns.tolist()[:idx]]
215218
return SeqAnalysis(data_seqs, self.target)
@@ -227,7 +230,8 @@ def dissimilarity_matrix(self, metric='LCS', num_processors=1):
227230
228231
Returns
229232
-------
230-
dissimilarity matrix: np.ndarray
233+
np.ndarray
234+
Dissimilarity matrix
231235
"""
232236

233237
# Sort sequences
@@ -289,7 +293,7 @@ def select_seqs_group(self, group):
289293
290294
Returns
291295
-------
292-
seqAnalysis
296+
pydyno.seqanalysis.SeqAnalysis
293297
a new object with the selected sequences
294298
"""
295299
if isinstance(group, int):
@@ -391,6 +395,29 @@ def states_colors(self, colors_dict):
391395

392396
def cluster_representativeness(self, method='freq', dmax=None, pradius=0.1,
393397
coverage=0.25, nrep=None):
398+
"""
399+
Obtain Representative sequences from each cluster
400+
401+
Parameters
402+
----------
403+
method: str
404+
Name of the method used to calculate cluster representatives. Supported methods:
405+
`freq`, `density`, `dist`
406+
dmax: float
407+
Maximum theoretical distance
408+
pradius: float
409+
Neighborhood radius as a percentage of the maximum distance dmax. Default is 0.1
410+
coverage: float
411+
Size of the representative set, i.e. the proportion of objects having a
412+
representative in their neighborhood
413+
nrep: int
414+
Number of representatives. If None, coverage argument is used
415+
416+
Returns
417+
-------
418+
dict
419+
Dictionary where keys are the cluster labels and values are the representative sequences
420+
"""
394421

395422
clusters = set(self.labels)
396423
clus_rep = {}
@@ -411,12 +438,22 @@ def seq_representativeness(diss, method='freq', dmax=None, pradius=0.1,
411438
Parameters
412439
----------
413440
method: str
414-
Method used to obtain the representativeness of the sequences in each cluster
441+
Name of the method used to calculate cluster representatives. Supported methods:
442+
`freq`, `density`, `dist`
415443
dmax: float
416444
Maximum theoretical distance
445+
pradius: float
446+
Neighborhood radius as a percentage of the maximum distance dmax. Default is 0.1
447+
coverage: float
448+
Size of the representative set, i.e. the proportion of objects having a
449+
representative in their neighborhood
450+
nrep: int
451+
Number of representatives. If None, coverage argument is used
417452
418453
Returns
419454
-------
455+
np.ndarray
456+
Indices of the representative sequences
420457
421458
"""
422459
n_seq = diss.shape[0]
@@ -488,9 +525,11 @@ def seq_representativeness(diss, method='freq', dmax=None, pradius=0.1,
488525

489526
def plot_sequences(self, type_fig='modal', plot_all=False, title='', dir_path='', sort_seq=None):
490527
"""
491-
Function to plot three different figures of the sequences.
492-
The modal figure takes the mode state at each time and plots
493-
the percentage of that state compated to all the other states.
528+
Plot sequences. A subplot is generated for each cluster.
529+
530+
Three types of plots can be generated: `modal`, `trajectories`, and `entropy`.
531+
The modal figure takes the mode state at each time point and plots
532+
the percentage of that state relative to all the other states.
494533
495534
The trajectories figure plots each of the sequences.
496535
@@ -525,14 +564,21 @@ def plot_sequences(self, type_fig='modal', plot_all=False, title='', dir_path=''
525564
def hdbscan_clustering(self, min_cluster_size=50, min_samples=5,
526565
alpha=1.0, cluster_selection_method='eom', **kwargs):
527566
"""
567+
Perform HDBSCAN clustering. For more information see `HDBSCAN <https://hdbscan.readthedocs.io/en/latest/index.html>`_
528568
529569
Parameters
530570
----------
531-
min_cluster_size
532-
min_samples
533-
alpha
534-
cluster_selection_method
535-
kwargs
571+
min_cluster_size : int, optional (default=50)
572+
The minimum size of clusters
573+
min_samples : int, optional (default=5)
574+
The number of samples in a neighborhood for a point to
575+
be considered a core point
576+
alpha : float, optional(default=1.0)
577+
A distance scaling parameter as used in robust single linkage
578+
cluster_selection_method : str, optional (default='eom')
579+
The method used to select clusters from the condensed tree
580+
kwargs : dict
581+
Extra arguments passed to the hdbscan clustering function
536582
537583
Returns
538584
-------
@@ -552,6 +598,7 @@ def hdbscan_clustering(self, min_cluster_size=50, min_samples=5,
552598

553599
def Kmedoids(self, n_clusters):
554600
"""
601+
Perform kmedoids clustering.
555602
556603
Parameters
557604
----------
@@ -573,22 +620,60 @@ def Kmedoids(self, n_clusters):
573620
return
574621

575622
def agglomerative_clustering(self, n_clusters, linkage='average', **kwargs):
623+
"""
624+
Perform agglomerative clustering. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html>`_
625+
626+
Parameters
627+
----------
628+
n_clusters : int
629+
The number of clusters to find
630+
linkage : {'ward', 'complete', 'average', 'single'}, default='average'
631+
Linkage criterion to use
632+
kwargs : dict
633+
Other arguments to pass to the agglomerative clustering function
634+
635+
Returns
636+
-------
637+
638+
"""
576639
ac = cluster.AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed',
577640
linkage=linkage, **kwargs).fit(self.diss)
578641
self._labels = ac.labels_
579642
self._cluster_method = 'agglomerative'
580643
return
581644

582645
def spectral_clustering(self, n_clusters, random_state=None, num_processors=1, **kwargs):
646+
"""
647+
Perform spectral clustering. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html>`_
648+
649+
Parameters
650+
----------
651+
n_clusters : int
652+
The dimension of the projection subspace
653+
random_state : int, RandomState instance, default=None
654+
A pseudo random number generator used for the initialization of the
655+
lobpcg eigen vectors decomposition when eigen_solver='amg' and by the
656+
K-Means initialization
657+
num_processors : int, default=1
658+
The number of parallel jobs to run
659+
kwargs : dict
660+
Other arguments to pass to the spectral clustering function
661+
662+
Returns
663+
-------
664+
665+
"""
583666
gamma = 1. / len(self.diss[0])
584667
kernel = np.exp(-self.diss * gamma)
585668
sc = cluster.SpectralClustering(n_clusters=n_clusters, random_state=random_state,
586669
affinity='precomputed', n_jobs=num_processors, **kwargs).fit(kernel)
587670
self._labels = sc.labels_
588671
self._cluster_method = 'spectral'
672+
return
589673

590674
def silhouette_score(self):
591675
"""
676+
Compute the mean Silhouette Coefficient of all samples. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html>`_
592677
593678
Returns
594679
-------
@@ -610,6 +695,26 @@ def silhouette_score(self):
610695
return score
611696

612697
def silhouette_score_spectral_range(self, cluster_range, num_processors=1, random_state=None, **kwargs):
698+
"""
699+
Calculate silhouette score for a range of cluster numbers using spectral clustering.
700+
For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html>`_
701+
702+
Parameters
703+
----------
704+
cluster_range: int or array-like
705+
Range of cluster numbers
706+
num_processors : int, default=1
707+
The number of parallel jobs to run
708+
random_state : int, RandomState instance or None, default=None
709+
Determines random number generation for selecting a subset of samples.
710+
kwargs : dict
711+
Other parameters passed to the silhouette score function
712+
713+
Returns
714+
-------
715+
pd.DataFrame
716+
Dataframe with the silhouette score for each cluster number
717+
"""
613718
if isinstance(cluster_range, int):
614719
cluster_range = list(range(2, cluster_range + 1)) # +1 to cluster up to cluster_range
615720
elif hasattr(cluster_range, "__len__") and not isinstance(cluster_range, str):
@@ -631,20 +736,24 @@ def silhouette_score_spectral_range(self, cluster_range, num_processors=1, rando
631736
def silhouette_score_agglomerative_range(self, cluster_range, linkage='average',
632737
num_processors=1, **kwargs):
633738
"""
739+
Calculate silhouette score for a range of cluster numbers using agglomerative clustering.
740+
For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html>`_
634741
635742
Parameters
636743
----------
637744
cluster_range : list-like or int
638-
Range of the number of clusterings to obtain the silhouette score
745+
Range of cluster numbers
639746
linkage : str
640747
Type of agglomerative linkage
641748
num_processors : int
642-
Number of cores to use
643-
kwargs : key arguments to pass to the aggomerative clustering function
749+
The number of parallel jobs to run
750+
kwargs : dict
751+
Other arguments to pass to the agglomerative clustering function
644752
645753
Returns
646754
-------
647-
755+
pd.DataFrame
756+
Dataframe with the silhouette score for each cluster number
648757
"""
649758

650759
if isinstance(cluster_range, int):
@@ -669,6 +778,15 @@ def silhouette_score_agglomerative_range(self, cluster_range, linkage='average',
669778
return clusters_df
670779

671780
def calinski_harabaz_score(self):
781+
"""
782+
Compute the Calinski and Harabasz score. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.calinski_harabasz_score.html>`_
783+
784+
Returns
785+
-------
786+
float
787+
Calinski harabasz score to measure quality of the clustering
788+
789+
"""
672790
if self._labels is None:
673791
raise Exception('you must cluster the signatures first')
674792
score = metrics.calinski_harabasz_score(self.sequences, self._labels)

pydyno/visualize_simulations.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
class VisualizeSimulations(object):
2626
"""
27-
Visualize PySB simulations and parameter distributions in different clusters
27+
Visualize PySB simulations and parameter distributions in different execution modes
2828
2929
Parameters
3030
----------
@@ -33,10 +33,8 @@ class VisualizeSimulations(object):
3333
sim_results: SimulationResult or h5 file from PySB simulation
3434
SimulationResult object or h5 file with the dynamic solutions of the model for all the parameter sets
3535
clusters: vector-like or str or None
36-
Indices of the parameters that belong to an specific cluster. It can be a list of files that contain
37-
the indices of each cluster, a list of lists where each list has the parameter indices of a cluster or
38-
a file that contains the cluster labels to which each parameter belongs to, or None if the user want to
39-
analyse the sim_results as a single cluster.
36+
Cluster labels for each simulation. It can be a file that contains the cluster labels, or None
37+
if the user wants to analyse the sim_results as a single cluster.
4038
truncate_idx: int
4139
Index at which the simulation is truncated. Only works when clusters is None. It cannot be used at the same
4240
time with truncate_idx.
@@ -144,7 +142,7 @@ def time_change(self):
144142
return self._time_change
145143

146144
@staticmethod
147-
def check_clusters_arg(clusters, nsims): # check clusters
145+
def check_clusters_arg(clusters, nsims): # check cluster labels
148146
def _clusters_to_dict(cluster_labels, n):
149147
# Takes a list of cluster labels and create a dictionary where the keys are the labels, and
150148
# the values are the indices in the original list
@@ -184,7 +182,7 @@ def plot_cluster_dynamics(self, components, x_data=None, y_data=None, y_error=No
184182
type_fig='trajectories', add_y_histogram=False, fig_name='', plot_format='png',
185183
species_ftn_fit=None, norm=False, norm_value=None, fit_options={}, figure_options={}):
186184
"""
187-
Plots the dynamics of species/observables/pysb expressions for each cluster
185+
Plots the dynamics of species/observables/pysb-expressions for each execution mode
188186
189187
Parameters
190188
----------
@@ -217,8 +215,10 @@ def plot_cluster_dynamics(self, components, x_data=None, y_data=None, y_error=No
217215
norm_value: array-like or str
218216
Array of values used to normalize species concentrations. Must have same order
219217
as species
220-
kwargs: dict
218+
fit_options: dict
221219
Arguments to pass to the fitting function
220+
figure_options: dict
221+
Arguments to pass to the Matplotlib subplots
222222
223223
Returns
224224
-------

0 commit comments

Comments
 (0)