@@ -67,7 +67,8 @@ def lcs_dist_same_length(seq1, seq2):
67
67
68
68
Returns
69
69
-------
70
-
70
+ int
71
+ Longest common subsequence distance between seq1 and seq2
71
72
"""
72
73
seq_len = len (seq1 )
73
74
d_1_2 = 2 * seq_len - 2 * lcs .lcs_std (seq1 , seq2 )[0 ]
@@ -99,11 +100,11 @@ def multiprocessing_distance(data, metric_function, num_processors):
99
100
100
101
class SeqAnalysis :
101
102
"""
102
- Analyze and visualize discretized trajectories
103
+ Analyze and visualize discretized simulations
103
104
104
105
Parameters
105
106
----------
106
- sequences : str, pd.DataFrame, np.ndarray, list
107
+ sequences : str or pd.DataFrame or np.ndarray or list
107
108
Sequence data from the discretization of a PySB model. If str it must be a csv file
108
109
with the sequences as rows and the first row must have the time points of the simulation.
109
110
target : str
@@ -188,7 +189,8 @@ def unique_sequences(self):
188
189
189
190
Returns
190
191
-------
191
- pd.DataFrame with the unique sequences
192
+ pd.DataFrame
193
+ Pandas dataframe with unique sequences
192
194
193
195
"""
194
196
data_seqs = self ._sequences .groupby (self ._sequences .columns .tolist (),
@@ -209,7 +211,8 @@ def truncate_sequences(self, idx):
209
211
210
212
Returns
211
213
-------
212
- Sequences truncated at the idx indicated
214
+ pydyno.seqanalysis.SeqAnalysis
215
+ Sequences truncated at the idx indicated
213
216
"""
214
217
data_seqs = self ._sequences [self ._sequences .columns .tolist ()[:idx ]]
215
218
return SeqAnalysis (data_seqs , self .target )
@@ -227,7 +230,8 @@ def dissimilarity_matrix(self, metric='LCS', num_processors=1):
227
230
228
231
Returns
229
232
-------
230
- dissimilarity matrix: np.ndarray
233
+ np.ndarray
234
+ dissimilarity matrix
231
235
"""
232
236
233
237
# Sort sequences
@@ -289,7 +293,7 @@ def select_seqs_group(self, group):
289
293
290
294
Returns
291
295
-------
292
- seqAnalysis
296
+ pydyno.seqanalysis.SeqAnalysis
293
297
a new object with the selected sequences
294
298
"""
295
299
if isinstance (group , int ):
@@ -391,6 +395,29 @@ def states_colors(self, colors_dict):
391
395
392
396
def cluster_representativeness (self , method = 'freq' , dmax = None , pradius = 0.1 ,
393
397
coverage = 0.25 , nrep = None ):
398
+ """
399
+ Obtain representative sequences from each cluster
400
+
401
+ Parameters
402
+ ----------
403
+ method: str
404
+ Name of the method used to calculate cluster representatives. Supported methods:
405
+ `freq`, `density`, `dist`
406
+ dmax: float
407
+ Maximum theoretical distance
408
+ pradius: float
409
+ neighborhood radius as a percentage of the maximum distance dmax. Default is 0.1
410
+ coverage: float
411
+ Size of the representative set, i.e. the proportion of objects having a
412
+ representative in their neighborhood
413
+ nrep: int
414
+ Number of representatives. If None, coverage argument is used
415
+
416
+ Returns
417
+ -------
418
+ dict
419
+ Dictionary where keys are the cluster labels and values are the representative sequences
420
+ """
394
421
395
422
clusters = set (self .labels )
396
423
clus_rep = {}
@@ -411,12 +438,22 @@ def seq_representativeness(diss, method='freq', dmax=None, pradius=0.1,
411
438
Parameters
412
439
----------
413
440
method: str
414
- Method used to obtain the representativeness of the sequences in each cluster
441
+ Name of the method used to calculate cluster representatives. Supported methods:
442
+ `freq`, `density`, `dist`
415
443
dmax: float
416
444
Maximum theoretical distance
445
+ pradius: float
446
+ neighborhood radius as a percentage of the maximum distance dmax. Default is 0.1
447
+ coverage: float
448
+ Size of the representative set, i.e. the proportion of objects having a
449
+ representative in their neighborhood
450
+ nrep: int
451
+ Number of representatives. If None, coverage argument is used
417
452
418
453
Returns
419
454
-------
455
+ np.ndarray
456
+ Indices of the representative sequences
420
457
421
458
"""
422
459
n_seq = diss .shape [0 ]
@@ -488,9 +525,11 @@ def seq_representativeness(diss, method='freq', dmax=None, pradius=0.1,
488
525
489
526
def plot_sequences (self , type_fig = 'modal' , plot_all = False , title = '' , dir_path = '' , sort_seq = None ):
490
527
"""
491
- Function to plot three different figures of the sequences.
492
- The modal figure takes the mode state at each time and plots
493
- the percentage of that state compated to all the other states.
528
+ Plot sequences. A subplot is generated for each cluster.
529
+
530
+ Three types of plots can be generated: `modal`, `trajectories`, and `entropy`.
531
+ The modal figure takes the mode state at each time point and plots
532
+ the percentage of that state relative to all the other states.
494
533
495
534
The trajectories figure plots each of the sequences.
496
535
@@ -525,14 +564,21 @@ def plot_sequences(self, type_fig='modal', plot_all=False, title='', dir_path=''
525
564
def hdbscan_clustering (self , min_cluster_size = 50 , min_samples = 5 ,
526
565
alpha = 1.0 , cluster_selection_method = 'eom' , ** kwargs ):
527
566
"""
567
+ Perform HDBSCAN clustering. For more information see `HDBSCAN <https://hdbscan.readthedocs.io/en/latest/index.html>`_
528
568
529
569
Parameters
530
570
----------
531
- min_cluster_size
532
- min_samples
533
- alpha
534
- cluster_selection_method
535
- kwargs
571
+ min_cluster_size : int, optional(default=50)
572
+ The minimum size of clusters
573
+ min_samples : int, optional(default=5)
574
+ The number of samples in a neighborhood for a point to
575
+ be considered a core point
576
+ alpha : float, optional(default=1.0)
577
+ A distance scaling parameter as used in robust single linkage
578
+ cluster_selection_method : str, optional (default='eom')
579
+ The method used to select clusters from the condensed tree
580
+ kwargs : dict
581
+ Extra arguments passed to the hdbscan clustering function
536
582
537
583
Returns
538
584
-------
@@ -552,6 +598,7 @@ def hdbscan_clustering(self, min_cluster_size=50, min_samples=5,
552
598
553
599
def Kmedoids (self , n_clusters ):
554
600
"""
601
+ Perform kmedoids clustering.
555
602
556
603
Parameters
557
604
----------
@@ -573,22 +620,60 @@ def Kmedoids(self, n_clusters):
573
620
return
574
621
575
622
def agglomerative_clustering (self , n_clusters , linkage = 'average' , ** kwargs ):
623
+ """
624
+ Perform agglomerative clustering. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html>`_
625
+
626
+ Parameters
627
+ ----------
628
+ n_clusters : int
629
+ The number of clusters to find
630
+ linkage : {'ward', 'complete', 'average', 'single'}, default='average'
631
+ Linkage criterion to use
632
+ kwargs : dict
633
+ Other arguments to pass to the agglomerative clustering function
634
+
635
+ Returns
636
+ -------
637
+
638
+ """
576
639
ac = cluster .AgglomerativeClustering (n_clusters = n_clusters , affinity = 'precomputed' ,
577
640
linkage = linkage , ** kwargs ).fit (self .diss )
578
641
self ._labels = ac .labels_
579
642
self ._cluster_method = 'agglomerative'
580
643
return
581
644
582
645
def spectral_clustering (self , n_clusters , random_state = None , num_processors = 1 , ** kwargs ):
646
+ """
647
+ Perform spectral clustering. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html>`_
648
+
649
+ Parameters
650
+ ----------
651
+ n_clusters : int
652
+ The dimension of the projection subspace
653
+ random_state : int, RandomState instance, default=None
654
+ A pseudo random number generator used for the initialization of the
655
+ lobpcg eigen vectors decomposition when eigen_solver='amg' and by the
656
+ K-Means initialization
657
+ num_processors : int, default=1
658
+ The number of parallel jobs to run
659
+ kwargs : dict
660
+ Other arguments to pass to the spectral clustering function
661
+
662
+ Returns
663
+ -------
664
+
665
+ """
583
666
gamma = 1. / len (self .diss [0 ])
584
667
kernel = np .exp (- self .diss * gamma )
585
668
sc = cluster .SpectralClustering (n_clusters = n_clusters , random_state = random_state ,
586
669
affinity = 'precomputed' , n_jobs = num_processors , ** kwargs ).fit (kernel )
587
670
self ._labels = sc .labels_
588
671
self ._cluster_method = 'spectral'
672
+ return
589
673
590
674
def silhouette_score (self ):
591
675
"""
676
+ Compute the mean Silhouette Coefficient of all samples. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html>`_
592
677
593
678
Returns
594
679
-------
@@ -610,6 +695,26 @@ def silhouette_score(self):
610
695
return score
611
696
612
697
def silhouette_score_spectral_range (self , cluster_range , num_processors = 1 , random_state = None , ** kwargs ):
698
+ """
699
+ Calculate silhouette score for a range of cluster numbers using spectral clustering.
700
+ For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html>`_
701
+
702
+ Parameters
703
+ ----------
704
+ cluster_range: int or array-like
705
+ Range of cluster numbers
706
+ num_processors : int, default=1
707
+ The number of parallel jobs to run
708
+ random_state : int, RandomState instance or None, default=None
709
+ Determines random number generation for selecting a subset of samples.
710
+ kwargs : dict
711
+ Other parameters passed to the silhouette score function
712
+
713
+ Returns
714
+ -------
715
+ pd.DataFrame
716
+ Dataframe with the silhouette score for each cluster number
717
+ """
613
718
if isinstance (cluster_range , int ):
614
719
cluster_range = list (range (2 , cluster_range + 1 )) # +1 to cluster up to cluster_range
615
720
elif hasattr (cluster_range , "__len__" ) and not isinstance (cluster_range , str ):
@@ -631,20 +736,24 @@ def silhouette_score_spectral_range(self, cluster_range, num_processors=1, rando
631
736
def silhouette_score_agglomerative_range (self , cluster_range , linkage = 'average' ,
632
737
num_processors = 1 , ** kwargs ):
633
738
"""
739
+ Calculate silhouette score for a range of cluster numbers using agglomerative clustering.
740
+ For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html>`_
634
741
635
742
Parameters
636
743
----------
637
744
cluster_range : list-like or int
638
- Range of the number of clusterings to obtain the silhouette score
745
+ Range of cluster numbers
639
746
linkage : str
640
747
Type of agglomerative linkage
641
748
num_processors : int
642
- Number of cores to use
643
- kwargs : key arguments to pass to the aggomerative clustering function
749
+ The number of parallel jobs to run
750
+ kwargs : dict
751
+ Other arguments to pass to the agglomerative clustering function
644
752
645
753
Returns
646
754
-------
647
-
755
+ pd.DataFrame
756
+ Dataframe with the silhouette score for each cluster number
648
757
"""
649
758
650
759
if isinstance (cluster_range , int ):
@@ -669,6 +778,15 @@ def silhouette_score_agglomerative_range(self, cluster_range, linkage='average',
669
778
return clusters_df
670
779
671
780
def calinski_harabaz_score (self ):
781
+ """
782
+ Compute the Calinski and Harabasz score. For more information see `sklearn <https://scikit-learn.org/stable/modules/generated/sklearn.metrics.calinski_harabasz_score.html>`_
783
+
784
+ Returns
785
+ -------
786
+ float
787
+ Calinski harabasz score to measure quality of the clustering
788
+
789
+ """
672
790
if self ._labels is None :
673
791
raise Exception ('you must cluster the signatures first' )
674
792
score = metrics .calinski_harabasz_score (self .sequences , self ._labels )
0 commit comments