@@ -29,26 +29,26 @@ def _correlate_with_eigs(eigvecs, phasing_vector, metric="spearmanr"):
2929 Correlation coefficients.
3030 """
3131 corrs = []
32+ # iterate over columns
33+ for eigvec in eigvecs .T :
3234
33- for i in range (eigvecs .shape [1 ]):
34-
35- mask = np .isfinite (eigvecs [:, i ]) & np .isfinite (phasing_vector )
35+ mask = np .isfinite (eigvec ) & np .isfinite (phasing_vector )
3636
3737 if metric is None or metric == "spearmanr" :
38- corr = scipy .stats .spearmanr (phasing_vector [mask ], eigvecs [mask , i ])[0 ]
38+ corr = scipy .stats .spearmanr (phasing_vector [mask ], eigvec [mask ])[0 ]
3939 elif metric == "pearsonr" :
40- corr = scipy .stats .pearsonr (phasing_vector [mask ], eigvecs [mask , i ])[0 ]
40+ corr = scipy .stats .pearsonr (phasing_vector [mask ], eigvec [mask ])[0 ]
4141 elif metric == "var_explained" :
42- corr = scipy .stats .pearsonr (phasing_vector [mask ], eigvecs [mask , i ])[0 ]
42+ corr = scipy .stats .pearsonr (phasing_vector [mask ], eigvec [mask ])[0 ]
4343 # multiply by the sign to keep the phasing information
44- corr = np .sign (corr ) * corr * corr * np .var (eigvecs [mask , i ])
44+ corr = np .sign (corr ) * corr * corr * np .var (eigvec [mask ])
4545 elif metric == "MAD_explained" :
4646 corr = (
47- numutils .COMED (phasing_vector [mask ], eigvecs [mask , i ]) *
48- numutils .MAD (eigvecs [mask , i ])
47+ numutils .COMED (phasing_vector [mask ], eigvec [mask ]) *
48+ numutils .MAD (eigvec [mask ])
4949 )
5050 else :
51- raise ValueError ("Unknown correlation metric: {}" . format ( metric ) )
51+ raise ValueError (f "Unknown correlation metric: { metric } " )
5252
5353 corrs .append (corr )
5454
@@ -214,8 +214,6 @@ def eigs_cis(
214214 else ignore_diags
215215 )
216216
217- bins = clr .bins ()[:]
218-
219217 # Adjust phasing track as necessary
220218 if phasing_track is not None :
221219 phasing_track = align_track_with_cooler (
@@ -224,32 +222,18 @@ def eigs_cis(
224222 view_df = view_df ,
225223 clr_weight_name = clr_weight_name ,
226224 mask_clr_bad_bins = True ,
227- drop_track_na = True # this adds check for chromosomes that have all missing values
225+ drop_track_na = True # this adds check for chromosomes that have all missing values
228226 )
229227
230- # Prepare output table for eigen vectors
231- eigvec_table = bioframe .assign_view (bins , view_df ).dropna (subset = ["view_region" ], axis = 0 )
232- eigvec_table = eigvec_table .loc [:, bins .columns ]
233- eigvec_columns = [f"E{ i + 1 } " for i in range (n_eigs )]
234- for ev_col in eigvec_columns :
235- eigvec_table [ev_col ] = np .nan
236-
237- # Prepare output table for eigenvalues
238- eigvals_table = view_df .copy ()
239- eigval_columns = [f"eigval{ i + 1 } " for i in range (n_eigs )]
240- for eval_col in eigval_columns :
241- eigvals_table [eval_col ] = np .nan
242-
243- # Eigendecompose matrix per region (can be multiprocessed).
244- # Output assumes that the order of results matches regions.
228+ # Eigendecompose matrix per region (can be multiprocessed)
245229 def _each (region ):
246230 _region = region [:3 ] # take only (chrom, start, end)
247231 A = clr .matrix (balance = clr_weight_name ).fetch (_region )
248232
249233 OE = _obsexp_cis_dense (A , ignore_diags , clip_percentile )
250234 if OE is None :
251- eigvals = np .array ([ np .nan for i in range ( n_eigs )] )
252- eigvecs = np .array ([ np . ones ( A . shape [ 0 ]) * np .nan for i in range ( n_eigs )] )
235+ eigvals = np .full ( shape = n_eigs , fill_value = np .nan )
236+ eigvecs = np .full ( shape = ( n_eigs , len ( A )), fill_value = np .nan )
253237 else :
254238 eigvecs , eigvals = numutils .get_eig (OE , n_eigs , mask_zero_rows = True )
255239 eigvecs /= np .sqrt (np .nansum (eigvecs ** 2 , axis = 1 ))[:, None ]
@@ -258,7 +242,7 @@ def _each(region):
258242 if phasing_track is not None :
259243 # Extract phasing track relevant for the _region
260244 phasing_track_region = bioframe .select (phasing_track , _region )
261- phasing_vector = phasing_track_region ["value" ].values
245+ phasing_vector = phasing_track_region ["value" ].to_numpy ()
262246 corrs = _correlate_with_eigs (
263247 eigvecs , phasing_vector , corr_metric
264248 )
@@ -273,12 +257,25 @@ def _each(region):
273257
274258 return _region , eigvals , eigvecs
275259
276- results = map (_each , view_df .values )
260+ # perform eigendecomposition for each region independently
261+ eigdecomp_results = map (_each , view_df .values )
277262
263+ # Prepare output table for eigen vectors by adding columns to bins-table
264+ bins = clr .bins ()[:]
265+ eigvec_columns = [f"E{ i + 1 } " for i in range (n_eigs )]
266+ eigvec_table = bins .reindex (columns = bins .columns [:3 ] + eigvec_columns )
267+ eigvec_table = bioframe .assign_view (eigvec_table , view_df ).dropna (subset = ["view_region" ], axis = 0 )
268+
269+ # Prepare output table for eigenvalues by adding columns to view_df-table
270+ eigval_columns = [f"eigval{ i + 1 } " for i in range (n_eigs )]
271+ eigvals_table = view_df .reindex (columns = view_df .columns + eigval_columns )
272+
278273 # Go through eigendecomposition results and fill in output tables.
279- for _region , _eigvals , _eigvecs in results :
274+ for _region , _eigvals , _eigvecs in eigdecomp_results :
275+ # fill in eigvec_table with the eigenvectors for a given region
280276 idx = bioframe .select (eigvec_table , _region ).index
281277 eigvec_table .loc [idx , eigvec_columns ] = _eigvecs .T
278+ # fill in eigval_table with the eigenvalues for a given region
282279 idx = bioframe .select (eigvals_table , _region ).index
283280 eigvals_table .loc [idx , eigval_columns ] = _eigvals
284281
@@ -482,7 +479,7 @@ def eigs_trans(
482479 mask_clr_bad_bins = True ,
483480 drop_track_na = True # this adds check for chromosomes that have all missing values
484481 )
485- phasing_vector = phasing_track ["value" ].values [lo :hi ]
482+ phasing_vector = phasing_track ["value" ].to_numpy () [lo :hi ]
486483
487484 OE = _obsexp_trans_dense (A , partition , perc_top , perc_bottom )
488485
@@ -503,13 +500,13 @@ def eigs_trans(
503500 eigvals = eigvals [idx ]
504501 eigvecs [idx ] = eigvecs [idx ]
505502
506- eigvec_table = bins .copy ()
507- for i , eigvec in enumerate (eigvecs ):
508- eigvec_table [f"E{ i + 1 } " ] = eigvec
503+ # prepare bins-like table to store eigenvectors and fill it
504+ eigvec_columns = [f"E{ i + 1 } " for i in range (n_eigs )]
505+ eigvec_table = bins .reindex (columns = bins .columns [:3 ] + eigvec_columns )
506+ eigvec_table [eigvec_columns ] = eigvecs .T
509507
510- eigvals_table = pd .DataFrame (
511- data = np .atleast_2d (eigvals ),
512- columns = [f"eigval{ i + 1 } " for i in range (n_eigs )],
513- )
508+ # prepare table (single row) to store eigenvalues in columns and fill it
509+ eigval_columns = [f"eigval{ i + 1 } " for i in range (n_eigs )]
510+ eigvals_table = pd .DataFrame ( np .atleast_2d (eigvals ), columns = eigval_columns )
514511
515512 return eigvals_table , eigvec_table
0 commit comments