Skip to content

Commit 0c2d217

Browse files
authored
tweak eigdecomp.py
few tweaks and reorderings in eigdecomp to address my comments in open2c#397
1 parent 98c19b0 commit 0c2d217

File tree

1 file changed

+38
-41
lines changed

1 file changed

+38
-41
lines changed

cooltools/api/eigdecomp.py

Lines changed: 38 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,26 @@ def _correlate_with_eigs(eigvecs, phasing_vector, metric="spearmanr"):
2929
Correlation coefficients.
3030
"""
3131
corrs = []
32+
# iterate over columns
33+
for eigvec in eigvecs.T:
3234

33-
for i in range(eigvecs.shape[1]):
34-
35-
mask = np.isfinite(eigvecs[:, i]) & np.isfinite(phasing_vector)
35+
mask = np.isfinite(eigvec) & np.isfinite(phasing_vector)
3636

3737
if metric is None or metric == "spearmanr":
38-
corr = scipy.stats.spearmanr(phasing_vector[mask], eigvecs[mask, i])[0]
38+
corr = scipy.stats.spearmanr(phasing_vector[mask], eigvec[mask])[0]
3939
elif metric == "pearsonr":
40-
corr = scipy.stats.pearsonr(phasing_vector[mask], eigvecs[mask, i])[0]
40+
corr = scipy.stats.pearsonr(phasing_vector[mask], eigvec[mask])[0]
4141
elif metric == "var_explained":
42-
corr = scipy.stats.pearsonr(phasing_vector[mask], eigvecs[mask, i])[0]
42+
corr = scipy.stats.pearsonr(phasing_vector[mask], eigvec[mask])[0]
4343
# multiply by the sign to keep the phasing information
44-
corr = np.sign(corr) * corr * corr * np.var(eigvecs[mask, i])
44+
corr = np.sign(corr) * corr * corr * np.var(eigvec[mask])
4545
elif metric == "MAD_explained":
4646
corr = (
47-
numutils.COMED(phasing_vector[mask], eigvecs[mask, i]) *
48-
numutils.MAD(eigvecs[mask, i])
47+
numutils.COMED(phasing_vector[mask], eigvec[mask]) *
48+
numutils.MAD(eigvec[mask])
4949
)
5050
else:
51-
raise ValueError("Unknown correlation metric: {}".format(metric))
51+
raise ValueError(f"Unknown correlation metric: {metric}")
5252

5353
corrs.append(corr)
5454

@@ -214,8 +214,6 @@ def eigs_cis(
214214
else ignore_diags
215215
)
216216

217-
bins = clr.bins()[:]
218-
219217
# Adjust phasing track as necessary
220218
if phasing_track is not None:
221219
phasing_track = align_track_with_cooler(
@@ -224,32 +222,18 @@ def eigs_cis(
224222
view_df=view_df,
225223
clr_weight_name=clr_weight_name,
226224
mask_clr_bad_bins=True,
227-
drop_track_na=True # this adds check for chromosomes that have all missing values
225+
drop_track_na=True # this adds check for chromosomes that have all missing values
228226
)
229227

230-
# Prepare output table for eigen vectors
231-
eigvec_table = bioframe.assign_view(bins, view_df).dropna(subset=["view_region"], axis=0)
232-
eigvec_table = eigvec_table.loc[:, bins.columns]
233-
eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
234-
for ev_col in eigvec_columns:
235-
eigvec_table[ev_col] = np.nan
236-
237-
# Prepare output table for eigenvalues
238-
eigvals_table = view_df.copy()
239-
eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
240-
for eval_col in eigval_columns:
241-
eigvals_table[eval_col] = np.nan
242-
243-
# Eigendecompose matrix per region (can be multiprocessed).
244-
# Output assumes that the order of results matches regions.
228+
# Eigendecompose matrix per region (can be multiprocessed)
245229
def _each(region):
246230
_region = region[:3] # take only (chrom, start, end)
247231
A = clr.matrix(balance=clr_weight_name).fetch(_region)
248232

249233
OE = _obsexp_cis_dense(A, ignore_diags, clip_percentile)
250234
if OE is None:
251-
eigvals = np.array([np.nan for i in range(n_eigs)])
252-
eigvecs = np.array([np.ones(A.shape[0]) * np.nan for i in range(n_eigs)])
235+
eigvals = np.full(shape=n_eigs, fill_value=np.nan)
236+
eigvecs = np.full(shape=(n_eigs, len(A)), fill_value=np.nan)
253237
else:
254238
eigvecs, eigvals = numutils.get_eig(OE, n_eigs, mask_zero_rows=True)
255239
eigvecs /= np.sqrt(np.nansum(eigvecs ** 2, axis=1))[:, None]
@@ -258,7 +242,7 @@ def _each(region):
258242
if phasing_track is not None:
259243
# Extract phasing track relevant for the _region
260244
phasing_track_region = bioframe.select(phasing_track, _region)
261-
phasing_vector = phasing_track_region["value"].values
245+
phasing_vector = phasing_track_region["value"].to_numpy()
262246
corrs = _correlate_with_eigs(
263247
eigvecs, phasing_vector, corr_metric
264248
)
@@ -273,12 +257,25 @@ def _each(region):
273257

274258
return _region, eigvals, eigvecs
275259

276-
results = map(_each, view_df.values)
260+
# perform eigendecomposition for each region independently
261+
eigdecomp_results = map(_each, view_df.values)
277262

263+
# Prepare output table for eigen vectors by adding columns to bins-table
264+
bins = clr.bins()[:]
265+
eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
266+
eigvec_table = bins.reindex(columns=bins.columns[:3] + eigvec_columns)
267+
eigvec_table = bioframe.assign_view(eigvec_table, view_df).dropna(subset=["view_region"], axis=0)
268+
269+
# Prepare output table for eigenvalues by adding columns to view_df-table
270+
eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
271+
eigvals_table = view_df.reindex(columns=view_df.columns + eigval_columns)
272+
278273
# Go through eigendecomposition results and fill in output tables.
279-
for _region, _eigvals, _eigvecs in results:
274+
for _region, _eigvals, _eigvecs in eigdecomp_results:
275+
# fill in eigvec_table with the eigenvectors for a given region
280276
idx = bioframe.select(eigvec_table, _region).index
281277
eigvec_table.loc[idx, eigvec_columns] = _eigvecs.T
278+
# fill in eigval_table with the eigenvalues for a given region
282279
idx = bioframe.select(eigvals_table, _region).index
283280
eigvals_table.loc[idx, eigval_columns] = _eigvals
284281

@@ -482,7 +479,7 @@ def eigs_trans(
482479
mask_clr_bad_bins=True,
483480
drop_track_na=True # this adds check for chromosomes that have all missing values
484481
)
485-
phasing_vector = phasing_track["value"].values[lo:hi]
482+
phasing_vector = phasing_track["value"].to_numpy()[lo:hi]
486483

487484
OE = _obsexp_trans_dense(A, partition, perc_top, perc_bottom)
488485

@@ -503,13 +500,13 @@ def eigs_trans(
503500
eigvals = eigvals[idx]
504501
eigvecs[idx] = eigvecs[idx]
505502

506-
eigvec_table = bins.copy()
507-
for i, eigvec in enumerate(eigvecs):
508-
eigvec_table[f"E{i + 1}"] = eigvec
503+
# prepare bins-like table to store eigenvectors and fill it
504+
eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
505+
eigvec_table = bins.reindex(columns=bins.columns[:3] + eigvec_columns)
506+
eigvec_table[eigvec_columns] = eigvecs.T
509507

510-
eigvals_table = pd.DataFrame(
511-
data=np.atleast_2d(eigvals),
512-
columns=[f"eigval{i + 1}" for i in range(n_eigs)],
513-
)
508+
# prepare table (single row) to store eigenvalues in columns and fill it
509+
eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
510+
eigvals_table = pd.DataFrame( np.atleast_2d(eigvals), columns=eigval_columns )
514511

515512
return eigvals_table, eigvec_table

0 commit comments

Comments
 (0)