From 2cc4f071aa09488b6a79a3ffe9424f9c19f1ef1e Mon Sep 17 00:00:00 2001 From: Emma Dann Date: Sun, 7 May 2023 21:52:24 +0000 Subject: [PATCH 1/6] added option for strand-aware expansion --- bioframe/ops.py | 27 +++++++++++++++++++++++--- tests/test_ops.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/bioframe/ops.py b/bioframe/ops.py index b7cbcca..ec7ff7c 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -151,7 +151,7 @@ def select(df, region, cols=None): return df.loc[select_mask(df, region, cols)] -def expand(df, pad=None, scale=None, side="both", cols=None): +def expand(df, pad=None, scale=None, side="both", cols=None, strand_aware=False): """ Expand each interval by an amount specified with `pad`. @@ -181,6 +181,10 @@ def expand(df, pad=None, scale=None, side="both", cols=None): cols : (str, str, str) or None The names of columns containing the chromosome, start and end of the genomic intervals. Default values are 'chrom', 'start', 'end'. 
+ + strand_aware: bool + If True, the left or right expansions are made considering strand information + Default False Returns ------- @@ -194,6 +198,12 @@ def expand(df, pad=None, scale=None, side="both", cols=None): ck, sk, ek = _get_default_colnames() if cols is None else cols checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek]) + if strand_aware: + if not 'strand' in df.columns: + raise ValueError('strand column is missing - strand-aware expansion is not possible') + if not df.strand.isin(['+', '-']).all(): + missing_strand = (~df.strand.isin(['+', '-'])).sum() + raise ValueError(f'strand information missing for {missing_strand}/{df.shape[0]} ranges - strand-aware expansion is not possible') if scale is not None and pad is not None: raise ValueError("only one of pad or scale can be supplied") @@ -210,10 +220,21 @@ def expand(df, pad=None, scale=None, side="both", cols=None): raise ValueError("either pad or scale must be supplied") df_expanded = df.copy() - if side == "both" or side == "left": + if side == 'both': df_expanded[sk] = df[sk].values - pads - if side == "both" or side == "right": df_expanded[ek] = df[ek] + pads + if side == "left": + if strand_aware: + df_expanded[sk] = np.where(df["strand"] == '+', df[sk] - pads, df[sk]) + df_expanded[ek] = np.where(df["strand"] == '+', df[ek], df[ek] + pads) + else: + df_expanded[sk] = df[sk].values - pads + if side == "right": + if strand_aware: + df_expanded[sk] = np.where(df["strand"] == '+', df[sk], df[sk] - pads) + df_expanded[ek] = np.where(df["strand"] == '+', df[ek] + pads, df[ek]) + else: + df_expanded[ek] = df[ek] + pads if pad is not None: if pad < 0: diff --git a/tests/test_ops.py b/tests/test_ops.py index 95f4e06..4c34f5e 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -297,6 +297,54 @@ def test_expand_amount_args(): df = pd.read_csv(StringIO(d), sep=r"\s+") with pytest.raises(ValueError): bioframe.expand(df, pad=10, scale=2.0) + +def test_expand_strand_aware(): + df_test = 
pd.DataFrame( + [ + ["chr1", 1000, 1200, "+"], + ["chr1", 800, 1200, "-"], + ["chrX", 1000, 1500, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + df_test_expanded_right = bioframe.expand(df_test, pad=100, side='right', strand_aware=True) + df_test_expanded_left = bioframe.expand(df_test, pad=100, side='left', strand_aware=True) + + df_right = pd.DataFrame( + [ + ["chr1", 1000, 1300, "+"], + ["chr1", 700, 1200, "-"], + ["chrX", 1000, 1600, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + df_left = pd.DataFrame( + [ + ["chr1", 900, 1200, "+"], + ["chr1", 800, 1300, "-"], + ["chrX", 900, 1500, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + + pd.testing.assert_frame_equal(df_right, df_test_expanded_right) + pd.testing.assert_frame_equal(df_left, df_test_expanded_left) + + # Test strand information is correct + df_test = pd.DataFrame( + [ + ["chr1", 1000, 1200, "."], + ["chr1", 800, 1200, "-"], + ["chrX", 1000, 1500, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + with pytest.raises(ValueError): + bioframe.expand(df_test, pad=100, side='right', strand_aware=True) + + df_test.drop('strand', axis=1, inplace=True) + with pytest.raises(ValueError): + bioframe.expand(df_test, pad=100, side='right', strand_aware=True) def test_overlap(): From de66e90d13cbbaffd47c4100c0296a798268f501 Mon Sep 17 00:00:00 2001 From: Emma Dann Date: Sun, 7 May 2023 22:05:39 +0000 Subject: [PATCH 2/6] uniformed notation w closest --- bioframe/ops.py | 30 +++++++++++++++--------------- tests/test_ops.py | 10 +++++----- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/bioframe/ops.py b/bioframe/ops.py index ec7ff7c..17d9e17 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -151,7 +151,7 @@ def select(df, region, cols=None): return df.loc[select_mask(df, region, cols)] -def expand(df, pad=None, scale=None, side="both", cols=None, strand_aware=False): +def expand(df, pad=None, scale=None, side="both", cols=None, 
direction_col=None): """ Expand each interval by an amount specified with `pad`. @@ -182,9 +182,9 @@ def expand(df, pad=None, scale=None, side="both", cols=None, strand_aware=False) The names of columns containing the chromosome, start and end of the genomic intervals. Default values are 'chrom', 'start', 'end'. - strand_aware: bool - If True, the left or right expansions are made considering strand information - Default False + direction_col: str or None + Name of direction column that will set upstream/downstream orientation for each feature. + The column should contain bioframe-compliant strand ("+", "-", "."). Returns ------- @@ -198,11 +198,11 @@ def expand(df, pad=None, scale=None, side="both", cols=None, strand_aware=False) ck, sk, ek = _get_default_colnames() if cols is None else cols checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek]) - if strand_aware: - if not 'strand' in df.columns: - raise ValueError('strand column is missing - strand-aware expansion is not possible') - if not df.strand.isin(['+', '-']).all(): - missing_strand = (~df.strand.isin(['+', '-'])).sum() + if direction_col is not None: + if not direction_col in df.columns: + raise ValueError(f'{direction_col} column is missing - strand-aware expansion is not possible') + if not df.strand.isin(['+', '-', '.']).all(): + missing_strand = (~df[direction_col].isin(['+', '-', '.'])).sum() raise ValueError(f'strand information missing for {missing_strand}/{df.shape[0]} ranges - strand-aware expansion is not possible') if scale is not None and pad is not None: @@ -224,15 +224,15 @@ def expand(df, pad=None, scale=None, side="both", cols=None, strand_aware=False) df_expanded[sk] = df[sk].values - pads df_expanded[ek] = df[ek] + pads if side == "left": - if strand_aware: - df_expanded[sk] = np.where(df["strand"] == '+', df[sk] - pads, df[sk]) - df_expanded[ek] = np.where(df["strand"] == '+', df[ek], df[ek] + pads) + if direction_col is not None: + df_expanded[sk] = np.where(df[direction_col] == 
'-', df[sk] , df[sk] - pads) + df_expanded[ek] = np.where(df[direction_col] == '-', df[ek] + pads, df[ek] ) else: df_expanded[sk] = df[sk].values - pads if side == "right": - if strand_aware: - df_expanded[sk] = np.where(df["strand"] == '+', df[sk], df[sk] - pads) - df_expanded[ek] = np.where(df["strand"] == '+', df[ek] + pads, df[ek]) + if direction_col is not None: + df_expanded[sk] = np.where(df[direction_col] == '-', df[sk] - pads, df[sk] ) + df_expanded[ek] = np.where(df[direction_col] == '-', df[ek] , df[ek] + pads) else: df_expanded[ek] = df[ek] + pads diff --git a/tests/test_ops.py b/tests/test_ops.py index 4c34f5e..17c5e3e 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -307,8 +307,8 @@ def test_expand_strand_aware(): ], columns=["chrom", "start", "end", "strand"], ) - df_test_expanded_right = bioframe.expand(df_test, pad=100, side='right', strand_aware=True) - df_test_expanded_left = bioframe.expand(df_test, pad=100, side='left', strand_aware=True) + df_test_expanded_right = bioframe.expand(df_test, pad=100, side='right', direction_col='strand') + df_test_expanded_left = bioframe.expand(df_test, pad=100, side='left', direction_col='strand') df_right = pd.DataFrame( [ @@ -333,18 +333,18 @@ def test_expand_strand_aware(): # Test strand information is correct df_test = pd.DataFrame( [ - ["chr1", 1000, 1200, "."], + ["chr1", 1000, 1200, "x"], ["chr1", 800, 1200, "-"], ["chrX", 1000, 1500, "+"], ], columns=["chrom", "start", "end", "strand"], ) with pytest.raises(ValueError): - bioframe.expand(df_test, pad=100, side='right', strand_aware=True) + bioframe.expand(df_test, pad=100, side='right', direction_col='strand') df_test.drop('strand', axis=1, inplace=True) with pytest.raises(ValueError): - bioframe.expand(df_test, pad=100, side='right', strand_aware=True) + bioframe.expand(df_test, pad=100, side='right', direction_col='strand') def test_overlap(): From c3e20220855e0a8964b3f95de3bf5f6c1a59226c Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: 
Sun, 7 Apr 2024 10:57:14 -0400 Subject: [PATCH 3/6] Implement stand-awareness through new function --- bioframe/__init__.py | 2 + bioframe/ops.py | 129 +++++++++++++++++++++++------ tests/test_ops.py | 190 ++++++++++++++++++++++++++++++++----------- 3 files changed, 247 insertions(+), 74 deletions(-) diff --git a/bioframe/__init__.py b/bioframe/__init__.py index abc2ea3..1de445e 100644 --- a/bioframe/__init__.py +++ b/bioframe/__init__.py @@ -68,6 +68,7 @@ "select_labels", "select_mask", "setdiff", + "shift", "sort_bedframe", "subtract", "trim", @@ -141,6 +142,7 @@ select_labels, select_mask, setdiff, + shift, sort_bedframe, subtract, trim, diff --git a/bioframe/ops.py b/bioframe/ops.py index 26a51c2..54e4216 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pandas as pd @@ -18,6 +20,7 @@ "closest", "subtract", "setdiff", + "shift", "count_overlaps", "trim", "complement", @@ -147,7 +150,104 @@ def select(df, region, cols=None): return df.loc[select_mask(df, region, cols)] -def expand(df, pad=None, scale=None, side="both", cols=None, direction_col=None): +def shift(df, amount, along=None, drop_invalid=False, cols=None): + """ + Translate the bounds of each genomic interval. + + Different shift amounts can be applied to leading and trailing bounds, and + can be applied in a strand-aware manner. Negative values indicate a shift + leftwards or upstream. + + Parameters + ---------- + df : pandas.DataFrame + + amount : int, array-like, or pair of int or array-like, optional + The amount(s) by which the bounds are linearly shifted. If a pair + ``(x, y)``, shift the leading bound by ``x`` and the trailing bound by + ``y``. Negative and positive values shift in the upstream and + downstream directions, respectively. Features are taken to assume the + reference orientation unless ``along`` is specified. + + along: str, array-like, or None + Name of column that will set up/downstream orientation for each + feature. 
The column should contain compliant strand values + ("+", "-", "."). + + cols : (str, str, str) or None + The names of columns containing the chromosome, start and end of the + genomic intervals. Default values are 'chrom', 'start', 'end'. + + Returns + ------- + pandas.DataFrame + + Notes + ----- + See :func:`bioframe.trim` for trimming intervals after expansion or shift. + """ + ck, sk, ek = _get_default_colnames() if cols is None else cols + checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek]) + + if along is not None: + if not along in df.columns: + raise ValueError( + f'Cannot do strand-aware operation: {along} column is missing.' + ) + if not df[along].isin(['+', '-', '.']).all(): + missing_strand = (~df[along].isin(['+', '-', '.'])).sum() + raise ValueError( + 'Cannot do strand-aware operation: strand information missing ' + f'for {missing_strand}/{df.shape[0]} ranges.' + ) + + if not isinstance(amount, (list, tuple)): + amount = (amount, amount) + elif len(amount) != 2: + raise ValueError( + "`amount` should be a single object or a sequence of length 2; " + f"got length {len(amount)}." + ) + + out = df.copy() + if along is None: + out[sk] = df[sk] + amount[0] + out[ek] = df[ek] + amount[1] + else: + out[sk] = np.where( + df[along] == '+', + df[sk] + amount[0], + np.where( + df[along] == '-', + df[sk] - amount[1], + df[sk] + ) + ) + out[ek] = np.where( + df[along] == '+', + df[ek] + amount[1], + np.where( + df[along] == '-', + df[ek] - amount[0], + df[ek] + ) + ) + + is_neglen = (out[ek] - out[sk]) < 0 + if is_neglen.any(): + if drop_invalid: + out = out.loc[~is_neglen] + else: + warnings.warn( + f"Operation produced {is_neglen.sum()}/{out.shape[0]} " + "intervals with negative length." + ) + + return out + + +def expand(df, pad=None, scale=None, side="both", cols=None): + """ Expand each interval by an amount specified with `pad`. 
@@ -177,10 +277,6 @@ def expand(df, pad=None, scale=None, side="both", cols=None, direction_col=None) cols : (str, str, str) or None The names of columns containing the chromosome, start and end of the genomic intervals. Default values are 'chrom', 'start', 'end'. - - direction_col: str or None - Name of direction column that will set upstream/downstream orientation for each feature. - The column should contain bioframe-compliant strand ("+", "-", "."). Returns ------- @@ -189,17 +285,9 @@ def expand(df, pad=None, scale=None, side="both", cols=None, direction_col=None) Notes ----- See :func:`bioframe.trim` for trimming interals after expansion. - """ - ck, sk, ek = _get_default_colnames() if cols is None else cols checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek]) - if direction_col is not None: - if not direction_col in df.columns: - raise ValueError(f'{direction_col} column is missing - strand-aware expansion is not possible') - if not df.strand.isin(['+', '-', '.']).all(): - missing_strand = (~df[direction_col].isin(['+', '-', '.'])).sum() - raise ValueError(f'strand information missing for {missing_strand}/{df.shape[0]} ranges - strand-aware expansion is not possible') if scale is not None and pad is not None: raise ValueError("only one of pad or scale can be supplied") @@ -216,21 +304,10 @@ def expand(df, pad=None, scale=None, side="both", cols=None, direction_col=None) raise ValueError("either pad or scale must be supplied") df_expanded = df.copy() - if side == 'both': + if side == "both" or side == "left": df_expanded[sk] = df[sk].values - pads + if side == "both" or side == "right": df_expanded[ek] = df[ek] + pads - if side == "left": - if direction_col is not None: - df_expanded[sk] = np.where(df[direction_col] == '-', df[sk] , df[sk] - pads) - df_expanded[ek] = np.where(df[direction_col] == '-', df[ek] + pads, df[ek] ) - else: - df_expanded[sk] = df[sk].values - pads - if side == "right": - if direction_col is not None: - df_expanded[sk] = 
np.where(df[direction_col] == '-', df[sk] - pads, df[sk] ) - df_expanded[ek] = np.where(df[direction_col] == '-', df[ek] , df[ek] + pads) - else: - df_expanded[ek] = df[ek] + pads if pad is not None: if pad < 0: diff --git a/tests/test_ops.py b/tests/test_ops.py index f3931fe..c59b75d 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -214,6 +214,148 @@ def test_trim(): ) +def test_shift(): + df = pd.DataFrame( + [ + ["chr1", 1000, 1200, "+"], + ["chr1", 800, 1200, "-"], + ["chrX", 1000, 1500, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, 10), + pd.DataFrame( + [ + ["chr1", 1000 + 10, 1200 + 10, "+"], + ["chr1", 800 + 10, 1200 + 10, "-"], + ["chrX", 1000 + 10, 1500 + 10, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, -10), + pd.DataFrame( + [ + ["chr1", 1000 - 10, 1200 - 10, "+"], + ["chr1", 800 - 10, 1200 - 10, "-"], + ["chrX", 1000 - 10, 1500 - 10, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, (-10, 20)), + pd.DataFrame( + [ + ["chr1", 1000 - 10, 1200 + 20, "+"], + ["chr1", 800 - 10, 1200 + 20, "-"], + ["chrX", 1000 - 10, 1500 + 20, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, (10, -20)), + pd.DataFrame( + [ + ["chr1", 1000 + 10, 1200 - 20, "+"], + ["chr1", 800 + 10, 1200 - 20, "-"], + ["chrX", 1000 + 10, 1500 - 20, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, (10, -200), drop_invalid=True), + pd.DataFrame( + [ + ["chr1", 800 + 10, 1200 - 200, "-"], + ["chrX", 1000 + 10, 1500 - 200, "+"], + ], + columns=["chrom", "start", "end", "strand"], + index=[1, 2], + ) + ) + + +def test_shift_strand_aware(): + df = pd.DataFrame( + [ + ["chr1", 1000, 1200, "+"], + ["chr1", 800, 1200, "-"], + 
["chrX", 1000, 1500, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, 10, along="strand"), + pd.DataFrame( + [ + ["chr1", 1000 + 10, 1200 + 10, "+"], + ["chr1", 800 - 10, 1200 - 10, "-"], + ["chrX", 1000 + 10, 1500 + 10, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, -10, along="strand"), + pd.DataFrame( + [ + ["chr1", 1000 - 10, 1200 - 10, "+"], + ["chr1", 800 + 10, 1200 + 10, "-"], + ["chrX", 1000 - 10, 1500 - 10, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, (-10, 20), along="strand"), + pd.DataFrame( + [ + ["chr1", 1000 - 10, 1200 + 20, "+"], + ["chr1", 800 - 20, 1200 + 10, "-"], + ["chrX", 1000 - 10, 1500 + 20, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, (10, -20), along="strand"), + pd.DataFrame( + [ + ["chr1", 1000 + 10, 1200 - 20, "+"], + ["chr1", 800 + 20, 1200 - 10, "-"], + ["chrX", 1000 + 10, 1500 - 20, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + pd.testing.assert_frame_equal( + bioframe.shift(df, (10, -200), along="strand", drop_invalid=True), + pd.DataFrame( + [ + ["chr1", 800 + 200, 1200 - 10, "-"], + ["chrX", 1000 + 10, 1500 - 200, "+"], + ], + columns=["chrom", "start", "end", "strand"], + index=[1, 2], + ) + ) + + def test_expand(): d = """chrom start end 0 chr1 1 5 @@ -296,54 +438,6 @@ def test_expand_amount_args(): df = pd.read_csv(StringIO(d), sep=r"\s+") with pytest.raises(ValueError): bioframe.expand(df, pad=10, scale=2.0) - -def test_expand_strand_aware(): - df_test = pd.DataFrame( - [ - ["chr1", 1000, 1200, "+"], - ["chr1", 800, 1200, "-"], - ["chrX", 1000, 1500, "+"], - ], - columns=["chrom", "start", "end", "strand"], - ) - df_test_expanded_right = bioframe.expand(df_test, pad=100, side='right', direction_col='strand') - 
df_test_expanded_left = bioframe.expand(df_test, pad=100, side='left', direction_col='strand') - - df_right = pd.DataFrame( - [ - ["chr1", 1000, 1300, "+"], - ["chr1", 700, 1200, "-"], - ["chrX", 1000, 1600, "+"], - ], - columns=["chrom", "start", "end", "strand"], - ) - df_left = pd.DataFrame( - [ - ["chr1", 900, 1200, "+"], - ["chr1", 800, 1300, "-"], - ["chrX", 900, 1500, "+"], - ], - columns=["chrom", "start", "end", "strand"], - ) - - pd.testing.assert_frame_equal(df_right, df_test_expanded_right) - pd.testing.assert_frame_equal(df_left, df_test_expanded_left) - - # Test strand information is correct - df_test = pd.DataFrame( - [ - ["chr1", 1000, 1200, "x"], - ["chr1", 800, 1200, "-"], - ["chrX", 1000, 1500, "+"], - ], - columns=["chrom", "start", "end", "strand"], - ) - with pytest.raises(ValueError): - bioframe.expand(df_test, pad=100, side='right', direction_col='strand') - - df_test.drop('strand', axis=1, inplace=True) - with pytest.raises(ValueError): - bioframe.expand(df_test, pad=100, side='right', direction_col='strand') def test_overlap(): From 174674034fdcfafaa076bbb547a4df36e2c09057 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sun, 7 Apr 2024 11:10:02 -0400 Subject: [PATCH 4/6] Fix linting error --- bioframe/ops.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bioframe/ops.py b/bioframe/ops.py index 54e4216..b83a85c 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -174,6 +174,10 @@ def shift(df, amount, along=None, drop_invalid=False, cols=None): feature. The column should contain compliant strand values ("+", "-", "."). + drop_invalid: bool, optional [default: False] + Remove any intervals having negative length after shifting bounds. + By default, they will not be removed but a warning will be raised. + cols : (str, str, str) or None The names of columns containing the chromosome, start and end of the genomic intervals. Default values are 'chrom', 'start', 'end'. 
@@ -190,7 +194,7 @@ def shift(df, amount, along=None, drop_invalid=False, cols=None): checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek]) if along is not None: - if not along in df.columns: + if along not in df.columns: raise ValueError( f'Cannot do strand-aware operation: {along} column is missing.' ) From 13d04b36e95cc4f6033d91c17828bfe5ef35e22f Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sun, 7 Apr 2024 11:48:22 -0400 Subject: [PATCH 5/6] tests: Add test for ignoring unstranded features --- tests/test_ops.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/tests/test_ops.py b/tests/test_ops.py index c59b75d..bf71c0e 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -285,7 +285,7 @@ def test_shift(): ) -def test_shift_strand_aware(): +def test_shift_strandaware(): df = pd.DataFrame( [ ["chr1", 1000, 1200, "+"], @@ -356,6 +356,28 @@ def test_shift_strand_aware(): ) +def test_shift_strandaware_unstranded(): + df = pd.DataFrame( + [ + ["chr1", 1000, 1200, "+"], + ["chr1", 800, 1200, "."], + ["chrX", 1000, 1500, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + pd.testing.assert_frame_equal( + bioframe.shift(df, (10, -20), along="strand"), + pd.DataFrame( + [ + ["chr1", 1000 + 10, 1200 - 20, "+"], + ["chr1", 800, 1200, "."], + ["chrX", 1000 + 10, 1500 - 20, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ) + ) + + def test_expand(): d = """chrom start end 0 chr1 1 5 From e3443076a37233145958db7e67063b85bb839a9e Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sun, 7 Apr 2024 11:48:57 -0400 Subject: [PATCH 6/6] feat: Support array-like along argument --- bioframe/ops.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/bioframe/ops.py b/bioframe/ops.py index b83a85c..5b5fe17 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -172,7 +172,7 @@ def shift(df, amount, along=None, drop_invalid=False, cols=None): along: str, 
array-like, or None Name of column that will set up/downstream orientation for each feature. The column should contain compliant strand values - ("+", "-", "."). + ("+", "-", "."). Unstranded features will be ignored. drop_invalid: bool, optional [default: False] Remove any intervals having negative length after shifting bounds. @@ -194,19 +194,24 @@ def shift(df, amount, along=None, drop_invalid=False, cols=None): checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek]) if along is not None: - if along not in df.columns: - raise ValueError( - f'Cannot do strand-aware operation: {along} column is missing.' - ) - if not df[along].isin(['+', '-', '.']).all(): - missing_strand = (~df[along].isin(['+', '-', '.'])).sum() + if isinstance(along, str): + if along not in df.columns: + raise ValueError( + f'Cannot do strand-aware operation: {along} column is missing.' + ) + strands = df[along] + else: + strands = pd.Series(np.asarray(along), index=df.index) + + if not strands.isin(['+', '-', '.']).all(): + missing_strand = (~strands.isin(['+', '-', '.'])).sum() raise ValueError( 'Cannot do strand-aware operation: strand information missing ' f'for {missing_strand}/{df.shape[0]} ranges.' ) if not isinstance(amount, (list, tuple)): - amount = (amount, amount) + amount = (amount, amount) elif len(amount) != 2: raise ValueError( "`amount` should be a single object or a sequence of length 2; " @@ -219,19 +224,19 @@ def shift(df, amount, along=None, drop_invalid=False, cols=None): out[ek] = df[ek] + amount[1] else: out[sk] = np.where( - df[along] == '+', + strands == '+', df[sk] + amount[0], np.where( - df[along] == '-', + strands == '-', df[sk] - amount[1], df[sk] ) ) out[ek] = np.where( - df[along] == '+', + strands == '+', df[ek] + amount[1], np.where( - df[along] == '-', + strands == '-', df[ek] - amount[0], df[ek] )