Skip to content

Commit a8481b1

Browse files
authored
Merge pull request #26 from jrm5100/master
Version 0.11.0 : Add Region
2 parents 13c28d5 + 8373e58 commit a8481b1

File tree

11 files changed

+328
-10
lines changed

11 files changed

+328
-10
lines changed

docs/release-history.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
Release History
33
===============
44

5+
v0.11.0 (2021-07-21)
6+
--------------------
7+
8+
Add a *Region* scalar with support for loading from BED files and filtering
9+
510
v0.10.1 (2021-07-16)
611
--------------------
712

pandas_genomics/accessors/dataframe_accessor.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
from collections import Counter
2-
from typing import Optional, List
2+
from typing import Optional, List, Union
33

44
import numpy as np
55
import pandas as pd
66

77
from .utils import calculate_edge_alphas
88
from pandas_genomics.arrays import GenotypeDtype
9+
from ..scalars import Region
910

1011

1112
@pd.api.extensions.register_dataframe_accessor("genomics")
@@ -300,3 +301,53 @@ def filter_variants_hwe(self, cutoff: float = 0.05) -> pd.DataFrame:
300301
:, (genotype_hwe_pval < cutoff) & ~np.isnan(genotype_hwe_pval)
301302
].columns
302303
return self._obj.drop(columns=removed)
304+
305+
def in_regions(self, regions: Union[Region, List[Region]]):
306+
"""
307+
Keep variants that exist in one or more of the specified regions
308+
309+
Parameters
310+
----------
311+
regions: Region or List[Region]
312+
313+
Returns
314+
-------
315+
pd.DataFrame
316+
"""
317+
if isinstance(regions, Region):
318+
regions = [
319+
regions,
320+
]
321+
genotypes = self._obj.select_dtypes([GenotypeDtype])
322+
boolean_array = genotypes.apply(lambda s: False)
323+
for r in regions:
324+
boolean_array = boolean_array | genotypes.apply(
325+
lambda s: s.genomics.contained_by(r)
326+
)
327+
removed = boolean_array[~boolean_array].index # Remove where False
328+
return self._obj.drop(columns=removed)
329+
330+
def not_in_regions(self, regions: Union[Region, List[Region]]):
331+
"""
332+
Remove variants that exist in one or more of the specified regions
333+
334+
Parameters
335+
----------
336+
regions: Region or List[Region]
337+
338+
Returns
339+
-------
340+
pd.DataFrame
341+
"""
342+
if isinstance(regions, Region):
343+
regions = [
344+
regions,
345+
]
346+
genotypes = self._obj.select_dtypes([GenotypeDtype])
347+
boolean_array = genotypes.apply(lambda s: False)
348+
for r in regions:
349+
boolean_array = boolean_array | genotypes.apply(
350+
lambda s: s.genomics.contained_by(r)
351+
)
352+
removed = boolean_array[boolean_array].index # Remove where True
353+
return self._obj.drop(columns=removed)

pandas_genomics/accessors/series_accessor.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
from typing import Optional, List
1+
from typing import Optional, List, Union
22

33
import pandas as pd
44

55
from .utils import calculate_edge_alphas
66
from pandas_genomics.arrays import GenotypeDtype
7+
from ..scalars import Region
78

89

910
@pd.api.extensions.register_series_accessor("genomics")
@@ -72,9 +73,6 @@ def maf(self):
7273
See :py:attr:`GenotypeArray.maf`"""
7374
return self._array.maf
7475

75-
#########################
76-
# Calculated Properties #
77-
#########################
7876
@property
7977
def hwe_pval(self):
8078
"""Return the probability that the samples are in HWE
@@ -226,3 +224,29 @@ def calculate_edge_encoding_values(
226224
##############
227225
# QC Methods #
228226
##############
227+
228+
# TODO
229+
230+
#################
231+
# Other Methods #
232+
#################
233+
234+
def contained_by(self, regions: Union[Region, List[Region]]):
235+
"""
236+
True if the variant is contained within the specified region(s)
237+
238+
Parameters
239+
----------
240+
regions: Region or List[Region]
241+
242+
Returns
243+
-------
244+
bool
245+
"""
246+
if isinstance(regions, Region):
247+
return regions.contains_variant(self.variant)
248+
else:
249+
for r in regions:
250+
if r.contains_variant(self.variant):
251+
return True
252+
return False

pandas_genomics/io/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
"""
22
Input/Output
33
------------
4-
The `io` module provides functions for loading and saving GenotypeArrays to common variant formats
4+
The `io` module provides functions for loading and saving GenotypeArrays or scalar values to common variant formats
55
66
.. autosummary::
77
:toctree: io
88
99
from_plink
1010
to_plink
1111
from_vcf
12+
13+
from_bed
1214
"""
1315

16+
from .bed import from_bed
1417
from .plink import from_plink, to_plink
1518
from .vcf import from_vcf
1619

17-
__all__ = ["from_plink", "to_plink", "from_vcf"]
20+
__all__ = ["from_bed", "from_plink", "to_plink", "from_vcf"]

pandas_genomics/io/bed.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from pathlib import Path
2+
from typing import Union
3+
4+
import pandas as pd
5+
6+
from pandas_genomics.scalars import Region
7+
8+
9+
def from_bed(filename: Union[str, Path]):
10+
"""
11+
Yields genomic regions from a bed file as Region scalars
12+
13+
Parameters
14+
----------
15+
filename: str or Path
16+
bed file
17+
18+
Returns
19+
-------
20+
List[Region]
21+
"""
22+
with open(filename, "r") as input:
23+
for idx, line in enumerate(input):
24+
if (
25+
line.startswith("browser")
26+
or line.startswith("track")
27+
or line.startswith("#")
28+
):
29+
continue
30+
line = line.split("\t")
31+
if len(line) < 3:
32+
raise ValueError(
33+
f"Expected at least 3 tab-delimited fields, found {len(line)} in '{line}'."
34+
)
35+
# Note that positions need to be incremented by 1 to go from 0-based to 1-based.
36+
yield Region(line[0], int(line[1]) + 1, int(line[2]) + 1, line[3])

pandas_genomics/scalars.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
11
"""
22
Scalars
33
-------
4-
This module contains scalar types used in the ExtensionArrays. They may also be useful on their own.
4+
This module contains scalar types, some of which are used in the ExtensionArrays. They may also be useful on their own.
55
66
.. autosummary::
77
:toctree: scalars
88
99
Variant
1010
Genotype
11+
Region
1112
"""
1213
import uuid
14+
from dataclasses import dataclass
1315
from typing import Optional, List, Tuple, Union
1416

15-
MISSING_IDX = 255 # Integer indicating a missing allele or genotype score. Each variant must have 254 alleles max and the maximum genotype score is 254.
17+
# Integer indicating a missing allele or genotype score.
18+
# Each variant must have 254 alleles max and the maximum genotype score is 254.
19+
MISSING_IDX = 255
1620

1721

1822
class Variant:
@@ -509,3 +513,51 @@ def _float_score(self):
509513
return 255
510514
else:
511515
return self.score
516+
517+
518+
@dataclass(order=True)
519+
class Region:
520+
chromosome: str
521+
start: int
522+
end: int
523+
name: str = ""
524+
"""
525+
Information associated with a genomic region.
526+
527+
Parameters
528+
----------
529+
chomosome: str
530+
start, end: int
531+
1-based, open-ended (includes start, not end)
532+
name: str
533+
name/comment for the region
534+
535+
Examples
536+
--------
537+
>>> region1 = Region('chr12', 12345, 12387)
538+
>>> region2 = Region('chr12', 5236373, 5246380)
539+
>>> print(region1 < region2)
540+
True
541+
"""
542+
543+
def __post_init__(self):
544+
"""Run after init, use to validate input"""
545+
if not isinstance(self.chromosome, str):
546+
raise TypeError(f"chromosome should be a str, not {type(self.chromosome)}")
547+
if not isinstance(self.start, int) or not isinstance(self.end, int):
548+
raise TypeError("start and end must be integers")
549+
if self.start < 1 or self.end < 1:
550+
raise ValueError("start and end must both be > 0 (1-based indexing)")
551+
if self.start >= self.end:
552+
raise ValueError(
553+
"start must be <= end, since the start is included and the end is not."
554+
)
555+
556+
def contains_variant(self, var: Variant):
557+
if var.chromosome != self.chromosome:
558+
return False
559+
if var.position < self.start:
560+
return False
561+
if var.position >= self.end:
562+
return False
563+
return True

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pandas-genomics"
3-
version = "0.10.1"
3+
version = "0.11.0"
44
description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
55
license = "BSD-3-Clause"
66
authors = ["John McGuigan <jrm5100@psu.edu>"]

tests/data/bed/bed_test.bed

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
browser position chr7:127471196-127495720
2+
browser hide all
3+
track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On"
4+
chr7 127471196 127472363 Pos1 0 + 127471196 127472363 255,0,0
5+
chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0
6+
chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0

tests/genotype_array/test_GenotypeArrayAccessors.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@
66
import pytest
77
from pandas._testing import (
88
assert_series_equal,
9+
assert_frame_equal,
910
)
1011

12+
from pandas_genomics import GenotypeArray
13+
from pandas_genomics.scalars import Variant, Region
14+
1115

1216
def test_variant_score(data, data_for_encoding):
1317
assert pd.Series(data).genomics.variant.score == float(data.variant.score)
@@ -59,3 +63,52 @@ def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_cols_left):
5963
else:
6064
result = data.genomics.filter_variants_hwe(filter_value)
6165
assert len(result.columns) == num_cols_left + 1
66+
67+
68+
def test_region_series():
69+
var = Variant("chr1", position=999, ref="A", alt=["a"])
70+
s = pd.Series(
71+
GenotypeArray(
72+
[
73+
var.make_genotype_from_str("A/A"),
74+
]
75+
* 10
76+
)
77+
)
78+
assert s.genomics.contained_by(Region("chr1", 900, 1000))
79+
assert not s.genomics.contained_by(Region("chr2", 900, 1000))
80+
assert not s.genomics.contained_by(Region("chr1", 900, 999))
81+
82+
83+
def test_region_df():
84+
var1 = Variant("chr1", position=999, ref="A", alt=["a"])
85+
var2 = Variant("chr1", position=6789, ref="A", alt=["a"])
86+
var3 = Variant("chr2", position=999, ref="A", alt=["a"])
87+
var4 = Variant("chr3", position=25622, ref="A", alt=["a"])
88+
df = pd.DataFrame(
89+
{
90+
f"var{idx+1}": GenotypeArray(
91+
[
92+
var.make_genotype_from_str("A/A"),
93+
]
94+
* 10
95+
)
96+
for idx, var in enumerate([var1, var2, var3, var4])
97+
}
98+
)
99+
assert_frame_equal(
100+
df.genomics.in_regions(Region("chr1", 900, 1000)),
101+
df[
102+
[
103+
"var1",
104+
]
105+
],
106+
)
107+
assert_frame_equal(
108+
df.genomics.not_in_regions(Region("chr1", 900, 1000)),
109+
df[["var2", "var3", "var4"]],
110+
)
111+
assert_frame_equal(
112+
df.genomics.in_regions([Region("chr1", 900, 1000), Region("chr2", 900, 1000)]),
113+
df[["var1", "var3"]],
114+
)

tests/io/test_bed.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from pathlib import Path
2+
3+
from pandas_genomics import io
4+
from pandas_genomics.scalars import Region
5+
6+
DATA_DIR = Path(__file__).parent.parent / "data" / "bed"
7+
8+
9+
def test_bed():
10+
input = DATA_DIR / "bed_test.bed"
11+
result = io.from_bed(input)
12+
assert list(result) == [
13+
Region("chr7", 127471197, 127472364, "Pos1"),
14+
Region("chr7", 127472364, 127473531, "Pos2"),
15+
Region("chr7", 127473531, 127474698, "Pos3"),
16+
]

0 commit comments

Comments
 (0)