Skip to content

Commit 1ff101c

Browse files
authored
Merge pull request #123 from BioPandas/mmtf-gzip
Add MMTF gzip reading support
2 parents 16172f0 + c554a46 commit 1ff101c

File tree

3 files changed

+137
-9
lines changed

3 files changed

+137
-9
lines changed

biopandas/mmtf/pandas_mmtf.py

Lines changed: 26 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import numpy as np
1212
import pandas as pd
1313
from looseversion import LooseVersion
14-
from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse
14+
from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse, parse_gzip
1515

1616
from biopandas.constants import protein_letters_3to1_extended
1717

@@ -45,7 +45,10 @@ def df(self, value: Any):
4545
# self._df = value
4646

4747
def read_mmtf(self, filename: str):
48-
self.mmtf = parse(filename)
48+
if filename.endswith(".gz"):
49+
self.mmtf = parse_gzip(filename)
50+
else:
51+
self.mmtf = parse(filename)
4952
self.mmtf_path = filename
5053
df = self._mmtf_to_df(self.mmtf)
5154
self._df["ATOM"] = df.loc[df.record_name == "ATOM"]
@@ -496,7 +499,7 @@ def parse_mmtf(file_path: str) -> pd.DataFrame:
496499
:return: Dataframe of protein structure.
497500
:rtype: pd.DataFrame
498501
"""
499-
df = parse(file_path)
502+
df = parse_gzip(file_path) if file_path.endswith(".gz") else parse(file_path)
500503
return mmtf_to_df(df)
501504

502505

@@ -530,16 +533,28 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
530533
model_indices = mmtf_obj.chains_per_model
531534
model_indices = [sum(model_indices[:i+1]) for i in range(len(model_indices))]
532535
ch_idx = 0
536+
537+
entity_types = {}
538+
for i in mmtf_obj.entity_list:
539+
for chain in i["chainIndexList"]:
540+
entity_types[chain] = i["type"]
541+
533542
for idx, i in enumerate(mmtf_obj.group_type_list):
534543
res = mmtf_obj.group_list[i]
535-
record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
544+
#record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
545+
#record = (
546+
# "ATOM"
547+
# if res["chemCompType"] in ["L-PEPTIDE LINKING", "PEPTIDE LINKING"]
548+
# else "HETATM"
549+
#)
536550
if idx == chain_indices[ch_idx]:
537551
ch_idx += 1
552+
record = "ATOM" if entity_types[ch_idx] == "polymer" else "HETATM"
538553

539554
for _ in res["atomNameList"]:
540555
data["residue_name"].append(res["groupName"])
541556
data["residue_number"].append(mmtf_obj.group_id_list[idx])
542-
data["chain_id"].append(mmtf_obj.chain_name_list[ch_idx])
557+
data["chain_id"].append([mmtf_obj.chain_name_list[ch_idx]])
543558
data["model_id"].append(int(np.argwhere(np.array(model_indices)>ch_idx)[0]) + 1)
544559
data["record_name"].append(record)
545560
data["insertion"].append(mmtf_obj.ins_code_list[idx])
@@ -565,9 +580,13 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
565580
continue
566581
data[k] = [i for sublist in v for i in sublist]
567582

568-
return pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"])
583+
df = pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"])
584+
df.alt_loc = df.alt_loc.str.replace("\x00", "")
585+
df.insertion = df.insertion.str.replace("\x00", "")
586+
return df
569587

570588
def _seq1(seq, charmap: Dict[str, str], undef_code="X"):
589+
# sourcery skip: dict-assign-update-to-union
571590
"""Convert protein sequence from three-letter to one-letter code.
572591
The single required input argument 'seq' should be a protein sequence
573592
using three-letter codes, either as a Python string or as a Seq or
@@ -650,7 +669,6 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
650669

651670
node_ids = df.model_id.astype(str) + ":" + df.chain_id + ":" + df.residue_name + ":" + df.residue_number.astype(str) + ":" + df.insertion.astype(str)
652671
df["residue_id"] = node_ids
653-
654672
# Tracks values to replace them at the end
655673
chains_per_model = []
656674
groups_per_chain = []
@@ -750,7 +768,7 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
750768
encoder.set_atom_info(
751769
atom_name=row.atom_name,
752770
serial_number=row.atom_number,
753-
alternative_location_id="\x00" if row.alt_loc == " " else row.alt_loc,
771+
alternative_location_id="\x00" if row.alt_loc == "" else row.alt_loc,
754772
x=row.x_coord,
755773
y=row.y_coord,
756774
z=row.z_coord,
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# BioPandas
2+
# Author: Sebastian Raschka <mail@sebastianraschka.com>
3+
# License: BSD 3 clause
4+
# Project Website: http://rasbt.github.io/biopandas/
5+
# Code Repository: https://github.com/rasbt/biopandas
6+
7+
8+
import os
9+
from urllib.error import HTTPError, URLError
10+
from urllib.request import urlopen
11+
12+
import numpy as np
13+
import pandas as pd
14+
from nose.tools import raises
15+
16+
from biopandas.mmtf import PandasMmtf
17+
from biopandas.pdb import PandasPdb
18+
from biopandas.testutils import assert_raises
19+
20+
MMTF_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf")
21+
MMTF_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf.gz")
22+
23+
PDB_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb")
24+
PDB_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb.gz")
25+
26+
27+
ATOM_DF_COLUMNS = [
28+
"record_name",
29+
"atom_number",
30+
"atom_name",
31+
#"alt_loc",
32+
"residue_name",
33+
"chain_id",
34+
"residue_number",
35+
#"insertion",
36+
"x_coord",
37+
"y_coord",
38+
"z_coord",
39+
"occupancy",
40+
"b_factor",
41+
"element_symbol",
42+
#"charge",
43+
]
44+
45+
def test_fetch_pdb():
46+
"""Test fetch_pdb"""
47+
ppdb = PandasMmtf()
48+
ppdb.fetch_mmtf("3eiy")
49+
assert max(ppdb.df["ATOM"].residue_number) == 175
50+
51+
52+
def test__read_mmtf():
53+
"""Test public _read_pdb with gzip files"""
54+
pmmtf = PandasMmtf()
55+
ppdb = PandasPdb()
56+
pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME)
57+
58+
ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME)
59+
60+
pd.testing.assert_frame_equal(
61+
pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
62+
ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
63+
)
64+
65+
ATOM_DF_COLUMNS.remove("atom_number")
66+
ATOM_DF_COLUMNS.remove("element_symbol")
67+
pd.testing.assert_frame_equal(
68+
pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
69+
ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
70+
)
71+
72+
73+
def test__read_mmtf_gz():
74+
"""Test public _read_pdb with gzip files"""
75+
pmmtf = PandasMmtf()
76+
ppdb = PandasPdb()
77+
pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME_GZ)
78+
ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME_GZ)
79+
80+
81+
pmmtf.df["ATOM"].alt_loc.replace('\x00', "", inplace=True)
82+
pmmtf.df["HETATM"].alt_loc.replace('\x00', "", inplace=True)
83+
84+
pd.testing.assert_frame_equal(
85+
pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
86+
ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
87+
)
88+
#pd.testing.assert_frame_equal(
89+
# pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
90+
# ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
91+
# )
92+
93+
94+
def test_read_mmtf():
95+
"""Test public read_pdb"""
96+
ppdb = PandasMmtf()
97+
ppdb.read_mmtf(MMTF_TESTDATA_FILENAME)
98+
assert ppdb.mmtf_path == MMTF_TESTDATA_FILENAME
99+
100+
101+
102+

docs/CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,15 @@ The CHANGELOG for the current development version is available at
44
[https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md](https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md).
55

66

7-
### 0.5.0dev (UNRELEASED)
7+
### 0.5.0dev1 (11/4/2023)
8+
9+
- Adds support for reading gzipped MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
10+
- Improves reliability of parsing polymer/non-polymer entities in MMTF parsing. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
11+
- Improves reliability of parsing multicharacter chain IDs from MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
12+
- Replaces null terminator chars in parsed MMTF dataframe with the empty string. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
13+
14+
15+
### 0.5.0dev0 (3/4/2023)
816

917
##### Downloads
1018

0 commit comments

Comments
 (0)