Merge pull request #123 from BioPandas/mmtf-gzip

a-r-j · web-flow · commit 1ff101cb393e · 2023-04-17T03:56:14.000+01:00
Add MMTF gzip reading support
diff --git a/biopandas/mmtf/pandas_mmtf.py b/biopandas/mmtf/pandas_mmtf.py
@@ -11,7 +11,7 @@
 import numpy as np
 import pandas as pd
 from looseversion import LooseVersion
-from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse
+from mmtf import MMTFDecoder, MMTFEncoder, fetch, parse, parse_gzip
 
 from biopandas.constants import protein_letters_3to1_extended
 
@@ -45,7 +45,10 @@ def df(self, value: Any):
         # self._df = value
 
     def read_mmtf(self, filename: str):
-        self.mmtf = parse(filename)
+        if filename.endswith(".gz"):
+            self.mmtf = parse_gzip(filename)
+        else:
+            self.mmtf = parse(filename)
         self.mmtf_path = filename
         df = self._mmtf_to_df(self.mmtf)
         self._df["ATOM"] = df.loc[df.record_name == "ATOM"]
@@ -496,7 +499,7 @@ def parse_mmtf(file_path: str) -> pd.DataFrame:
     :return: Dataframe of protein structure.
     :rtype: pd.DataFrame
     """
-    df = parse(file_path)
+    df = parse_gzip(file_path) if file_path.endswith(".gz") else parse(file_path)
     return mmtf_to_df(df)
 
 
@@ -530,16 +533,28 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
     model_indices = mmtf_obj.chains_per_model
     model_indices = [sum(model_indices[:i+1]) for i in range(len(model_indices))]
     ch_idx = 0
+
+    entity_types = {}
+    for i in mmtf_obj.entity_list:
+        for chain in i["chainIndexList"]:
+            entity_types[chain] = i["type"]
+
     for idx, i in enumerate(mmtf_obj.group_type_list):
         res = mmtf_obj.group_list[i]
-        record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
+        #record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
+        #record = (
+        #    "ATOM"
+        #    if res["chemCompType"] in ["L-PEPTIDE LINKING", "PEPTIDE LINKING"]
+        #    else "HETATM"
+        #)
         if idx == chain_indices[ch_idx]:
             ch_idx += 1
+        record = "ATOM" if entity_types[ch_idx] == "polymer" else "HETATM"
 
         for _ in res["atomNameList"]:
             data["residue_name"].append(res["groupName"])
             data["residue_number"].append(mmtf_obj.group_id_list[idx])
-            data["chain_id"].append(mmtf_obj.chain_name_list[ch_idx])
+            data["chain_id"].append([mmtf_obj.chain_name_list[ch_idx]])
             data["model_id"].append(int(np.argwhere(np.array(model_indices)>ch_idx)[0]) + 1)
             data["record_name"].append(record)
             data["insertion"].append(mmtf_obj.ins_code_list[idx])
@@ -565,9 +580,13 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
             continue
         data[k] = [i for sublist in v for i in sublist]
 
-    return pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"])
+    df = pd.DataFrame.from_dict(data).sort_values(by=["model_id", "atom_number"])
+    df.alt_loc = df.alt_loc.str.replace("\x00", "")
+    df.insertion = df.insertion.str.replace("\x00", "")
+    return df
 
 def _seq1(seq, charmap: Dict[str, str], undef_code="X"):
+    # sourcery skip: dict-assign-update-to-union
     """Convert protein sequence from three-letter to one-letter code.
     The single required input argument 'seq' should be a protein sequence
     using three-letter codes, either as a Python string or as a Seq or
@@ -650,7 +669,6 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
 
     node_ids = df.model_id.astype(str) + ":" + df.chain_id + ":" + df.residue_name + ":" + df.residue_number.astype(str) + ":" + df.insertion.astype(str)
     df["residue_id"] = node_ids
-
     # Tracks values to replace them at the end
     chains_per_model = []
     groups_per_chain = []
@@ -750,7 +768,7 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
                     encoder.set_atom_info(
                         atom_name=row.atom_name,
                         serial_number=row.atom_number,
-                        alternative_location_id="\x00" if row.alt_loc == " " else row.alt_loc,
+                        alternative_location_id="\x00" if row.alt_loc == "" else row.alt_loc,
                         x=row.x_coord,
                         y=row.y_coord,
                         z=row.z_coord,
diff --git a/biopandas/mmtf/tests/test_read_mmtf.py b/biopandas/mmtf/tests/test_read_mmtf.py
@@ -0,0 +1,102 @@
+# BioPandas
+# Author: Sebastian Raschka <mail@sebastianraschka.com>
+# License: BSD 3 clause
+# Project Website: http://rasbt.github.io/biopandas/
+# Code Repository: https://github.com/rasbt/biopandas
+
+
+import os
+from urllib.error import HTTPError, URLError
+from urllib.request import urlopen
+
+import numpy as np
+import pandas as pd
+from nose.tools import raises
+
+from biopandas.mmtf import PandasMmtf
+from biopandas.pdb import PandasPdb
+from biopandas.testutils import assert_raises
+
+MMTF_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf")
+MMTF_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "data", "3eiy.mmtf.gz")
+
+PDB_TESTDATA_FILENAME = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb")
+PDB_TESTDATA_FILENAME_GZ = os.path.join(os.path.dirname(__file__), "..", "..", "pdb", "tests", "data", "3eiy.pdb.gz")
+
+
+ATOM_DF_COLUMNS = [
+    "record_name",
+    "atom_number",
+    "atom_name",
+    #"alt_loc",
+    "residue_name",
+    "chain_id",
+    "residue_number",
+    #"insertion",
+    "x_coord",
+    "y_coord",
+    "z_coord",
+    "occupancy",
+    "b_factor",
+    "element_symbol",
+    #"charge",
+]
+
+def test_fetch_pdb():
+    """Test fetch_pdb"""
+    ppdb = PandasMmtf()
+    ppdb.fetch_mmtf("3eiy")
+    assert max(ppdb.df["ATOM"].residue_number) == 175
+
+
+def test__read_mmtf():
+    """Test public _read_pdb with gzip files"""
+    pmmtf = PandasMmtf()
+    ppdb = PandasPdb()
+    pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME)
+
+    ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME)
+
+    pd.testing.assert_frame_equal(
+        pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+        ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+        )
+
+    ATOM_DF_COLUMNS.remove("atom_number")
+    ATOM_DF_COLUMNS.remove("element_symbol")
+    pd.testing.assert_frame_equal(
+        pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+        ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+        )
+
+
+def test__read_mmtf_gz():
+    """Test public _read_pdb with gzip files"""
+    pmmtf = PandasMmtf()
+    ppdb = PandasPdb()
+    pmmtf.read_mmtf(MMTF_TESTDATA_FILENAME_GZ)
+    ppdb = ppdb.read_pdb(PDB_TESTDATA_FILENAME_GZ)
+
+
+    pmmtf.df["ATOM"].alt_loc.replace('\x00', "", inplace=True)
+    pmmtf.df["HETATM"].alt_loc.replace('\x00', "", inplace=True)
+
+    pd.testing.assert_frame_equal(
+        pmmtf.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+        ppdb.df["ATOM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+        )
+    #pd.testing.assert_frame_equal(
+    #    pmmtf.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+    #    ppdb.df["HETATM"][ATOM_DF_COLUMNS].reset_index(drop=True),
+    #    )
+
+
+def test_read_mmtf():
+    """Test public read_pdb"""
+    ppdb = PandasMmtf()
+    ppdb.read_mmtf(MMTF_TESTDATA_FILENAME)
+    assert ppdb.mmtf_path == MMTF_TESTDATA_FILENAME
+
+
+
+
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -4,7 +4,15 @@ The CHANGELOG for the current development version is available at
 [https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md](https://github.com/rasbt/biopandas/blob/main/docs/sources/CHANGELOG.md).
 
 
-### 0.5.0dev (UNRELEASED)
+### 0.5.0dev1 (11/4/2023)
+
+- Adds supprt for reading Gzipped MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
+- Improves reliability of parsing polymer/non-polymer entities in MMTF parsing. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
+- Improves reliability of parsing multicharacter chain IDs from MMTF files. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
+- Replaces null terminator chars in parsed MMTF dataframe with the empty string. (Via [Arian Jamasb](https://github.com/a-r-j), PR #[123](https://github.com/rasbt/biopandas/pull/123/files))
+
+
+### 0.5.0dev0 (3/4/2023)
 
 ##### Downloads