11
11
import numpy as np
12
12
import pandas as pd
13
13
from looseversion import LooseVersion
14
- from mmtf import MMTFDecoder , MMTFEncoder , fetch , parse
14
+ from mmtf import MMTFDecoder , MMTFEncoder , fetch , parse , parse_gzip
15
15
16
16
from biopandas .constants import protein_letters_3to1_extended
17
17
@@ -45,7 +45,10 @@ def df(self, value: Any):
45
45
# self._df = value
46
46
47
47
def read_mmtf (self , filename : str ):
48
- self .mmtf = parse (filename )
48
+ if filename .endswith (".gz" ):
49
+ self .mmtf = parse_gzip (filename )
50
+ else :
51
+ self .mmtf = parse (filename )
49
52
self .mmtf_path = filename
50
53
df = self ._mmtf_to_df (self .mmtf )
51
54
self ._df ["ATOM" ] = df .loc [df .record_name == "ATOM" ]
@@ -496,7 +499,7 @@ def parse_mmtf(file_path: str) -> pd.DataFrame:
496
499
:return: Dataframe of protein structure.
497
500
:rtype: pd.DataFrame
498
501
"""
499
- df = parse (file_path )
502
+ df = parse_gzip ( file_path ) if file_path . endswith ( ".gz" ) else parse (file_path )
500
503
return mmtf_to_df (df )
501
504
502
505
@@ -530,16 +533,28 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
530
533
model_indices = mmtf_obj .chains_per_model
531
534
model_indices = [sum (model_indices [:i + 1 ]) for i in range (len (model_indices ))]
532
535
ch_idx = 0
536
+
537
+ entity_types = {}
538
+ for i in mmtf_obj .entity_list :
539
+ for chain in i ["chainIndexList" ]:
540
+ entity_types [chain ] = i ["type" ]
541
+
533
542
for idx , i in enumerate (mmtf_obj .group_type_list ):
534
543
res = mmtf_obj .group_list [i ]
535
- record = "HETATM" if res ["chemCompType" ] == "NON-POLYMER" else "ATOM"
544
+ #record = "HETATM" if res["chemCompType"] == "NON-POLYMER" else "ATOM"
545
+ #record = (
546
+ # "ATOM"
547
+ # if res["chemCompType"] in ["L-PEPTIDE LINKING", "PEPTIDE LINKING"]
548
+ # else "HETATM"
549
+ #)
536
550
if idx == chain_indices [ch_idx ]:
537
551
ch_idx += 1
552
+ record = "ATOM" if entity_types [ch_idx ] == "polymer" else "HETATM"
538
553
539
554
for _ in res ["atomNameList" ]:
540
555
data ["residue_name" ].append (res ["groupName" ])
541
556
data ["residue_number" ].append (mmtf_obj .group_id_list [idx ])
542
- data ["chain_id" ].append (mmtf_obj .chain_name_list [ch_idx ])
557
+ data ["chain_id" ].append ([ mmtf_obj .chain_name_list [ch_idx ] ])
543
558
data ["model_id" ].append (int (np .argwhere (np .array (model_indices )> ch_idx )[0 ]) + 1 )
544
559
data ["record_name" ].append (record )
545
560
data ["insertion" ].append (mmtf_obj .ins_code_list [idx ])
@@ -565,9 +580,13 @@ def mmtf_to_df(mmtf_obj: MMTFDecoder) -> pd.DataFrame:
565
580
continue
566
581
data [k ] = [i for sublist in v for i in sublist ]
567
582
568
- return pd .DataFrame .from_dict (data ).sort_values (by = ["model_id" , "atom_number" ])
583
+ df = pd .DataFrame .from_dict (data ).sort_values (by = ["model_id" , "atom_number" ])
584
+ df .alt_loc = df .alt_loc .str .replace ("\x00 " , "" )
585
+ df .insertion = df .insertion .str .replace ("\x00 " , "" )
586
+ return df
569
587
570
588
def _seq1 (seq , charmap : Dict [str , str ], undef_code = "X" ):
589
+ # sourcery skip: dict-assign-update-to-union
571
590
"""Convert protein sequence from three-letter to one-letter code.
572
591
The single required input argument 'seq' should be a protein sequence
573
592
using three-letter codes, either as a Python string or as a Seq or
@@ -650,7 +669,6 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
650
669
651
670
node_ids = df .model_id .astype (str ) + ":" + df .chain_id + ":" + df .residue_name + ":" + df .residue_number .astype (str ) + ":" + df .insertion .astype (str )
652
671
df ["residue_id" ] = node_ids
653
-
654
672
# Tracks values to replace them at the end
655
673
chains_per_model = []
656
674
groups_per_chain = []
@@ -750,7 +768,7 @@ def write_mmtf(df: pd.DataFrame, file_path: str):
750
768
encoder .set_atom_info (
751
769
atom_name = row .atom_name ,
752
770
serial_number = row .atom_number ,
753
- alternative_location_id = "\x00 " if row .alt_loc == " " else row .alt_loc ,
771
+ alternative_location_id = "\x00 " if row .alt_loc == "" else row .alt_loc ,
754
772
x = row .x_coord ,
755
773
y = row .y_coord ,
756
774
z = row .z_coord ,
0 commit comments