|
4 | 4 | import sys
|
5 | 5 | from IMP import ArgumentParser
|
6 | 6 | from IMP.emseqfinder.compute_dynamic_threshold import compute_threshold
|
| 7 | +from IMP.emseqfinder.calculate_seq_match_batch import calculate_seq_match |
7 | 8 |
|
8 | 9 |
|
9 | 10 | __doc__ = "Perform all steps of the emseqfinder protocol."
|
10 | 11 |
|
11 | 12 |
|
12 |
| -def process_pdb(pdbfile): |
| 13 | +def process_pdb(pdbfile, resolution, final_output_file): |
13 | 14 | base = pathlib.Path(pdbfile.stem)
|
14 | 15 | mapfile = pathlib.Path('cryoem_maps') / base.with_suffix('.map')
|
15 | 16 | fastafile = pathlib.Path('fasta_files') / base.with_suffix('.fasta')
|
@@ -69,31 +70,73 @@ def process_pdb(pdbfile):
|
69 | 70 | print(f"[ERROR] Normalization failed. Skipping {base}.")
|
70 | 71 | return
|
71 | 72 |
|
| 73 | + # Remove old database files if exist |
| 74 | + for p in (f"{base}_ML_side.dat", f"{base}_ML_side.pkl", |
| 75 | + f"{base}_ML_side_ML_prob.dat", f"{base}_ML_output.txt"): |
| 76 | + pathlib.Path(p).unlink(missing_ok=True) |
| 77 | + |
| 78 | + # Create database |
| 79 | + p = subprocess.run( |
| 80 | + [sys.executable, '-m', |
| 81 | + 'IMP.emseqfinder.mldb.get_database_for_one_emdb_using_parts', base, |
| 82 | + f"{base}_ML_side.dat", str(resolution)]) |
| 83 | + if p.returncode != 0: |
| 84 | + print(f"[ERROR] ML DB generation failed. Skipping {base}.") |
| 85 | + return |
| 86 | + |
| 87 | + # Convert to pickle |
| 88 | + p = subprocess.run( |
| 89 | + [sys.executable, '-m', 'IMP.emseqfinder.convert_MLDB_topkl', |
| 90 | + f"{base}_ML_side.dat", f"{base}_ML_side"]) |
| 91 | + if p.returncode != 0: |
| 92 | + print(f"[ERROR] PKL conversion failed. Skipping {base}.") |
| 93 | + return |
| 94 | + |
| 95 | + # Prediction |
| 96 | + p = subprocess.run( |
| 97 | + [sys.executable, '-m', 'IMP.emseqfinder.final_ML_predict', |
| 98 | + f"{base}_ML_side.pkl", '10000']) |
| 99 | + if p.returncode != 0: |
| 100 | + print(f"[ERROR] Prediction failed. Skipping {base}.") |
| 101 | + return |
| 102 | + |
| 103 | + # Evaluate prediction |
| 104 | + p = subprocess.run( |
| 105 | + [sys.executable, '-m', 'IMP.emseqfinder.evaluate_output_database', |
| 106 | + f"{base}_ML_side_ML_prob.dat", f"{base}_ML_output.txt"]) |
| 107 | + if p.returncode != 0: |
| 108 | + print(f"[ERROR] Evaluation failed. Skipping {base}.") |
| 109 | + return |
| 110 | + |
| 111 | + # Calculate and append sequence match |
| 112 | + calculate_seq_match([f"{base}_ML_output.txt"], final_output_file) |
| 113 | + |
| 114 | + # Cleanup |
| 115 | + shutil.rmtree(frag_dir, ignore_errors=True) |
| 116 | + |
| 117 | + print(f"[INFO] Finished processing {base} ✅") |
| 118 | + print("===============================================") |
| 119 | + print("") |
| 120 | + |
72 | 121 |
|
73 | 122 | def parse_args():
|
74 | 123 | parser = ArgumentParser(
|
75 | 124 | description="Perform all steps of the emseqfinder protocol")
|
| 125 | + parser.add_argument( |
| 126 | + "--db-resolution", dest="resolution", type=float, |
| 127 | + help="Resolution used for database generation", default=4.0) |
| 128 | + |
76 | 129 | return parser.parse_args()
|
77 | 130 |
|
78 | 131 |
|
79 | 132 | def main():
|
80 | 133 | args = parse_args()
|
81 | 134 |
|
82 |
| - # Resolution used for database generation |
83 |
| - resolution = 4 |
84 |
| - |
85 | 135 | # Output file for overall results
|
86 | 136 | final_output_file = pathlib.Path("batch_matching_results.txt")
|
87 | 137 |
|
88 |
| - # Add header if file doesn’t exist |
89 |
| - if not final_output_file.exists(): |
90 |
| - with open(final_output_file, 'w') as fh: |
91 |
| - print("Result_File Total_Percentage_Matched " |
92 |
| - "Total_Abs_Percentage_Matched", file=fh) |
93 |
| - |
94 |
| - # Loop through all PDB files |
95 | 138 | for pdbfile in pathlib.Path("pdb_files").glob("*.pdb"):
|
96 |
| - process_pdb(pdbfile) |
| 139 | + process_pdb(pdbfile, args.resolution, final_output_file) |
97 | 140 |
|
98 | 141 |
|
99 | 142 | if __name__ == '__main__':
|
|
0 commit comments