Skip to content

Commit 7ccc076

Browse files
Merge pull request #34 from hgb-bin-proteomics/develop
[Post Processing] Add sequence coverage to output
2 parents 2024953 + 5dcf8dc commit 7ccc076

File tree

2 files changed

+120
-7
lines changed

2 files changed

+120
-7
lines changed

POSTPROCESSING.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,15 @@ The following additional columns are annotated:
7070
- `PP.IsDecoyA`: If peptide A is a decoy hit, may be `True` or `False`
7171
- `PP.IsDecoyB`: If peptide B is a decoy hit, may be `True` or `False`
7272
- `PP.SourceScanID`: The corresponding scan number in the spectral library
73+
- `PP.SequenceCoverageNTermAlpha`: Sequence coverage of the alpha peptide covered by n-terminal ions (range: 0-1)
74+
- `PP.SequenceCoverageNTermBeta`: Sequence coverage of the beta peptide covered by n-terminal ions (range: 0-1)
75+
- `PP.SequenceCoverageNTermFull`: Sequence coverage of the full crosslink covered by n-terminal ions (range: 0-1)
76+
- `PP.SequenceCoverageCTermAlpha`: Sequence coverage of the alpha peptide covered by c-terminal ions (range: 0-1)
77+
- `PP.SequenceCoverageCTermBeta`: Sequence coverage of the beta peptide covered by c-terminal ions (range: 0-1)
78+
- `PP.SequenceCoverageCTermFull`: Sequence coverage of the full crosslink covered by c-terminal ions (range: 0-1)
79+
- `PP.SequenceCoverageAlpha`: Sequence coverage of the alpha peptide covered by all ions (range: 0-1)
80+
- `PP.SequenceCoverageBeta`: Sequence coverage of the beta peptide covered by all ions (range: 0-1)
81+
- `PP.SequenceCoverageFull`: Sequence coverage of the full crosslink covered by all ions (range: 0-1)
7382
- `PP.PseudoScanNumber`: An iterative number that acts as an ID to create pseudo CSMs
7483
- `PP.Crosslinker`: Name of the crosslinker
7584
- `PP.CrosslinkerMass`: Delta mass of the crosslinker

post_process.py

Lines changed: 111 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88

99
# version tracking
10-
__version = "1.1.5"
11-
__date = "2025-03-26"
10+
__version = "1.2.1"
11+
__date = "2025-07-23"
1212

1313
# PARAMETERS
1414

@@ -95,24 +95,31 @@ def read_spectral_library(filename: str) -> Dict[str, Dict[str, Any]]:
9595
def generate_fragment_index(spectronaut: pd.DataFrame, index: dict) -> Dict[str, Dict[str, Any]]:
9696
# dict keeping track of annotated fragments for every unique crosslink id
9797
fragment_annotation = dict()
98+
# nr of spectronaut ions that could not be matched
99+
nr_unmatched = 0
98100
for i, row in tqdm(spectronaut.iterrows(), total = spectronaut.shape[0], desc = "Generating fragment ion index..."):
99101
# unique crosslink id
100102
key = get_key_spectronaut(row)
103+
# if fragment annotation struct does not have entry for crosslink id, create one that is empty
104+
if key not in fragment_annotation:
105+
fragment_annotation[key] = {"matched_number_ions_a": 0,
106+
"matched_number_ions_b": 0,
107+
"fragments": list(),
108+
"ion_types": set()}
101109
# current fragment ion from spectronaut row
102110
ion = float(row[SPECTRONAUT_FRAGMENT_MZ_COLUMN_NAME])
103111
# all spectral library ions for the crosslink id
104112
# this is a dict that matches ion mass -> list(fragments_with_that_mass[pd.Series])
105113
ions = index[key]["ions"]
114+
# has ion been matched?
115+
matched = False
106116
# for every ion mass of the crosslink id in the spec lib, do:
107117
for current_ion_mz in ions.keys():
108118
fragment_key_part = get_fragment_key(current_ion_mz)
109119
# check if spectronaut fragment is within tolerance bounds of spec lib ion
110120
if round(current_ion_mz, 4) > round(ion - SPECTRONAUT_MATCH_TOLERANCE, 4) and round(current_ion_mz, 4) < round(ion + SPECTRONAUT_MATCH_TOLERANCE, 4):
111-
# if fragment annotation struct does not have entry for crosslink id, create one that is empty
112-
if key not in fragment_annotation:
113-
fragment_annotation[key] = {"matched_number_ions_a": 0,
114-
"matched_number_ions_b": 0,
115-
"fragments": []}
121+
# ion has been matched
122+
matched = True
116123
# for all ions that match mass, do:
117124
for current_ion in ions[current_ion_mz]:
118125
# if ion of peptide a
@@ -123,15 +130,24 @@ def generate_fragment_index(spectronaut: pd.DataFrame, index: dict) -> Dict[str,
123130
if fragment_key not in fragment_annotation[key]["fragments"]:
124131
fragment_annotation[key]["matched_number_ions_a"] += 1
125132
fragment_annotation[key]["fragments"].append(fragment_key)
133+
fragment_annotation[key]["ion_types"].add(
134+
f"{current_ion['FragmentType']};{current_ion['FragmentNumber']};0"
135+
)
126136
else:
127137
# same for peptide b
128138
fragment_key = fragment_key_part + "_1"
129139
if fragment_key not in fragment_annotation[key]["fragments"]:
130140
fragment_annotation[key]["matched_number_ions_b"] += 1
131141
fragment_annotation[key]["fragments"].append(fragment_key)
142+
fragment_annotation[key]["ion_types"].add(
143+
f"{current_ion['FragmentType']};{current_ion['FragmentNumber']};1"
144+
)
132145
# we do not break here, because we want to check the rest of the spec lib ions in case the spectronaut ion matches
133146
# for both peptide a and peptide b
147+
if not matched:
148+
nr_unmatched += 1
134149

150+
print(f"[Fragment index] Nr. of unmatched ions: {nr_unmatched} <-> {(nr_unmatched / spectronaut.shape[0]) * 100} %")
135151
return fragment_annotation
136152

137153
def annotate_spectronaut_result(filename: str) -> pd.DataFrame:
@@ -372,6 +388,94 @@ def annotate_SourceScanID(row: pd.Series, index: dict) -> int:
372388
tqdm.pandas(desc = "Annotating spectral library source scan ID...")
373389
spectronaut["PP.SourceScanID"] = spectronaut.progress_apply(lambda row: annotate_SourceScanID(row, index), axis = 1)
374390

391+
def annotate_SequenceCoverageNTerm(row: pd.Series, fragment_annotation: dict, alpha: bool) -> float:
392+
key = get_key_spectronaut(row)
393+
ion_types = fragment_annotation[key]["ion_types"]
394+
peptide = str(row["PP.PeptideA"]).strip() if alpha else str(row["PP.PeptideB"]).strip()
395+
pep_id_lookup = 0 if alpha else 1
396+
ions = list()
397+
for ion in ion_types:
398+
pep_id = int(ion.split(";")[2])
399+
ion_type = str(ion.split(";")[0]).strip()
400+
ion_number = int(ion.split(";")[1])
401+
if len(ion_type) != 1:
402+
raise RuntimeError(f"Could not parse ion type from ion {ion}!")
403+
if pep_id == pep_id_lookup and ion_type in ["a", "b", "c"]:
404+
ions.append((ion_type, ion_number))
405+
return len(ions) / len(peptide)
406+
407+
tqdm.pandas(desc = "Annotating n-terminal sequence coverage for alpha peptide...")
408+
spectronaut["PP.SequenceCoverageNTermAlpha"] = spectronaut.progress_apply(lambda row: annotate_SequenceCoverageNTerm(row, fragment_annotation, True), axis = 1)
409+
410+
tqdm.pandas(desc = "Annotating n-terminal sequence coverage for beta peptide...")
411+
spectronaut["PP.SequenceCoverageNTermBeta"] = spectronaut.progress_apply(lambda row: annotate_SequenceCoverageNTerm(row, fragment_annotation, False), axis = 1)
412+
413+
tqdm.pandas(desc = "Annotating n-terminal sequence coverage for full crosslink...")
414+
spectronaut["PP.SequenceCoverageNTermFull"] = spectronaut.progress_apply(lambda row: (float(row["PP.SequenceCoverageNTermAlpha"]) + float(row["PP.SequenceCoverageNTermBeta"])) / 2.0, axis = 1)
415+
416+
def annotate_SequenceCoverageCTerm(row: pd.Series, fragment_annotation: dict, alpha: bool) -> float:
417+
key = get_key_spectronaut(row)
418+
ion_types = fragment_annotation[key]["ion_types"]
419+
peptide = str(row["PP.PeptideA"]).strip() if alpha else str(row["PP.PeptideB"]).strip()
420+
pep_id_lookup = 0 if alpha else 1
421+
ions = list()
422+
for ion in ion_types:
423+
pep_id = int(ion.split(";")[2])
424+
ion_type = str(ion.split(";")[0]).strip()
425+
ion_number = int(ion.split(";")[1])
426+
if len(ion_type) != 1:
427+
raise RuntimeError(f"Could not parse ion type from ion {ion}!")
428+
if pep_id == pep_id_lookup and ion_type in ["x", "y", "z"]:
429+
ions.append((ion_type, ion_number))
430+
return len(ions) / len(peptide)
431+
432+
tqdm.pandas(desc = "Annotating c-terminal sequence coverage for alpha peptide...")
433+
spectronaut["PP.SequenceCoverageCTermAlpha"] = spectronaut.progress_apply(lambda row: annotate_SequenceCoverageCTerm(row, fragment_annotation, True), axis = 1)
434+
435+
tqdm.pandas(desc = "Annotating c-terminal sequence coverage for beta peptide...")
436+
spectronaut["PP.SequenceCoverageCTermBeta"] = spectronaut.progress_apply(lambda row: annotate_SequenceCoverageCTerm(row, fragment_annotation, False), axis = 1)
437+
438+
tqdm.pandas(desc = "Annotating c-terminal sequence coverage for full crosslink...")
439+
spectronaut["PP.SequenceCoverageCTermFull"] = spectronaut.progress_apply(lambda row: (float(row["PP.SequenceCoverageCTermAlpha"]) + float(row["PP.SequenceCoverageCTermBeta"])) / 2.0, axis = 1)
440+
441+
def annotate_SequenceCoverage(row: pd.Series, fragment_annotation: dict, alpha: bool) -> float:
442+
#
443+
# 1 2 3 4 5 6 7
444+
# P E P T I D E
445+
# b1 b2 b3 b4 b5 b6
446+
# y6 y5 y4 y3 y2 y1
447+
#
448+
# b[x] = y[len+1-x]
449+
#
450+
key = get_key_spectronaut(row)
451+
ion_types = fragment_annotation[key]["ion_types"]
452+
peptide = str(row["PP.PeptideA"]).strip() if alpha else str(row["PP.PeptideB"]).strip()
453+
pep_id_lookup = 0 if alpha else 1
454+
unique_seq_positions = set()
455+
for ion in ion_types:
456+
pep_id = int(ion.split(";")[2])
457+
ion_type = str(ion.split(";")[0]).strip()
458+
ion_number = int(ion.split(";")[1])
459+
if len(ion_type) != 1:
460+
raise RuntimeError(f"Could not parse ion type from ion {ion}!")
461+
if pep_id == pep_id_lookup:
462+
if ion_type in ["a", "b", "c"]:
463+
unique_seq_positions.add(ion_number)
464+
elif ion_type in ["x", "y", "z"]:
465+
unique_seq_positions.add(len(peptide) + 1 - ion_number)
466+
else:
467+
raise RuntimeError(f"Found not-suppored ion type: {ion_type}")
468+
return len(unique_seq_positions) / len(peptide)
469+
470+
tqdm.pandas(desc = "Annotating sequence coverage for alpha peptide...")
471+
spectronaut["PP.SequenceCoverageAlpha"] = spectronaut.progress_apply(lambda row: annotate_SequenceCoverage(row, fragment_annotation, True), axis = 1)
472+
473+
tqdm.pandas(desc = "Annotating sequence coverage for beta peptide...")
474+
spectronaut["PP.SequenceCoverageBeta"] = spectronaut.progress_apply(lambda row: annotate_SequenceCoverage(row, fragment_annotation, False), axis = 1)
475+
476+
tqdm.pandas(desc = "Annotating sequence coverage for full crosslink...")
477+
spectronaut["PP.SequenceCoverageFull"] = spectronaut.progress_apply(lambda row: (float(row["PP.SequenceCoverageAlpha"]) + float(row["PP.SequenceCoverageBeta"])) / 2.0, axis = 1)
478+
375479
spectronaut["PP.PseudoScanNumber"] = pd.Series(range(spectronaut.shape[0]))
376480
spectronaut["PP.Crosslinker"] = pd.Series([CROSSLINKER for i in range(spectronaut.shape[0])])
377481
spectronaut["PP.CrosslinkerMass"] = pd.Series([CROSSLINKER_MASS for i in range(spectronaut.shape[0])])

0 commit comments

Comments
 (0)