Merge pull request #37 from hgb-bin-proteomics/develop

michabirklbauer · web-flow · commit 4f8435de5411 · 2025-08-04T15:03:34.000+02:00
Post Process v1.2.4
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -27,6 +27,7 @@ jobs:
     - name: Copy scripts and data to "/tests"
       run: |
         cp create_spectral_library.py tests
+        cp post_process.py tests
         cp config.py tests
         cp data/test_filter.xlsx .
         cp data/test_reverse_mods.xlsx .
diff --git a/POSTPROCESSING.md b/POSTPROCESSING.md
@@ -87,6 +87,17 @@ The following additional columns are annotated:
 - `PP.SequenceCoverageAlpha`: Sequence coverage of the alpha peptide covered by all ions (range: 0-1)
 - `PP.SequenceCoverageBeta`: Sequence coverage of the beta peptide covered by all ions (range: 0-1)
 - `PP.SequenceCoverageFull`: Sequence coverage of the full crosslink covered by all ions (range: 0-1)
+- `PP.UniScoreAlpha`: The [UniScore](https://doi.org/10.1016/j.mcpro.2025.101010) of the alpha peptide
+- `PP.UniScoreBeta`: The [UniScore](https://doi.org/10.1016/j.mcpro.2025.101010) of the beta peptide
+- `PP.UniScoreFull`: The [UniScore](https://doi.org/10.1016/j.mcpro.2025.101010) of the crosslink, which is the minimum of `PP.UniScoreAlpha` and `PP.UniScoreBeta`
+- `PP.PepLenAlpha`: The length of the alpha peptide (number of amino acids)
+- `PP.PepLenBeta`: The length of the beta peptide (number of amino acids)
+- `PP.NumberCrosslinkFragmentsAlpha`: The number of fragment ions that contain a crosslink modification for the alpha peptide
+- `PP.NumberCrosslinkFragmentsBeta`: The number of fragment ions that contain a crosslink modification for the beta peptide
+- `PP.NumberCrosslinkFragmentsFull`: The number of fragment ions that contain a crosslink modification for the full crosslink
+- `PP.NormalizedCrosslinkFragmentsAlpha`: `PP.NumberCrosslinkFragmentsAlpha` normalized by the total ion count of the alpha peptide (`PP.TotalIonsA`)
+- `PP.NormalizedCrosslinkFragmentsBeta`: `PP.NumberCrosslinkFragmentsBeta` normalized by the total ion count of the beta peptide (`PP.TotalIonsB`)
+- `PP.NormalizedCrosslinkFragmentsFull`: `PP.NumberCrosslinkFragmentsFull` normalized by the sum of the alpha and beta total ion counts (`PP.TotalIonsA` + `PP.TotalIonsB`)
 - `PP.PseudoScanNumber`: An iterative number that acts as an ID to create pseudo CSMs
 - `PP.Crosslinker`: Name of the crosslinker
 - `PP.CrosslinkerMass`: Delta mass of the crosslinker
diff --git a/README.md b/README.md
@@ -29,29 +29,6 @@ Starting with version [1.4.4](https://github.com/hgb-bin-proteomics/MSAnnika_Spe
 [xiSearch](https://www.rappsilberlab.org/software/xisearch/) with [xiFDR](https://www.rappsilberlab.org/software/xifdr/). Simply use the validated CSMs file from
 xiFDR (e.g. usually ending with extension `CSM_xiFDR*.*.*.csv` where `*` denotes the xiFDR version) as input for the `CSMS_FILE` parameter in the `config.py` file!
 
-## GUI
-
-![Screenshot](gui/screenshot.png)
-
-> [!Important]
-> **The GUI currently only is supported up to version [1.1.6](https://github.com/hgb-bin-proteomics/MSAnnika_Spectral_Library_exporter/releases/tag/v1.1.6)!**
->
-
-Alternatively to the commandline-based python script, a GUI is also available via [Docker](https://www.docker.com/):
-- After [installing Docker](https://docs.docker.com/engine/install/) [[Quick Guide here](https://github.com/michabirklbauer/PIA/blob/master/DOCKER.md)] run the following command:
-  ```
-  docker run -p 8501:8501 michabirklbauer/spectrallibraryexporter
-  ```
-- Navigate to `localhost:8501` in your browser. You should see the MS Annika Spectral Library exporter GUI!
-
-If you don't have/want to install Docker you can also run the GUI natively using the following commands:
-- Open a terminal inside `MSAnnika_Spectral_Library_exporter`.
-- Enter `cp gui/streamlit_app.py .`.
-- Enter `cp gui/streamlit_util.py .`.
-- Enter `pip install streamlit`.
-- Enter `streamlit run streamlit_app.py --server.maxUploadSize 5000`.
-- Navigate to `localhost:8501` in your browser. You should see the MS Annika Spectral Library exporter GUI!
-
 ## Exporting MS Annika results to Microsoft Excel
 
 The script uses Microsoft Excel files as input; for that, MS Annika results need to be exported from Proteome Discoverer. It is recommended to first filter results according to your needs, e.g. filter for high-confidence CSMs and filter out decoy CSMs as depicted below.
diff --git a/create_spectral_library.py b/create_spectral_library.py
@@ -1,4 +1,14 @@
 #!/usr/bin/env python3
+#
+# /// script
+# requires-python = ">=3.7"
+# dependencies = [
+#   "pandas",
+#   "openpyxl",
+#   "tqdm",
+#   "pyteomics",
+# ]
+# ///
 
 # MS ANNIKA SPECTRAL LIBRARY EXPORTER
 # 2023 (c) Micha Johannes Birklbauer
diff --git a/post_process.py b/post_process.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+#
 # /// script
 # requires-python = ">=3.7"
 # dependencies = [
@@ -14,8 +15,8 @@
 
 
 # version tracking
-__version = "1.2.2"
-__date = "2025-07-29"
+__version = "1.2.6"
+__date = "2025-08-04"
 
 # PARAMETERS
 
@@ -48,6 +49,37 @@ def get_mz_key(mz: float) -> float:
 def get_fragment_key(mz: float) -> str:
     return f"{round(mz, 4):.4f}"
 
+def get_kmers(unique_seq_positions: set) -> list:
+    sorted_pos = sorted(unique_seq_positions)
+    kmers = list()
+    current_kmer = 1
+    for i, pos in enumerate(sorted_pos):
+        if i + 1 < len(unique_seq_positions):
+            if sorted_pos[i + 1] == pos + 1:
+                current_kmer += 1
+            else:
+                if current_kmer > 1:
+                    kmers.append(current_kmer)
+                    current_kmer = 1
+        else:
+            if current_kmer > 1:
+                kmers.append(current_kmer)
+    return kmers
+
+def get_bool_from_value(value) -> bool:
+    if isinstance(value, bool):
+        return value
+    elif isinstance(value, int):
+        if value in [0, 1]:
+            return bool(value)
+        else:
+            raise ValueError(f"Cannot parse bool value from the given input {value}.")
+    elif isinstance(value, str):
+        return "t" in value.lower()
+    else:
+        raise ValueError(f"Cannot parse bool value from the given input {value}.")
+    return False
+
 def get_key_spec_lib(row: pd.Series) -> str:
     # ModifiedPeptide
     # DAKQRIVDK_NGVKM[Oxidation]C[Carbamidomethyl]PR
@@ -111,6 +143,7 @@ def generate_fragment_index(spectronaut: pd.DataFrame, index: dict) -> Dict[str,
             fragment_annotation[key] = {"matched_number_ions_a": 0,
                                         "matched_number_ions_b": 0,
                                         "fragments": list(),
+                                        "fragments_rows": list(),
                                         "ion_types": set()}
         # current fragment ion from spectronaut row
         ion = float(row[SPECTRONAUT_FRAGMENT_MZ_COLUMN_NAME])
@@ -136,6 +169,7 @@ def generate_fragment_index(spectronaut: pd.DataFrame, index: dict) -> Dict[str,
                         if fragment_key not in fragment_annotation[key]["fragments"]:
                             fragment_annotation[key]["matched_number_ions_a"] += 1
                             fragment_annotation[key]["fragments"].append(fragment_key)
+                            fragment_annotation[key]["fragments_rows"].append(current_ion)
                             fragment_annotation[key]["ion_types"].add(
                                 f"{current_ion['FragmentType']};{current_ion['FragmentNumber']};0"
                             )
@@ -145,6 +179,7 @@ def generate_fragment_index(spectronaut: pd.DataFrame, index: dict) -> Dict[str,
                         if fragment_key not in fragment_annotation[key]["fragments"]:
                             fragment_annotation[key]["matched_number_ions_b"] += 1
                             fragment_annotation[key]["fragments"].append(fragment_key)
+                            fragment_annotation[key]["fragments_rows"].append(current_ion)
                             fragment_annotation[key]["ion_types"].add(
                                 f"{current_ion['FragmentType']};{current_ion['FragmentNumber']};1"
                             )
@@ -485,6 +520,74 @@ def annotate_SequenceCoverage(row: pd.Series, fragment_annotation: dict, alpha:
     tqdm.pandas(desc = "Annotating sequence coverage for full crosslink...")
     spectronaut["PP.SequenceCoverageFull"] = spectronaut.progress_apply(lambda row: (float(row["PP.SequenceCoverageAlpha"]) + float(row["PP.SequenceCoverageBeta"])) / 2.0, axis = 1)
 
+    def annotate_UniScore(row: pd.Series, fragment_annotation: dict, alpha: bool) -> float:
+        key = get_key_spectronaut(row)
+        ion_types = fragment_annotation[key]["ion_types"]
+        peptide = str(row["PP.PeptideA"]).strip() if alpha else str(row["PP.PeptideB"]).strip()
+        pep_id_lookup = 0 if alpha else 1
+        nr_of_matched_ions = 0
+        unique_seq_positions = set()
+        for ion in ion_types:
+            pep_id = int(ion.split(";")[2])
+            ion_type = str(ion.split(";")[0]).strip()
+            ion_number = int(ion.split(";")[1])
+            if len(ion_type) != 1:
+                raise RuntimeError(f"Could not parse ion type from ion {ion}!")
+            if pep_id == pep_id_lookup:
+                if ion_type in ["a", "b", "c"]:
+                    unique_seq_positions.add(ion_number)
+                    nr_of_matched_ions += 1
+                elif ion_type in ["x", "y", "z"]:
+                    unique_seq_positions.add(len(peptide) + 1 - ion_number)
+                    nr_of_matched_ions += 1
+                else:
+                    raise RuntimeError(f"Found not-suppored ion type: {ion_type}")
+        kmers = get_kmers(unique_seq_positions)
+        return nr_of_matched_ions + sum(kmers)
+
+    tqdm.pandas(desc = "Annotating UniScore for alpha peptide...")
+    spectronaut["PP.UniScoreAlpha"] = spectronaut.progress_apply(lambda row: annotate_UniScore(row, fragment_annotation, True), axis = 1)
+
+    tqdm.pandas(desc = "Annotating UniScore for beta peptide...")
+    spectronaut["PP.UniScoreBeta"] = spectronaut.progress_apply(lambda row: annotate_UniScore(row, fragment_annotation, False), axis = 1)
+
+    tqdm.pandas(desc = "Annotating UniScore for full crosslinks...")
+    spectronaut["PP.UniScoreFull"] = spectronaut.progress_apply(lambda row: min(float(row["PP.UniScoreAlpha"]), float(row["PP.UniScoreBeta"])), axis = 1)
+
+    tqdm.pandas(desc = "Annotating peptide length for alpha peptide...")
+    spectronaut["PP.PepLenAlpha"] = spectronaut.progress_apply(lambda row: len(str(row["PP.PeptideA"]).strip()), axis = 1)
+
+    tqdm.pandas(desc = "Annotating peptide length for beta peptide...")
+    spectronaut["PP.PepLenBeta"] = spectronaut.progress_apply(lambda row: len(str(row["PP.PeptideB"]).strip()), axis = 1)
+
+    def annotate_CrosslinkFragments(row: pd.Series, fragment_annotation: dict, alpha: bool) -> int:
+        key = get_key_spectronaut(row)
+        ions_as_full_spec_lib_rows = fragment_annotation[key]["fragments_rows"]
+        pep_id_lookup = 0 if alpha else 1
+        nr_of_crosslink_fragments = 0
+        for ion in ions_as_full_spec_lib_rows:
+            if ion["FragmentPepId"] == pep_id_lookup and get_bool_from_value(ion["CLContainingFragment"]):
+                nr_of_crosslink_fragments += 1
+        return nr_of_crosslink_fragments
+
+    tqdm.pandas(desc = "Annotating number of crosslink fragments for alpha peptide...")
+    spectronaut["PP.NumberCrosslinkFragmentsAlpha"] = spectronaut.progress_apply(lambda row: annotate_CrosslinkFragments(row, fragment_annotation, True), axis = 1)
+
+    tqdm.pandas(desc = "Annotating number of crosslink fragments for beta peptide...")
+    spectronaut["PP.NumberCrosslinkFragmentsBeta"] = spectronaut.progress_apply(lambda row: annotate_CrosslinkFragments(row, fragment_annotation, False), axis = 1)
+
+    tqdm.pandas(desc = "Annotating number of crosslink fragments for full crosslinks...")
+    spectronaut["PP.NumberCrosslinkFragmentsFull"] = spectronaut.progress_apply(lambda row: row["PP.NumberCrosslinkFragmentsAlpha"] + row["PP.NumberCrosslinkFragmentsBeta"], axis = 1)
+
+    tqdm.pandas(desc = "Annotating number of crosslink fragments (normalized) for alpha peptide...")
+    spectronaut["PP.NormalizedCrosslinkFragmentsAlpha"] = spectronaut.progress_apply(lambda row: row["PP.NumberCrosslinkFragmentsAlpha"] / row["PP.TotalIonsA"], axis = 1)
+
+    tqdm.pandas(desc = "Annotating number of crosslink fragments (normalized) for beta peptide...")
+    spectronaut["PP.NormalizedCrosslinkFragmentsBeta"] = spectronaut.progress_apply(lambda row: row["PP.NumberCrosslinkFragmentsBeta"] / row["PP.TotalIonsB"], axis = 1)
+
+    tqdm.pandas(desc = "Annotating number of crosslink fragments (normalized) for full crosslinks...")
+    spectronaut["PP.NormalizedCrosslinkFragmentsFull"] = spectronaut.progress_apply(lambda row: (row["PP.NumberCrosslinkFragmentsAlpha"] + row["PP.NumberCrosslinkFragmentsBeta"]) / (row["PP.TotalIonsA"] + row["PP.TotalIonsB"]), axis = 1)
+
     spectronaut["PP.PseudoScanNumber"] = pd.Series(range(spectronaut.shape[0]))
     spectronaut["PP.Crosslinker"] = pd.Series([CROSSLINKER for i in range(spectronaut.shape[0])])
     spectronaut["PP.CrosslinkerMass"] = pd.Series([CROSSLINKER_MASS for i in range(spectronaut.shape[0])])
diff --git a/tests/tests.py b/tests/tests.py
@@ -352,3 +352,19 @@ def test12_spectral_library_exporter():
             assert float(row["Combined Score"]) == pytest.approx(8.71)
             checked += 1
     assert checked == 2
+
+# check kmers calculation
+def test13_test_kmers():
+
+    from post_process import get_kmers
+
+    unique_seq_positions = {1,2,3,7,8,11,10,15,16,17,18}
+    assert get_kmers(unique_seq_positions) == [3,2,2,4]
+    unique_seq_positions = {1,3,5}
+    assert get_kmers(unique_seq_positions) == []
+    unique_seq_positions = {0,1}
+    assert get_kmers(unique_seq_positions) == [2]
+    unique_seq_positions = {0,1,3,7,9}
+    assert get_kmers(unique_seq_positions) == [2]
+    unique_seq_positions = {0,1,3,7,8,9,15}
+    assert get_kmers(unique_seq_positions) == [2,3]