14 changes: 14 additions & 0 deletions docs/rf3/examples/2FLD_from_fasta.fasta
@@ -0,0 +1,14 @@
>2FLD_1|Chain A[auth C]|5'-D(*GP*CP*AP*GP*AP*AP*GP*GP*TP*CP*GP*TP*GP*AP*GP*AP*CP*CP*GP*TP*TP*CP*CP*G)-3'|
GCAGAAGGTCGTGAGACCGTTCCG
>B|dna
CGGAACGGTCTCACGACCTTCTGC
>2FLD_3|Chains C[auth A], D[auth B]|DNA ENDONUCLEASE I-MSOI|Monomastix sp. (141716)|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>D|protein|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>E|ccd
NA
>ccd
CA
>smiles
[Ca+2]
Binary file added docs/rf3/examples/msas/2FLD_CD.a3m.gz
Binary file not shown.
50 changes: 50 additions & 0 deletions src/modelhub/inference_engines/README.md
@@ -443,6 +443,56 @@ RF3 uses AtomWorks' flexible `AtomSelectionStack` query syntax for specifying st
| All backbone atoms in chain A | `A/*/*/N, A/*/*/CA, A/*/*/C, A/*/*/O` |
</details>

<details>
<summary><strong>Folding from a FASTA file</strong></summary>

For convenience, RF3 supports input from FASTA files.
Not all input features are supported; for more complex use cases (e.g. covalent connectivity), the JSON or CIF input formats are recommended.

The input format is (roughly) compatible with the Boltz FASTA format:
```
>CHAIN_ID|ENTITY_TYPE|MSA_PATH
SEQUENCE
```
where `ENTITY_TYPE` is one of `protein`, `dna`, `rna`, `ccd`, `smiles`, or `path`.

- Each FASTA file represents a single combined prediction.
- The output files take their name from the FASTA file.
- All fields are optional. If `ENTITY_TYPE` is not present, the entry defaults to a polymer (protein/dna/rna).
- Each entry type is handled in the same way as its corresponding entry in the JSON-style input, including support for inline modified residues via the `(PBF)`-style CCD code syntax (a short sketch follows this list).
- If present, `CHAIN_ID` must be a single character.
- If present, `MSA_PATH` must include ".a3m" in its name.
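
For instance, a minimal sketch of the inline modified-residue syntax (hypothetical sequence; `PBF` is used here only as an example CCD code):

```
>A|protein
MKTAYIAK(PBF)QRQISFVKSHFSRQLEERLGLIEVQ
```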


📝 **Example FASTA configuration** (full example found at `docs/rf3/examples/2FLD_from_fasta.fasta`):

```
>2FLD_1|Chain A[auth C]|5'-D(*GP*CP*AP*GP*AP*AP*GP*GP*TP*CP*GP*TP*GP*AP*GP*AP*CP*CP*GP*TP*TP*CP*CP*G)-3'|
GCAGAAGGTCGTGAGACCGTTCCG
>B|dna
CGGAACGGTCTCACGACCTTCTGC
>2FLD_3|Chains C[auth A], D[auth B]|DNA ENDONUCLEASE I-MSOI|Monomastix sp. (141716)|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>D|protein|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>E|ccd
NA
>ccd
CA
>smiles
[Ca+2]
```

Note that chain A's header is identical to the one in the RCSB-provided FASTA file, and chain C's header is the RCSB-provided header with the MSA file path appended. The intent is that purely polymeric predictions should work with minimal pre-processing of header lines.

🚀 **Run the example:**

```bash
rf3 fold inputs='docs/rf3/examples/2FLD_from_fasta.fasta'
```
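
Because the file-discovery logic also scans directories for `.fas`/`.fasta` files, pointing `inputs` at a directory that contains FASTA files should work as well (a sketch; the directory path below is hypothetical):

```bash
rf3 fold inputs='path/to/fasta_dir'
```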

</details>


#### Templating a Polymer (Protein / DNA / RNA)

67 changes: 66 additions & 1 deletion src/modelhub/utils/inference.py
Expand Up @@ -19,8 +19,10 @@
from modelhub.utils.io import (
    CIF_LIKE_EXTENSIONS,
    DICTIONARY_LIKE_EXTENSIONS,
    SEQUENCE_LIKE_EXTENSIONS,
    create_example_id_extractor,
    find_files_with_extension,
    parse_generalized_fasta,
)


@@ -87,6 +89,66 @@ def _spoof_cif_from_dictionary(item: dict, temp_dir: PathLike) -> Path:

    return Path(save_path)

def _spoof_cif_from_sequences(sequence_path: PathLike, temp_dir: PathLike) -> Path:
    """Parses and unpacks a sequence file to create a CIF file from its components.

    Creates only one CIF file per sequence file (which may contain multiple sequences).

    The sequence label (roughly) follows the Boltz convention

        >CHAIN_ID|ENTITY_TYPE|MSA_PATH

    where ENTITY_TYPE is one of [protein, dna, rna, smiles, ccd, path] (the last is non-Boltz).

    Both CHAIN_ID and MSA_PATH are optional (if present, the latter must have "a3m" in its name).
    If the header does not follow this format, the entry is assumed to be a polymeric (protein) sequence.

    Args:
        sequence_path (PathLike): The path to a sequence file.
        temp_dir (PathLike): Path to the temporary directory for storing CIF files.

    Returns:
        Path: The path to the created CIF file, saved in the temporary directory.

    """
    seqs: list[tuple[str, str]] = parse_generalized_fasta(sequence_path)

    components = []

    for label, value in seqs:
        entry = {}
        cif_or_pdb_file = False

        header_parts = label.split("|")
        for hp in header_parts:
            if "a3m" in hp:
                entry["msa_path"] = hp
                break
        if "ccd" in header_parts:
            entry["ccd_code"] = value
        elif "path" in header_parts:
            entry["path"] = value
            if ".pdb" in value.lower() or ".cif" in value.lower():
                cif_or_pdb_file = True
        elif "smiles" in header_parts:
            entry["smiles"] = value
        elif "protein" in header_parts or "rna" in header_parts or "dna" in header_parts:
            entry["seq"] = value
        else:
            logging.warning(
                f"Header for entry `{label}` in `{sequence_path}` omits an entity designation: assuming polymeric"
            )
            entry["seq"] = value

        if len(header_parts) > 1 and len(header_parts[0]) == 1:
            if cif_or_pdb_file:
                logging.warning(
                    "Cannot reset chain_id for PDB or CIF in sequence file header -- chain letter is specified by the structure file."
                )
            else:
                entry["chain_id"] = header_parts[0]

        components.append(entry)

    item = {"name": Path(sequence_path).stem, "components": components}

    return _spoof_cif_from_dictionary(item, temp_dir)
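
# Example (illustration only): for the entries in tests/data/5vht_from_fasta.fasta, e.g.
#
#     >B|protein|tests/data/msas/5vht_A.a3m
#     MTSENPLLA...HHHHHH
#     >ccd
#     MG
#
# the dictionary handed to `_spoof_cif_from_dictionary` by `_spoof_cif_from_sequences`
# looks roughly like
#
#     {
#         "name": "5vht_from_fasta",
#         "components": [
#             {"msa_path": "tests/data/msas/5vht_A.a3m", "seq": "MTSENPLLA...HHHHHH", "chain_id": "B"},
#             {"ccd_code": "MG"},
#         ],
#     }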

def build_file_paths_for_prediction(
    input: PathLike | list[PathLike],
@@ -125,7 +187,7 @@ def build_file_paths_for_prediction(
        if Path(_path).is_dir():
            paths_to_raw_input_files.extend(
                find_files_with_extension(
                    _path, DICTIONARY_LIKE_EXTENSIONS | CIF_LIKE_EXTENSIONS
                    _path, DICTIONARY_LIKE_EXTENSIONS | CIF_LIKE_EXTENSIONS | SEQUENCE_LIKE_EXTENSIONS
                )
            )
        else:
@@ -156,6 +218,9 @@ def build_file_paths_for_prediction(
        elif _path.name.endswith(tuple(CIF_LIKE_EXTENSIONS)):
            # Directly use CIF-like files
            paths_to_cif_like_files.append(_path)
        elif _path.name.endswith(tuple(SEQUENCE_LIKE_EXTENSIONS)):
            # Spoof CIF files from sequence-like formats
            paths_to_cif_like_files.append(_spoof_cif_from_sequences(_path, temp_dir))
        else:
            raise ValueError(
                f"Unsupported file extension: {_path.suffix} (path: {_path}; paths: {paths_to_raw_input_files})."
22 changes: 22 additions & 0 deletions src/modelhub/utils/io.py
Expand Up @@ -15,6 +15,7 @@

DICTIONARY_LIKE_EXTENSIONS = {".json", ".yaml", ".yml", ".pkl"}
CIF_LIKE_EXTENSIONS = {".cif", ".pdb", ".bcif", ".cif.gz", ".pdb.gz", ".bcif.gz"}
SEQUENCE_LIKE_EXTENSIONS = {".fas", ".fasta"}


def build_stack_from_atom_array_and_batched_coords(
@@ -205,3 +206,24 @@ def extract_example_id_from_path(file_path: PathLike, extensions: set | list) ->
    """Extract example_id from file path with specified extensions."""
    extractor = create_example_id_extractor(extensions)
    return extractor(file_path)

def parse_generalized_fasta(file_path: PathLike) -> list[tuple[str, str]]:
    """A robust FASTA parser whose label and sequence components may be arbitrary strings, not limited to a specific alphabet."""
    parsed: list[tuple[str, str]] = []
    current_header: str | None = None
    current_body: list[str] = []
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if current_header is not None:
                    parsed.append((current_header, "".join(current_body)))
                current_header = line[1:]
                current_body = []
            else:
                current_body.append(line)
    if current_header is not None:
        parsed.append((current_header, "".join(current_body)))

    return parsed
8 changes: 8 additions & 0 deletions tests/data/5vht_from_fasta.fasta
@@ -0,0 +1,8 @@
>Arbitrary header|with|various|multiple|bars
MTSENPLLALREKISALDEKLLALFAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAH(PBF)ITRTFQLGIEYSVLTQQALLEHHHHHH
>B|protein|tests/data/msas/5vht_A.a3m
MTSENPLLALREKISALDEKLLALFAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAH(PBF)ITRTFQLGIEYSVLTQQALLEHHHHHH
>ccd
MG
>D|smiles
OCCO
1 change: 1 addition & 0 deletions tests/test_inference_pipelines.py
Expand Up @@ -21,6 +21,7 @@
    [
        "data/nested_examples",
        "data/multiple_examples_from_json.json",
        "data/5vht_from_fasta.fasta",
    ],
)
def test_build_file_paths_for_prediction(file_path: PathLike, tmp_path: Path):