diff --git a/docs/rf3/examples/2FLD_from_fasta.fasta b/docs/rf3/examples/2FLD_from_fasta.fasta
new file mode 100644
index 0000000..ebe7707
--- /dev/null
+++ b/docs/rf3/examples/2FLD_from_fasta.fasta
@@ -0,0 +1,14 @@
+>2FLD_1|Chain A[auth C]|5'-D(*GP*CP*AP*GP*AP*AP*GP*GP*TP*CP*GP*TP*GP*AP*GP*AP*CP*CP*GP*TP*TP*CP*CP*G)-3'|
+GCAGAAGGTCGTGAGACCGTTCCG
+>B|dna
+CGGAACGGTCTCACGACCTTCTGC
+>2FLD_3|Chains C[auth A], D[auth B]|DNA ENDONUCLEASE I-MSOI|Monomastix sp. (141716)|docs/rf3/examples/msas/2FLD_CD.a3m.gz
+TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
+>D|protein|docs/rf3/examples/msas/2FLD_CD.a3m.gz
+TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
+>E|ccd
+NA
+>ccd
+CA
+>smiles
+[Ca+2]
diff --git a/docs/rf3/examples/msas/2FLD_CD.a3m.gz b/docs/rf3/examples/msas/2FLD_CD.a3m.gz
new file mode 100644
index 0000000..f9d5823
Binary files /dev/null and b/docs/rf3/examples/msas/2FLD_CD.a3m.gz differ
diff --git a/src/modelhub/inference_engines/README.md b/src/modelhub/inference_engines/README.md
index b0f4f37..da2d06d 100644
--- a/src/modelhub/inference_engines/README.md
+++ b/src/modelhub/inference_engines/README.md
@@ -443,6 +443,56 @@ RF3 uses AtomWorks' flexible `AtomSelectionStack` query syntax for specifying st
 | All backbone atoms in chain A | `A/*/*/N, A/*/*/CA, A/*/*/C, A/*/*/O` |
 
+
+#### Folding from a FASTA file
+
+For convenience, RF3 supports input from FASTA files.
+Not all input features are supported; for more complex use cases (e.g. covalent connectivity), the JSON or CIF input formats are recommended.
+
+The input format is (roughly) compatible with the Boltz FASTA format:
+```
+>CHAIN_ID|ENTITY_TYPE|MSA_PATH
+SEQUENCE
+```
+where ENTITY_TYPE is one of `protein`, `dna`, `rna`, `ccd`, `smiles`, or `path`.
+
+- Each FASTA file represents a single combined prediction.
+- Output files take their name from the name of the FASTA file.
+- All header fields are optional. If ENTITY_TYPE is not present, the entry defaults to a polymer (protein/dna/rna).
+- Each entry type is handled in the same way as its corresponding entry in the JSON-style input, including support for inline modified residues via the `(PBF)`-style CCD code syntax.
+- If present, CHAIN_ID must be a single character.
+- If present, MSA_PATH must include ".a3m" in its name.
+
+
+📝 **Example FASTA configuration** (full example found at `docs/rf3/examples/2FLD_from_fasta.fasta`):
+
+```
+>2FLD_1|Chain A[auth C]|5'-D(*GP*CP*AP*GP*AP*AP*GP*GP*TP*CP*GP*TP*GP*AP*GP*AP*CP*CP*GP*TP*TP*CP*CP*G)-3'|
+GCAGAAGGTCGTGAGACCGTTCCG
+>B|dna
+CGGAACGGTCTCACGACCTTCTGC
+>2FLD_3|Chains C[auth A], D[auth B]|DNA ENDONUCLEASE I-MSOI|Monomastix sp. (141716)|docs/rf3/examples/msas/2FLD_CD.a3m.gz
+TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
+>D|protein|docs/rf3/examples/msas/2FLD_CD.a3m.gz
+TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
+>E|ccd
+NA
+>ccd
+CA
+>smiles
+[Ca+2]
+```
+
+Note that chain A's header is identical to the one from the RCSB-provided FASTA file, and chain C's header is the RCSB-provided header with the MSA file path appended. The intent is that purely polymeric predictions work with minimal pre-processing of header lines.
+
+🚀 **Run the example:**
+
+```bash
+rf3 fold inputs='docs/rf3/examples/2FLD_from_fasta.fasta'
+```
+
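+Internally, each FASTA entry is converted into the same kind of component dictionary used by the dictionary-style input. As a rough, illustrative sketch only (the key names below, e.g. `seq`, `chain_id`, `smiles`, are those built by `_spoof_cif_from_sequences` in `src/modelhub/utils/inference.py`; they are not a stable public schema), the first two entries and the final SMILES entry above map to approximately:
+
+```python
+# Illustrative sketch only: approximate components derived from the example FASTA above.
+components = [
+    {"seq": "GCAGAAGGTCGTGAGACCGTTCCG"},                   # header carries no 1-character chain ID or entity type
+    {"seq": "CGGAACGGTCTCACGACCTTCTGC", "chain_id": "B"},  # >B|dna
+    {"smiles": "[Ca+2]"},                                  # >smiles
+]
+```
+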
+
 #### Templating a Polymer (Protein / DNA / RNA)
 
diff --git a/src/modelhub/utils/inference.py b/src/modelhub/utils/inference.py
index dc7b9fa..51f8c5d 100644
--- a/src/modelhub/utils/inference.py
+++ b/src/modelhub/utils/inference.py
@@ -19,8 +19,10 @@
 from modelhub.utils.io import (
     CIF_LIKE_EXTENSIONS,
     DICTIONARY_LIKE_EXTENSIONS,
+    SEQUENCE_LIKE_EXTENSIONS,
     create_example_id_extractor,
     find_files_with_extension,
+    parse_generalized_fasta,
 )
@@ -87,6 +89,66 @@
     return Path(save_path)
 
 
+def _spoof_cif_from_sequences(sequencepath: PathLike, temp_dir: PathLike) -> Path:
+    """Parses and unpacks a sequence file to create a CIF file from its components.
+
+    Creates only one CIF file per sequence file (which may contain multiple sequences).
+
+    The sequence label (roughly) follows the Boltz convention
+
+        >CHAIN_ID|ENTITY_TYPE|MSA_PATH
+
+    where ENTITY_TYPE is one of [protein, dna, rna, smiles, ccd, path] (the last is non-Boltz).
+
+    Both CHAIN_ID and MSA_PATH are optional (if present, the latter must have "a3m" in its name).
+    If the header does not follow this format, the entry is assumed to be a polymeric (protein) sequence.
+
+    Args:
+        sequencepath (Path): The path to a sequence file.
+        temp_dir (Path): Path to the temporary directory for storing CIF files.
+
+    Returns:
+        Path: The path to the created CIF file, saved in the temporary directory.
+
+    """
+    seqs: list[tuple[str, str]] = parse_generalized_fasta(sequencepath)
+
+    components = []
+
+    for label, value in seqs:
+        entry = {}
+        cif_or_pdb_file = False
+
+        header_parts = label.split("|")
+
+        # An MSA path may appear in any header field; take the first one containing "a3m".
+        for hp in header_parts:
+            if "a3m" in hp:
+                entry["msa_path"] = hp
+                break
+
+        # Map the entity type to the corresponding JSON-style component entry.
+        if "ccd" in header_parts:
+            entry["ccd_code"] = value
+        elif "path" in header_parts:
+            entry["path"] = value
+            if ".pdb" in value.lower() or ".cif" in value.lower():
+                cif_or_pdb_file = True
+        elif "smiles" in header_parts:
+            entry["smiles"] = value
+        elif "protein" in header_parts or "rna" in header_parts or "dna" in header_parts:
+            entry["seq"] = value
+        else:
+            logging.warning(
+                f"Header for entry `{label}` in `{sequencepath}` omits an entity designation: assuming polymeric"
+            )
+            entry["seq"] = value
+
+        # A leading single-character header field is interpreted as the chain ID.
+        if len(header_parts) > 1 and len(header_parts[0]) == 1:
+            if cif_or_pdb_file:
+                logging.warning(
+                    "Cannot reset chain_id for PDB or CIF in sequence file header -- chain letter is specified by the structure file."
+                )
+            else:
+                entry["chain_id"] = header_parts[0]
+
+        components.append(entry)
+
+    item = {"name": sequencepath.stem, "components": components}
+
+    return _spoof_cif_from_dictionary(item, temp_dir)
+
+
 def build_file_paths_for_prediction(
     input: PathLike | list[PathLike],
@@ -125,7 +187,7 @@
         if Path(_path).is_dir():
             paths_to_raw_input_files.extend(
                 find_files_with_extension(
-                    _path, DICTIONARY_LIKE_EXTENSIONS | CIF_LIKE_EXTENSIONS
+                    _path, DICTIONARY_LIKE_EXTENSIONS | CIF_LIKE_EXTENSIONS | SEQUENCE_LIKE_EXTENSIONS
                 )
             )
         else:
@@ -156,6 +218,9 @@
         elif _path.name.endswith(tuple(CIF_LIKE_EXTENSIONS)):
            # Directly use CIF-like files
            paths_to_cif_like_files.append(_path)
+        elif _path.name.endswith(tuple(SEQUENCE_LIKE_EXTENSIONS)):
+            # Spoof CIF files from sequence-like formats
+            paths_to_cif_like_files.append(_spoof_cif_from_sequences(_path, temp_dir))
         else:
             raise ValueError(
                 f"Unsupported file extension: {_path.suffix} (path: {_path}; paths: {paths_to_raw_input_files})."
diff --git a/src/modelhub/utils/io.py b/src/modelhub/utils/io.py
index d0e219b..57c7304 100644
--- a/src/modelhub/utils/io.py
+++ b/src/modelhub/utils/io.py
@@ -15,6 +15,7 @@
 DICTIONARY_LIKE_EXTENSIONS = {".json", ".yaml", ".yml", ".pkl"}
 CIF_LIKE_EXTENSIONS = {".cif", ".pdb", ".bcif", ".cif.gz", ".pdb.gz", ".bcif.gz"}
+SEQUENCE_LIKE_EXTENSIONS = {".fas", ".fasta"}
 
 
 def build_stack_from_atom_array_and_batched_coords(
@@ -205,3 +206,24 @@ def extract_example_id_from_path(file_path: PathLike, extensions: set | list) ->
     """Extract example_id from file path with specified extensions."""
     extractor = create_example_id_extractor(extensions)
     return extractor(file_path)
+
+
+def parse_generalized_fasta(file_path: PathLike) -> list[tuple[str, str]]:
+    """A robust FASTA parser where the label and sequence components can be arbitrary strings, not limited to a specific alphabet."""
+    parsed: list[tuple[str, str]] = []
+    current_header: str | None = None
+    current_body: list[str] = []
+    with open(file_path, "r") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            if line.startswith(">"):
+                # A new header: flush the record collected so far.
+                if current_header is not None:
+                    parsed.append((current_header, "".join(current_body)))
+                current_header = line[1:]
+                current_body = []
+            else:
+                current_body.append(line)
+    # Flush the final record.
+    if current_header is not None:
+        parsed.append((current_header, "".join(current_body)))
+
+    return parsed
diff --git a/tests/data/5vht_from_fasta.fasta b/tests/data/5vht_from_fasta.fasta
new file mode 100644
index 0000000..75bbd91
--- /dev/null
+++ b/tests/data/5vht_from_fasta.fasta
@@ -0,0 +1,8 @@
+>Arbitrary header|with|various|multiple|bars
+MTSENPLLALREKISALDEKLLALFAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAH(PBF)ITRTFQLGIEYSVLTQQALLEHHHHHH
+>B|protein|tests/data/msas/5vht_A.a3m
+MTSENPLLALREKISALDEKLLALFAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAH(PBF)ITRTFQLGIEYSVLTQQALLEHHHHHH
+>ccd
+MG
+>D|smiles
+OCCO
diff --git a/tests/test_inference_pipelines.py b/tests/test_inference_pipelines.py
index e79130b..9cb342d 100644
--- a/tests/test_inference_pipelines.py
+++ b/tests/test_inference_pipelines.py
@@ -21,6 +21,7 @@
     [
         "data/nested_examples",
         "data/multiple_examples_from_json.json",
+        "data/5vht_from_fasta.fasta",
     ],
 )
 def test_build_file_paths_for_prediction(file_path: PathLike, tmp_path: Path):
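`parse_generalized_fasta` returns one `(header, sequence)` tuple per record, with the leading `>` stripped and no alphabet validation, which is what lets CCD codes and SMILES strings pass through as "sequences". A minimal sketch of how it behaves on the new test fixture (assumes it is run from the repository root; assertions abridged to the headers and the two short records):

```python
from modelhub.utils.io import parse_generalized_fasta

records = parse_generalized_fasta("tests/data/5vht_from_fasta.fasta")

# Headers are kept verbatim, including arbitrary `|`-separated fields.
assert [header for header, _ in records] == [
    "Arbitrary header|with|various|multiple|bars",
    "B|protein|tests/data/msas/5vht_A.a3m",
    "ccd",
    "D|smiles",
]

# Bodies are concatenated as-is: a CCD code and a SMILES string are both accepted.
assert records[2][1] == "MG"
assert records[3][1] == "OCCO"
```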