14 changes: 14 additions & 0 deletions docs/rf3/examples/2FLD_from_fasta.fasta
@@ -0,0 +1,14 @@
>2FLD_1|Chain A[auth C]|5'-D(*GP*CP*AP*GP*AP*AP*GP*GP*TP*CP*GP*TP*GP*AP*GP*AP*CP*CP*GP*TP*TP*CP*CP*G)-3'|
GCAGAAGGTCGTGAGACCGTTCCG
>B|dna
CGGAACGGTCTCACGACCTTCTGC
>2FLD_3|Chains C[auth A], D[auth B]|DNA ENDONUCLEASE I-MSOI|Monomastix sp. (141716)|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>D|protein|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>E|ccd
NA
>ccd
CA
>smiles
[Ca+2]
Binary file added docs/rf3/examples/msas/2FLD_CD.a3m.gz
Binary file not shown.
50 changes: 50 additions & 0 deletions src/modelhub/inference_engines/README.md
@@ -443,6 +443,56 @@ RF3 uses AtomWorks' flexible `AtomSelectionStack` query syntax for specifying st
| All backbone atoms in chain A | `A/*/*/N, A/*/*/CA, A/*/*/C, A/*/*/O` |
</details>

<details>
<summary><strong>Folding from a FASTA file</strong></summary>

For convenience, RF3 supports input from FASTA files.
Not all input features are supported; for more complex use cases (e.g. covalent connectivity), the JSON or CIF input formats are recommended.

The input format is (roughly) compatible with the Boltz FASTA format:
```
>CHAIN_ID|ENTITY_TYPE|MSA_PATH
SEQUENCE
```
where `ENTITY_TYPE` is one of `protein`, `dna`, `rna`, `ccd`, `smiles`, or `path`.

- Each FASTA file represents a single combined prediction.
- The output files take their name from the FASTA file.
- All fields are optional. If `ENTITY_TYPE` is not present, the entry defaults to a polymer (protein/dna/rna).
- Each entry type is handled in the same way as its corresponding entry in the JSON-style input, including support for inline modified residues via the `(PBF)`-style CCD code syntax (a short sketch follows this list).
- If present, `CHAIN_ID` must be a single character.
- If present, `MSA_PATH` must include ".a3m" in its name.
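
For instance, a minimal sketch of the inline modified-residue syntax (hypothetical sequence; `PBF` is used here only as an example CCD code):

```
>A|protein
MKTAYIAK(PBF)QRQISFVKSHFSRQLEERLGLIEVQ
```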


📝 **Example FASTA configuration** (full example found at `docs/rf3/examples/2FLD_from_fasta.fasta`):

```
>2FLD_1|Chain A[auth C]|5'-D(*GP*CP*AP*GP*AP*AP*GP*GP*TP*CP*GP*TP*GP*AP*GP*AP*CP*CP*GP*TP*TP*CP*CP*G)-3'|
GCAGAAGGTCGTGAGACCGTTCCG
>B|dna
CGGAACGGTCTCACGACCTTCTGC
>2FLD_3|Chains C[auth A], D[auth B]|DNA ENDONUCLEASE I-MSOI|Monomastix sp. (141716)|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>D|protein|docs/rf3/examples/msas/2FLD_CD.a3m.gz
TLQPTEAAYIAGFLDGDGSIYALLIPRPDYKDIKYQVSLAISFIQRKDKFPYLQDIYDQLGKRGNLRKDRGDGIADYRIIGSTHLSIILPDLVPYLRIKKKQANRILHIINLYPQAQKNPSKFLDLVKIVDDVQNLNKRADELKSTNYDRLLEEFLKAGKIESSP
>E|ccd
NA
>ccd
CA
>smiles
[Ca+2]
```

Note that chain A's header is identical to the one in the RCSB-provided FASTA file, and chain C's header is the RCSB-provided header with the MSA file path appended. The intent is that purely polymeric predictions should work with minimal pre-processing of header lines.

🚀 **Run the example:**

```bash
rf3 fold inputs='docs/rf3/examples/2FLD_from_fasta.fasta'
```
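
Because the file-discovery logic also scans directories for `.fas`/`.fasta` files, pointing `inputs` at a directory that contains FASTA files should work as well (a sketch; the directory path below is hypothetical):

```bash
rf3 fold inputs='path/to/fasta_dir'
```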

</details>


#### Templating a Polymer (Protein / DNA / RNA)

67 changes: 66 additions & 1 deletion src/modelhub/utils/inference.py
Expand Up @@ -19,8 +19,10 @@
from modelhub.utils.io import (
    CIF_LIKE_EXTENSIONS,
    DICTIONARY_LIKE_EXTENSIONS,
    SEQUENCE_LIKE_EXTENSIONS,
    create_example_id_extractor,
    find_files_with_extension,
    parse_generalized_fasta,
)


@@ -87,6 +89,66 @@ def _spoof_cif_from_dictionary(item: dict, temp_dir: PathLike) -> Path:

    return Path(save_path)

def _spoof_cif_from_sequences(sequence_path: PathLike, temp_dir: PathLike) -> Path:
    """Parses and unpacks a sequence file to create a CIF file from its components.

    Creates only one CIF file per sequence file (which may contain multiple sequences).

    The sequence label (roughly) follows the Boltz convention

        >CHAIN_ID|ENTITY_TYPE|MSA_PATH

    where ENTITY_TYPE is one of [protein, dna, rna, smiles, ccd, path] (the last is non-Boltz).

    Both CHAIN_ID and MSA_PATH are optional (if present, the latter must have "a3m" in its name).
    If the header does not follow this format, the entry is assumed to be a polymeric (protein) sequence.

    Args:
        sequence_path (PathLike): The path to a sequence file.
        temp_dir (PathLike): Path to the temporary directory for storing CIF files.

    Returns:
        Path: The path to the created CIF file, saved in the temporary directory.

    """
    seqs: list[tuple[str, str]] = parse_generalized_fasta(sequence_path)

    components = []

    for label, value in seqs:
        entry = {}
        cif_or_pdb_file = False

        header_parts = label.split("|")
        for hp in header_parts:
            if "a3m" in hp:
                entry["msa_path"] = hp
                break
        if "ccd" in header_parts:
            entry["ccd_code"] = value
        elif "path" in header_parts:
            entry["path"] = value
            if ".pdb" in value.lower() or ".cif" in value.lower():
                cif_or_pdb_file = True
        elif "smiles" in header_parts:
            entry["smiles"] = value
        elif "protein" in header_parts or "rna" in header_parts or "dna" in header_parts:
            entry["seq"] = value
        else:
            logging.warning(
                f"Header for entry `{label}` in `{sequence_path}` omits an entity designation: assuming polymeric"
            )
            entry["seq"] = value

        if len(header_parts) > 1 and len(header_parts[0]) == 1:
            if cif_or_pdb_file:
                logging.warning(
                    "Cannot reset chain_id for PDB or CIF in sequence file header -- chain letter is specified by the structure file."
                )
            else:
                entry["chain_id"] = header_parts[0]

        components.append(entry)

    item = {"name": Path(sequence_path).stem, "components": components}

    return _spoof_cif_from_dictionary(item, temp_dir)
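
# Example (illustration only): for the entries in tests/data/5vht_from_fasta.fasta, e.g.
#
#     >B|protein|tests/data/msas/5vht_A.a3m
#     MTSENPLLA...HHHHHH
#     >ccd
#     MG
#
# the dictionary handed to `_spoof_cif_from_dictionary` by `_spoof_cif_from_sequences`
# looks roughly like
#
#     {
#         "name": "5vht_from_fasta",
#         "components": [
#             {"msa_path": "tests/data/msas/5vht_A.a3m", "seq": "MTSENPLLA...HHHHHH", "chain_id": "B"},
#             {"ccd_code": "MG"},
#         ],
#     }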

def build_file_paths_for_prediction(
    input: PathLike | list[PathLike],
@@ -125,7 +187,7 @@ def build_file_paths_for_prediction(
        if Path(_path).is_dir():
            paths_to_raw_input_files.extend(
                find_files_with_extension(
                    _path, DICTIONARY_LIKE_EXTENSIONS | CIF_LIKE_EXTENSIONS
                    _path, DICTIONARY_LIKE_EXTENSIONS | CIF_LIKE_EXTENSIONS | SEQUENCE_LIKE_EXTENSIONS
                )
            )
        else:
@@ -156,6 +218,9 @@ def build_file_paths_for_prediction(
        elif _path.name.endswith(tuple(CIF_LIKE_EXTENSIONS)):
            # Directly use CIF-like files
            paths_to_cif_like_files.append(_path)
        elif _path.name.endswith(tuple(SEQUENCE_LIKE_EXTENSIONS)):
            # Spoof CIF files from sequence-like formats
            paths_to_cif_like_files.append(_spoof_cif_from_sequences(_path, temp_dir))
        else:
            raise ValueError(
                f"Unsupported file extension: {_path.suffix} (path: {_path}; paths: {paths_to_raw_input_files})."
22 changes: 22 additions & 0 deletions src/modelhub/utils/io.py
Expand Up @@ -15,6 +15,7 @@

DICTIONARY_LIKE_EXTENSIONS = {".json", ".yaml", ".yml", ".pkl"}
CIF_LIKE_EXTENSIONS = {".cif", ".pdb", ".bcif", ".cif.gz", ".pdb.gz", ".bcif.gz"}
SEQUENCE_LIKE_EXTENSIONS = {".fas", ".fasta"}


def build_stack_from_atom_array_and_batched_coords(
@@ -205,3 +206,24 @@ def extract_example_id_from_path(file_path: PathLike, extensions: set | list) ->
    """Extract example_id from file path with specified extensions."""
    extractor = create_example_id_extractor(extensions)
    return extractor(file_path)

def parse_generalized_fasta(file_path: PathLike) -> list[tuple[str, str]]:
    """A robust FASTA parser whose label and sequence components may be arbitrary strings, not limited to a specific alphabet."""
    parsed: list[tuple[str, str]] = []
    current_header: str | None = None
    current_body: list[str] = []
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if current_header is not None:
                    parsed.append((current_header, "".join(current_body)))
                current_header = line[1:]
                current_body = []
            else:
                current_body.append(line)
    if current_header is not None:
        parsed.append((current_header, "".join(current_body)))

    return parsed
8 changes: 8 additions & 0 deletions tests/data/5vht_from_fasta.fasta
@@ -0,0 +1,8 @@
>Arbitrary header|with|various|multiple|bars
MTSENPLLALREKISALDEKLLALFAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAH(PBF)ITRTFQLGIEYSVLTQQALLEHHHHHH
>B|protein|tests/data/msas/5vht_A.a3m
MTSENPLLALREKISALDEKLLALFAERRELAVEVGKAKLLSHRPVRDIDRERDLLERLITLGKAHHLDAH(PBF)ITRTFQLGIEYSVLTQQALLEHHHHHH
>ccd
MG
>D|smiles
OCCO
1 change: 1 addition & 0 deletions tests/test_inference_pipelines.py
Expand Up @@ -21,6 +21,7 @@
    [
        "data/nested_examples",
        "data/multiple_examples_from_json.json",
        "data/5vht_from_fasta.fasta",
    ],
)
def test_build_file_paths_for_prediction(file_path: PathLike, tmp_path: Path):