From 129b4add04495c719937fcdf73f3cd0ab744bb50 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 29 Sep 2025 22:05:37 +0300 Subject: [PATCH 1/3] Enable multiprocessing --- arc/scheduler.py | 116 ++++++++++++++++---------- arc/scheduler_test.py | 112 +++++++++++++++++++++++++ arc/species/conformers.py | 115 ++++++++++++++++++++++++++ benchmark.py | 169 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 469 insertions(+), 43 deletions(-) create mode 100644 benchmark.py diff --git a/arc/scheduler.py b/arc/scheduler.py index f53ee8eead..7b69b4e108 100644 --- a/arc/scheduler.py +++ b/arc/scheduler.py @@ -12,6 +12,7 @@ import numpy as np from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from concurrent.futures import ThreadPoolExecutor, as_completed import arc.parser.parser as parser from arc import plotter @@ -131,6 +132,7 @@ class Scheduler(object): species_list (list): Contains input :ref:`ARCSpecies ` objects (both wells and TSs). rxn_list (list): Contains input :ref:`ARCReaction ` objects. project_directory (str): Folder path for the project: the input file path or ARC/Projects/project-name. + conformer_gen_nprocs (int, optional): The number of processes to use for non-TS conformer generation. Defaults to 1 (serial). composite_method (str, optional): A composite method to use. conformer_opt_level (Union[str, dict], optional): The level of theory to use for conformer comparisons. conformer_sp_level (Union[str, dict], optional): The level of theory to use for conformer sp jobs. @@ -171,6 +173,7 @@ class Scheduler(object): Attributes: project (str): The project's name. Used for naming the working directory. + conformer_gen_nprocs (int): The number of processes to use for non-TS conformer generation. servers (list): A list of servers used for the present project. species_list (list): Contains input :ref:`ARCSpecies ` objects (both species and TSs). species_dict (dict): Keys are labels, values are :ref:`ARCSpecies ` objects. @@ -232,6 +235,7 @@ def __init__(self, ess_settings: dict, species_list: list, project_directory: str, + conformer_gen_nprocs: Optional[int] = 1, composite_method: Optional[Level] = None, conformer_opt_level: Optional[Level] = None, conformer_sp_level: Optional[Level] = None, @@ -297,6 +301,7 @@ def __init__(self, self.output_multi_spc = dict() self.report_e_elect = report_e_elect self.skip_nmd = skip_nmd + self.conformer_gen_nprocs = int(conformer_gen_nprocs or default_job_settings.get('conformer_gen_nprocs', 1)) self.species_dict, self.rxn_dict = dict(), dict() for species in self.species_list: @@ -1040,6 +1045,26 @@ def end_job(self, job: 'JobAdapter', self.save_restart_dict() return True + def _generate_conformers_for_label(self, label: str) -> None: + """ + A helper function to generate conformers for a given species label (used internally). + + Args: + label (str): The species label. + """ + species = self.species_dict[label] + if species.force_field == 'cheap': + # Just embed in RDKit and use MMFF94s for opt and energies. + if species.initial_xyz is None: + species.initial_xyz = species.get_xyz() + else: + n_confs = self.n_confs if species.multi_species is None else 1 + species.generate_conformers( + n_confs=n_confs, + e_confs=self.e_confs, + plot_path=os.path.join(self.project_directory, 'output', 'Species', + label, 'geometry', 'conformers')) + def _run_a_job(self, job: 'JobAdapter', label: str, @@ -1088,52 +1113,57 @@ def run_conformer_jobs(self, labels: Optional[List[str]] = None): If ``None``, conformer jobs will be spawned for all species in self.species_list. """ labels_to_consider = labels if labels is not None else [spc.label for spc in self.species_list] - log_info_printed = False + pending_non_ts_generation: List[str] = [] + for label in labels_to_consider: - if not self.species_dict[label].is_ts and not self.output[label]['job_types']['opt'] \ - and 'opt' not in self.job_dict[label] and 'composite' not in self.job_dict[label] \ - and all([e is None for e in self.species_dict[label].conformer_energies]) \ - and self.species_dict[label].number_of_atoms > 1 and not self.output[label]['paths']['geo'] \ - and self.species_dict[label].yml_path is None and not self.output[label]['convergence'] \ - and (self.job_types['conf_opt'] and label not in self.dont_gen_confs - or self.species_dict[label].get_xyz(generate=False) is None): - # This is not a TS, opt (/composite) did not converge nor is it running, and conformer energies were - # not set. Also, either 'conf_opt' are set to True in job_types (and it's not in dont_gen_confs), - # or they are set to False (or it's in dont_gen_confs), but the species has no 3D information. - # Generate conformers. - if not log_info_printed: - logger.info('\nStarting (non-TS) species conformational analysis...\n') - log_info_printed = True - if self.species_dict[label].force_field == 'cheap': - # Just embed in RDKit and use MMFF94s for opt and energies. - if self.species_dict[label].initial_xyz is None: - self.species_dict[label].initial_xyz = self.species_dict[label].get_xyz() - else: - # Run the combinatorial method w/o fitting a force field. - n_confs = self.n_confs if self.species_dict[label].multi_species is None else 1 - self.species_dict[label].generate_conformers( - n_confs=n_confs, - e_confs=self.e_confs, - plot_path=os.path.join(self.project_directory, 'output', 'Species', - label, 'geometry', 'conformers')) - self.process_conformers(label) - # TSs: - elif self.species_dict[label].is_ts \ - and self.species_dict[label].tsg_spawned \ - and not self.species_dict[label].ts_conf_spawned \ - and all([tsg.success is not None for tsg in self.species_dict[label].ts_guesses]) \ - and any([tsg.success for tsg in self.species_dict[label].ts_guesses]): - # This is a TS Species for which all TSGs were spawned, but conformers haven't been spawned, - # and all tsg.success flags contain a values (either ``True`` or ``False``), so they are done. - # We're ready to spawn conformer jobs for this TS Species + # Non_TS needing conformer generation: + if (not self.species_dict[label].is_ts + and not self.output[label]['job_types']['opt'] + and 'opt' not in self.job_dict[label] + and 'composite' not in self.job_dict[label] + and all([e is None for e in self.species_dict[label].conformer_energies]) + and self.species_dict[label].number_of_atoms > 1 + and not self.output[label]['paths']['geo'] + and self.species_dict[label].yml_path is None + and not self.output[label]['convergence'] + and ((self.job_types['conf_opt'] and label not in self.dont_gen_confs) + or self.species_dict[label].get_xyz(generate=False) is None)): + pending_non_ts_generation.append(label) + elif (self.species_dict[label].is_ts + and self.species_dict[label].tsg_spawned + and not self.species_dict[label].ts_conf_spawned + and all([tsg.success is not None for tsg in self.species_dict[label].ts_guesses]) + and any([tsg.success for tsg in self.species_dict[label].ts_guesses])): self.run_ts_conformer_jobs(label=label) self.species_dict[label].ts_conf_spawned = True - if label in self.dont_gen_confs \ - and (self.species_dict[label].initial_xyz is not None - or self.species_dict[label].final_xyz is not None - or len(self.species_dict[label].conformers)) \ - and not self.species_dict[label].is_ts: - # The species was defined with xyzs. + if (label in self.dont_gen_confs + and (self.species_dict[label].initial_xyz is not None + or self.species_dict[label].final_xyz is not None + or len(self.species_dict[label].conformers)) + and not self.species_dict[label].is_ts): + self.process_conformers(label) + + if pending_non_ts_generation: + logger.info('\nStarting (non-TS) species conformational analysis...\n') + if self.conformer_gen_nprocs > 1 and len(pending_non_ts_generation) > 1: + # Parallel generation + with ThreadPoolExecutor(max_workers=self.conformer_gen_nprocs) as executor: + futures = { + executor.submit(self._generate_conformers_for_label, label): label + for label in pending_non_ts_generation + } + for future in as_completed(futures): + label_done = futures[future] + try: + future.result() + except Exception as e: + logger.error(f'Error generating conformers for species {label_done}: {e}') + else: + # Serial generation + for label in pending_non_ts_generation: + self._generate_conformers_for_label(label) + + for label in pending_non_ts_generation: self.process_conformers(label) def run_ts_conformer_jobs(self, label: str): diff --git a/arc/scheduler_test.py b/arc/scheduler_test.py index edba05adef..8f56ca360f 100644 --- a/arc/scheduler_test.py +++ b/arc/scheduler_test.py @@ -5,9 +5,11 @@ This module contains unit tests for the arc.scheduler module """ +import time import unittest import os import shutil +from unittest.mock import patch import arc.parser.parser as parser from arc.checks.ts import check_ts @@ -24,6 +26,29 @@ default_levels_of_theory = settings['default_levels_of_theory'] +# Helpers for MultiProcessing Test +def fake_generate_conformers(species_self, n_confs=10, e_confs=5, plot_path=None): + """Stand-in for ARCSpecies.generate_conformers. + It both *creates a dummy conformer* and *records the label* on a function attribute.""" + species_self.conformers = [{ + "symbols": ("H", "H"), + "isotopes": (1, 1), + "coords": ((0.0, 0.0, 0.0), (0.0, 0.0, 0.74)), + }] + species_self.conformer_energies = [0.0] + fake_generate_conformers.called.add(species_self.label) + + +def fake_process_conformers(sched_self, lbl): + """Stand-in for Scheduler.process_conformers.""" + fake_process_conformers.called.add(lbl) + + +# simple per-test “state holders” on the functions themselves +fake_generate_conformers.called = set() +fake_process_conformers.called = set() +# + class TestScheduler(unittest.TestCase): """ @@ -757,6 +782,93 @@ def test_add_label_to_unique_species_labels(self): self.assertEqual(unique_label, 'new_species_15_1') self.assertEqual(self.sched2.unique_species_labels, ['methylamine', 'C2H6', 'CtripCO', 'new_species_15', 'new_species_15_0', 'new_species_15_1']) + def test_run_conformer_jobs_calls_process_conformers_for_dont_gen_confs(self): + """ + If a non-TS is in dont_gen_confs and the species has initial/final xyz or existing conformers, + run_conformer_jobs should call process_conformers(label) for that species. + """ + label = "spc_dont_gen_confs" + xyz = { + 'symbols': ("H", "H"), + 'isotopes': (1, 1), + 'coords': ((0.0, 0.0, 0.0), (0.0, 0.0, 0.74)), + } + spc = ARCSpecies(label=label, smiles='[H][H]', xyz=xyz) + spc.number_of_atoms = 2 + spc.conformers = [xyz] + spc.conformer_energies = [None] + + job_types = {'conf_opt': True, 'conf_sp': False, 'opt': False, 'fine': False, 'freq': False, + 'sp': False, 'rotors': False, 'orbitals': False, 'lennard_jones': False, 'bde': False, 'irc': False, 'tsg': False} + sched = Scheduler(project='test_run_conformer_jobs_dont_gen_confs', + ess_settings=self.ess_settings, + species_list=[spc], + project_directory=os.path.join(ARC_PATH, 'Projects', 'arc_project_for_testing_delete_after_usage3'), + testing=True, + job_types=job_types) + sched.dont_gen_confs = [label] + + with patch.object(Scheduler, "process_conformers", autospec=True) as mock_proc: + sched.run_conformer_jobs(labels=[label]) + mock_proc.assert_called_once() + # First arg is Scheduler (self), second is label + called_args = mock_proc.call_args[0] + self.assertEqual(called_args[1], label) + + def test_sched_calls_real_generate_conformers(self): + spc = ARCSpecies(label="spc_real_gen_confs", multiplicity=1, charge=0, smiles='CC') + sched = Scheduler( + project='test_sched_calls_real_generate_conformers', + ess_settings=self.ess_settings, + species_list=[spc], + project_directory=os.path.join(ARC_PATH, 'Projects', 'arc_project_for_testing_delete_after_usage3'), + testing=True, + job_types={'conf_opt': True, 'conf_sp': False, 'opt': False, 'fine': False, 'freq': False, + 'sp': False, 'rotors': False, 'orbitals': False, 'lennard_jones': False, 'bde': False, 'irc': False, 'tsg': False}, + ) + + sched._generate_conformers_for_label(label=spc.label) + assert len(spc.conformers) > 0, "Real conformer generation did not populate spc.conformers" + + def test_run_conformer_jobs_generates_and_processes_for_multiple_species(self): + """ + For multiple non-TS species needing conformer generation, verify that: + - ARCSpecies.generate_conformers is invoked for each species + - Scheduler.process_conformers is subsequently called for each label + Also verify both serial (nprocs=1) and parallel (nprocs>1) code paths. + """ + job_types = { + 'conf_opt': True, 'conf_sp': False, 'opt': False, 'fine': False, 'freq': False, + 'sp': False, 'rotors': False, 'orbitals': False, 'lennard_jones': False, + 'bde': False, 'irc': False, 'tsg': False + } + time_taken = {} + for nprocs in (1, 2): + with self.subTest(nprocs=nprocs): + start_time = time.time() + fake_generate_conformers.called.clear() + fake_process_conformers.called.clear() + spc1 = ARCSpecies(label="spc_multi_1", multiplicity=1, charge=0, smiles='CCO') + spc2 = ARCSpecies(label="spc_multi_2", multiplicity=1, charge=0, smiles='CCN') + + sched = Scheduler( + project='test_run_conformer_jobs_multiple_species', + ess_settings=self.ess_settings, + species_list=[ + spc1, + spc2, + ], + project_directory=os.path.join(ARC_PATH, 'Projects', 'arc_project_for_testing_delete_after_usage3'), + testing=True, + job_types=job_types, + conformer_gen_nprocs=nprocs, + ) + + sched.run_conformer_jobs(labels=[spc1.label, spc2.label]) + time_taken[nprocs] = time.time() - start_time + print(f"nprocs={nprocs}, fake_generate_conformers.called={fake_generate_conformers.called}, fake_process_conformers.called={fake_process_conformers.called}") + + print(f"Time taken: {time_taken}") @classmethod def tearDownClass(cls): """ diff --git a/arc/species/conformers.py b/arc/species/conformers.py index 782e0b0a15..7fdddddce0 100644 --- a/arc/species/conformers.py +++ b/arc/species/conformers.py @@ -1521,6 +1521,120 @@ def read_rdkit_embedded_conformer_i(rd_mol, i, rd_index_map=None): def rdkit_force_field(label: str, + rd_mol: Optional[RDMol], + mol: Optional[Molecule] = None, + num_confs: Optional[int] = None, + force_field: str = 'MMFF94s', + try_uff: bool = True, + optimize: bool = True, + try_ob: bool = False, + return_status: bool = False, + ) -> Tuple[list, list]: + """ + Optimize RDKit conformers using a force field (MMFF94 or MMFF94s are recommended). + For UFF see: https://www.rdkit.org/docs/source/rdkit.Chem.rdForceFieldHelpers.html#rdkit.Chem.rdForceFieldHelpers.UFFOptimizeMoleculeConfs + + Args: + label (str): The species' label. + rd_mol (RDKit RDMol): The RDKit molecule with embedded conformers to optimize. + mol (Molecule, optional): The RMG molecule object with connectivity and bond order information. + num_confs (int, optional): The number of random 3D conformations to generate. + force_field (str, optional): The type of force field to use. + try_uff (bool, optional): Whether to try UFF if MMFF94(s) fails. ``True`` by default. + optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. + try_ob (bool, optional): Whether to try OpenBabel if RDKit fails. ``True`` to try, ``False`` by default. + return_status (bool, optional): Whether to return the optimization status codes. ``False`` by default. + + Returns: + If ``return_status`` is ``False``: + Tuple[list, list]: + - Entries are optimized xyz's in a dictionary format. + - Entries are float numbers representing the energies. + If ``return_status`` is ``True``: + Tuple[list, list, list]: + - Entries are optimized xyz's in a dictionary format. + - Entries are float numbers representing the energies. + - Entries are integers representing the optimization status codes (0 is success). + """ + xyzs, energies, statuses = list(), list(), list() + + if rd_mol is None or rd_mol.GetNumConformers() == 0: + return (xyzs, energies) if not return_status else (xyzs, energies, statuses) + + used_mmff_batch = False + + if optimize and "MMFF" in force_field: + try: + res = Chem.AllChem.MMFFOptimizeMoleculeConfs(rd_mol, + mmffVariant=force_field, + maxIters=1000, + ignoreInterfragInteractions=False, + numThreads=0, + ) + # res is a list of tuples [(status, energy), ...] + for i, (st, en) in enumerate(res): + statuses.append(st) + energies.append(en) + xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) + used_mmff_batch = True + except Exception as e: + logger.warning(f"MMFF optimization failed for {label} with error: {e}") + + if optimize and not xyzs and try_uff: + try: + res = Chem.AllChem.UFFOptimizeMoleculeConfs(rd_mol, + maxIters=1000, + ignoreInterfragInteractions=False, + numThreads=0, + ) + # res is a list of tuples [(status, energy), ...] + for i, (st, en) in enumerate(res): + statuses.append(st) + energies.append(en) + xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) + except (RuntimeError, ValueError) as e: + pass + + if optimize and not xyzs and try_ob: + logger.warning(f'Using OpenBabel (instead of RDKit) as a fall back method to generate conformers for {label}. ' + f'This is often slower.') + xyzs, energies = openbabel_force_field_on_rdkit_conformers(label, + rd_mol, + force_field=force_field, + optimize=optimize, + ) + if return_status: + return (xyzs, energies, [0]*len(xyzs)) # OpenBabel does not return status codes so treat all as successful + return xyzs, energies + + # Not optimize, then no energies or statuses + if not optimize: + xyzs = [read_rdkit_embedded_conformer_i(rd_mol, i) + for i in range(rd_mol.GetNumConformers())] + # Energies must remain empty in this case + return (xyzs, energies) if not return_status else (xyzs, energies, statuses) + + # Optimisation returned some conformers but not energies for all conformers + if used_mmff_batch and (len(energies) != rd_mol.GetNumConformers()): + props = Chem.AllChem.MMFFGetMoleculeProperties(rd_mol, mmffVariant=force_field) \ + if "MMFF" in force_field.upper() else None + for i in range(rd_mol.GetNumConformers()): + if len(energies) <= i: + try: + if props is not None: + ff = Chem.AllChem.MMFFGetMoleculeForceField(rd_mol, props, confId=i) + energies.append(ff.CalcEnergy()) + else: + ff = Chem.AllChem.UFFGetMoleculeForceField(rd_mol, confId=i) + energies.append(ff.CalcEnergy()) + except Exception: + energies.append(float("nan")) + if return_status and len(statuses) <= i: + statuses.append(0) + + return (xyzs, energies) if not return_status else (xyzs, energies, statuses) + +def old_rdkit_force_field(label: str, rd_mol: Optional[RDMol], mol: Optional[Molecule] = None, num_confs: Optional[int] = None, @@ -1597,6 +1711,7 @@ def rdkit_force_field(label: str, return xyzs, energies + def get_wells(label, angles, blank=WELL_GAP): """ Determine the distinct wells from a list of angles. diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000000..7b74c7fdb4 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Benchmark old vs new RDKit force-field optimization paths in ARC. + +- Embeds conformers once (shared for both paths). +- Times rdkit_force_field (new batch path) vs old_rdkit_force_field (legacy loop). +- Reports mean/median/min/max, per-conformer cost, and simple sanity checks. + +Usage examples are at the end of this file or run: python bench_ff.py -h +""" + +import argparse +import gc +import statistics as stats +import time +from typing import Tuple + +from rdkit import Chem + +# ARC imports +import arc.species.conformers as conformers +from arc.species.species import ARCSpecies + + +def _time_once(fn, *args, **kwargs) -> Tuple[float, Tuple[list, list]]: + """Time a single call and return (elapsed_seconds, (xyzs, energies)).""" + gcold = gc.isenabled() + try: + gc.disable() + t0 = time.perf_counter() + out = fn(*args, **kwargs) + dt = time.perf_counter() - t0 + finally: + if gcold: + gc.enable() + return dt, out + + +def _fmt_s(x: float) -> str: + return f"{x:.6f}s" + + +def _fmt_ms(x: float) -> str: + return f"{x*1e3:.3f} ms" + + +def run_benchmark( + smiles: str, + nconfs: int, + repeats: int, + force_field: str, + optimize: bool, + try_ob: bool, +) -> None: + label = "BENCH" + spc = ARCSpecies(label=label, smiles=smiles) + # Embed once to ensure both paths see identical starting coordinates + rd_mol = conformers.embed_rdkit(label=label, mol=spc.mol, num_confs=nconfs, xyz=None) + + # Warm-up (JIT, import caches, allocator warm, etc.) + _time_once(conformers.rdkit_force_field, label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob) + _time_once(conformers.old_rdkit_force_field, label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob) + + # Measure + new_times, old_times = [], [] + new_last, old_last = None, None + + for _ in range(repeats): + dt_new, new_last = _time_once( + conformers.rdkit_force_field, + label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob + ) + new_times.append(dt_new) + + dt_old, old_last = _time_once( + conformers.old_rdkit_force_field, + label, rd_mol, spc.mol, None, force_field, True, optimize, try_ob + ) + old_times.append(dt_old) + + # Unpack last results + new_xyzs, new_energies = new_last + old_xyzs, old_energies = old_last + + # Reporting + print("\n=== ARC RDKit FF Benchmark ===") + print(f"SMILES: {smiles}") + print(f"Conformers: {nconfs}") + print(f"Repeats: {repeats}") + print(f"Force field: {force_field}") + print(f"Optimize: {optimize}") + print(f"Try OpenBabel: {try_ob}") + print() + + def summary(times): + return { + "mean": stats.mean(times), + "median": stats.median(times), + "min": min(times), + "max": max(times), + "per_conf_mean": stats.mean(times) / max(1, nconfs), + } + + s_new = summary(new_times) + s_old = summary(old_times) + + print("New path: rdkit_force_field (batch MMFF)") + print(f" mean: {_fmt_s(s_new['mean'])} ({_fmt_ms(s_new['per_conf_mean'])}/conf)") + print(f" median: {_fmt_s(s_new['median'])}") + print(f" min: {_fmt_s(s_new['min'])}") + print(f" max: {_fmt_s(s_new['max'])}") + + print("\nOld path: old_rdkit_force_field (per-conf loop)") + print(f" mean: {_fmt_s(s_old['mean'])} ({_fmt_ms(s_old['per_conf_mean'])}/conf)") + print(f" median: {_fmt_s(s_old['median'])}") + print(f" min: {_fmt_s(s_old['min'])}") + print(f" max: {_fmt_s(s_old['max'])}") + + # Basic sanity checks + print("\n--- Sanity checks ---") + print(f"New returned: {len(new_xyzs)} xyzs, {len(new_energies)} energies") + print(f"Old returned: {len(old_xyzs)} xyzs, {len(old_energies)} energies") + + if not optimize: + print("Expect energies empty with optimize=False:") + print(f" new energies empty? {len(new_energies) == 0}") + print(f" old energies empty? {len(old_energies) == 0}") + else: + # Compare energy arrays shape, not values (values may differ due to tiny numerical drift/order) + same_len = len(new_energies) == len(old_energies) == nconfs + print(f"Energies length match nconfs? {same_len}") + if same_len and len(new_energies) and len(old_energies): + # Report a quick aggregate difference to spot gross issues + try: + import numpy as np + diff = np.nanmean(np.abs(np.array(new_energies) - np.array(old_energies))) + print(f"Mean |ΔE| (new-old): {diff:.6g}") + except Exception: + pass + + # Simple speed ratio + if s_old["mean"] > 0: + ratio = s_old["mean"] / s_new["mean"] + print(f"\nSpeedup (old/new mean): {ratio:.2f}×") + + +def main(): + p = argparse.ArgumentParser(description="Benchmark ARC RDKit force-field paths (old vs new).") + p.add_argument("--smiles", required=True, help="Input molecule as SMILES.") + p.add_argument("--nconfs", type=int, default=50, help="Number of conformers to embed.") + p.add_argument("--repeats", type=int, default=3, help="Number of timing repeats per path.") + p.add_argument("--ff", default="MMFF94s", help="Force field variant (MMFF94 or MMFF94s).") + p.add_argument("--no-opt", action="store_true", help="Disable optimization (just read coords).") + p.add_argument("--try-ob", action="store_true", help="Allow OpenBabel fallback if RDKit fails.") + args = p.parse_args() + + run_benchmark( + smiles=args.smiles, + nconfs=args.nconfs, + repeats=args.repeats, + force_field=args.ff, + optimize=not args.no_opt, + try_ob=args.try_ob, + ) + + +if __name__ == "__main__": + main() From 66057b0f6c18dcdd016ea948cb9a86321cf8f368 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Mon, 29 Sep 2025 23:21:57 +0300 Subject: [PATCH 2/3] Add parallel processing support for OpenBabel force field optimization in conformers module --- arc/species/conformers.py | 254 +++++++++++++++++---------------- arc/species/conformers_test.py | 38 ++++- 2 files changed, 168 insertions(+), 124 deletions(-) diff --git a/arc/species/conformers.py b/arc/species/conformers.py index 7fdddddce0..c72e34b365 100644 --- a/arc/species/conformers.py +++ b/arc/species/conformers.py @@ -35,13 +35,15 @@ get_lowest_confs """ - +import os import copy import logging import sys import time from itertools import product from typing import Dict, List, Optional, Tuple, Union +from concurrent.futures import ProcessPoolExecutor, as_completed +from itertools import repeat from openbabel import openbabel as ob from openbabel import pybel as pyb @@ -1230,8 +1232,35 @@ def get_force_field_energies(label: str, f'Ghemical, or GAFF. Got: {force_field}.') return xyzs, energies +def _ob_optimize_xyz(xyz_str, force_field, optimize = True): + """ + A helper function to optimize one xyz string with OpenBabel in a child process. + This avoids OpenBabel's internal errors when used in parallel. + + Args: + xyz_str (str): The xyz string to optimize. + force_field (str): The force field to use. + optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. -def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94s', optimize=True): + Returns: + Tuple[str, float]: The optimized xyz (in ARC XYZ format) and its energy. + """ + obconv = ob.OBConversion() + obconv.SetInAndOutFormats('xyz', 'xyz') + obmol = ob.OBMol() + if not obconv.ReadString(obmol, xyz_str): + return None, None + ff = ob.OBForceField.FindForceField(force_field) + if ff is None or not ff.Setup(obmol): + return None, None + if optimize: + ff.ConjugateGradients(2000) + en = ff.Energy() + out_xyz = '\n'.join(obconv.WriteString(obmol).splitlines()[2:]) + return converter.str_to_xyz(out_xyz), en + + +def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94s', optimize=True, nprocs: int = 1): """ Optimize RDKit conformers by OpenBabel using a force field (MMFF94 or MMFF94s are recommended). This is a fallback method when RDKit fails to generate force field optimized conformers. @@ -1241,6 +1270,7 @@ def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94 rd_mol (RDKit RDMol): The RDKit molecule with embedded conformers to optimize. force_field (str, optional): The type of force field to use. optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. + nprocs (int, optional): The number of processors to use. Returns: Tuple[list, list]: @@ -1251,33 +1281,45 @@ def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94 if not rd_mol.GetNumConformers(): logger.warning(f'Could not generate conformers for {label} via OpenBabel') return xyzs, energies - # Set up Openbabel input and output format + + # Let's prepare XYZ strings once in parent (avoid pickling issues with OB objects) obconversion = ob.OBConversion() obconversion.SetInAndOutFormats('xyz', 'xyz') - # Set up Openbabel force field - ff = ob.OBForceField.FindForceField(force_field) symbols = [rd_atom.GetSymbol() for rd_atom in rd_mol.GetAtoms()] + + xyz_jobs = list() for i in range(rd_mol.GetNumConformers()): - # Convert RDKit conformer to xyz string conf = rd_mol.GetConformer(i) xyz_str = f'{conf.GetNumAtoms()}\n\n' for j in range(conf.GetNumAtoms()): - xyz_str += symbols[j] + ' ' pt = conf.GetAtomPosition(j) - xyz_str += ' '.join([str(pt.x), str(pt.y), str(pt.z)]) + '\n' - # Build OpenBabel molecule from xyz string + xyz_str += f"{symbols[j]} {pt.x} {pt.y} {pt.z}\n" + xyz_jobs.append((xyz_str, force_field, optimize)) + + if nprocs and nprocs > 1 and len(xyz_jobs) > 1: + os.environ.setdefault('OMP_NUM_THREADS', '1') + with ProcessPoolExecutor(max_workers=nprocs) as executor: + xyz_iter = (s for (s, _, _) in xyz_jobs) + ff_iter = repeat(force_field) + opt_iter = repeat(optimize) + for xyz_dict, en in executor.map(_ob_optimize_xyz, xyz_iter, ff_iter, opt_iter): + if xyz_dict is not None: + xyzs.append(xyz_dict) + energies.append(en) + return xyzs, energies + + ff = ob.OBForceField.FindForceField(force_field) + for xyz_str, _, _ in xyz_jobs: ob_mol = ob.OBMol() obconversion.ReadString(ob_mol, xyz_str) ff.Setup(ob_mol) - # Optimize the molecule if needed if optimize: ff.ConjugateGradients(2000) - # Export xyzs and energies ob_mol.GetCoordinates() ff.GetCoordinates(ob_mol) energies.append(ff.Energy()) - xyz_str = '\n'.join(obconversion.WriteString(ob_mol).splitlines()[2:]) - xyzs.append(converter.str_to_xyz(xyz_str)) + out_xyz = '\n'.join(obconversion.WriteString(ob_mol).splitlines()[2:]) + xyzs.append(converter.str_to_xyz(out_xyz)) return xyzs, energies @@ -1332,8 +1374,34 @@ def mix_rdkit_and_openbabel_force_field(label, energies.extend(energies_) return xyzs, energies +def _optimize_one_conf_with_ob(args): + """ + A helper function for parallelizing the optimization of one conformer with OpenBabel in a child process. + This avoids OB object pickling issues. + """ + xyz_str, force_field = args + obconv = ob.OBConversion() + obconv.SetInAndOutFormats('xyz', 'xyz') + + obmol = ob.OBMol() + if not obconv.ReadString(obmol, xyz_str): + return None, None + + ff = ob.OBForceField.FindForceField(force_field) + if ff is None: + return None, None + + if not ff.Setup(obmol): + return None, None + + en = ff.Energy() + # Export xyz as ARC dict + ob_out = obconv.WriteString(obmol) + xyz_body = '\n'.join(ob_out.splitlines()[2:]) + xyz_dict = converter.str_to_xyz(xyz_body) + return xyz_dict, en -def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAFF', method='diverse'): +def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAFF', method='diverse', nprocs=1): """ Optimize conformers using a force field (GAFF, MMFF94s, MMFF94, UFF, Ghemical). @@ -1345,43 +1413,46 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF force_field (str, optional): The type of force field to use. method (str, optional): The conformer searching method to use in OpenBabel. For method description, see https://openbabel.org/dev-api/group__conformer.shtml + nprocs (int, optional): The number of processors to use. If >1, will parallelize the optimization of conformers. Returns: Tuple[list, list]: - Entries are optimized xyz's in a list format. - Entries are float numbers representing the energies in kJ/mol. - """ + """ xyzs, energies = list(), list() ff = ob.OBForceField.FindForceField(force_field) + if ff is None: + raise RuntimeError(f'Could not find force field {force_field} in OpenBabel for {label}.') + if xyz is not None: obmol = ob.OBMol() atoms = mol.vertices - ob_atom_ids = dict() for i, atom in enumerate(atoms): a = obmol.NewAtom() a.SetAtomicNum(atom.number) - a.SetVector(xyz['coords'][i][0], xyz['coords'][i][1], xyz['coords'][i][2]) + x, y, z = xyz['coords'][i] + a.SetVector(x, y, z) if atom.element.isotope != -1: a.SetIsotope(atom.element.isotope) a.SetFormalCharge(atom.charge) - ob_atom_ids[atom] = a.GetId() orders = {1: 1, 2: 2, 3: 3, 4: 4, 1.5: 5} - for atom1 in mol.vertices: - for atom2, bond in atom1.edges.items(): + for a1 in mol.vertices: + for a2, bond in a1.edges.items(): if bond.is_hydrogen_bond(): continue - index1 = atoms.index(atom1) - index2 = atoms.index(atom2) + index1 = atoms.index(a1) + index2 = atoms.index(a2) if index1 < index2: obmol.AddBond(index1 + 1, index2 + 1, orders[bond.order]) - + # optimize ff.Setup(obmol) ff.SetLogLevel(0) ff.SetVDWCutOff(6.0) # The VDW cut-off distance (default=6.0) ff.SetElectrostaticCutOff(10.0) # The Electrostatic cut-off distance (default=10.0) - ff.SetUpdateFrequency(10) # The frequency to update the non-bonded pairs (default=10) + ff.SetUpdateFrequency(10) # The frequency to update the non-bonded pairs (default=10 ff.EnableCutOff(False) # Use cut-off (default=don't use cut-off) ff.SteepestDescentInitialize() # ConjugateGradientsInitialize v = 1 @@ -1390,14 +1461,16 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF if ff.DetectExplosion(): raise ConformerError(f'Force field {force_field} exploded with method SteepestDescent for {label}') ff.GetCoordinates(obmol) - + elif num_confs is not None: - obmol, ob_atom_ids = to_ob_mol(mol, return_mapping=True) + obmol, _idmap = to_ob_mol(mol, return_mapping=True) pybmol = pyb.Molecule(obmol) pybmol.make3D() obmol = pybmol.OBMol - ff.Setup(obmol) - + + if not ff.Setup(obmol): + return xyzs, energies + if method.lower() == 'weighted': ff.WeightedRotorSearch(num_confs, 2000) elif method.lower() == 'random': @@ -1411,25 +1484,38 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF ff.SystematicRotorSearch(num_confs) else: raise ConformerError(f'Could not identify method {method} for {label}') - else: - raise ConformerError(f'Either num_confs or xyz should be given for {label}') - - ff.GetConformers(obmol) - obconversion = ob.OBConversion() - obconversion.SetOutFormat('xyz') - - for i in range(obmol.NumConformers()): - obmol.SetConformer(i) - ff.Setup(obmol) - xyz_str = '\n'.join(obconversion.WriteString(obmol).splitlines()[2:]) - xyz_dict = converter.str_to_xyz(xyz_str) - xyz_dict['coords'] = tuple(xyz_dict['coords'][ob_atom_ids[mol.atoms[j]]] - for j in range(len(xyz_dict['coords']))) - xyzs.append(xyz_dict) - energies.append(ff.Energy()) + + ff.GetConformers(obmol) + obconversion = ob.OBConversion() + obconversion.SetOutFormat('xyz') + + xyz_jobs = [] + for i in range(obmol.NumConformers()): + obmol.SetConformer(i) + ff.Setup(obmol) + xyz_full = obconversion.WriteString(obmol) + xyz_jobs.append((xyz_full, force_field)) + + if nprocs and nprocs > 1: + # Avoid oversubscription if OB uses any internal threads + os.environ.setdefault('OMP_NUM_THREADS', '1') + with ProcessPoolExecutor(max_workers=nprocs) as executor: + futs = [executor.submit(_optimize_one_conf_with_ob, job) for job in xyz_jobs] + for fut in as_completed(futs): + res = fut.result() + if res is not None: + x, e = res + xyzs.append(x) + energies.append(e) + else: + for job in xyz_jobs: + x, e = _optimize_one_conf_with_ob(job) + if x is not None: + xyzs.append(x) + energies.append(e) + return xyzs, energies - def embed_rdkit(label, mol, num_confs=None, xyz=None): """ Generate unoptimized conformers in RDKit. If ``xyz`` is not given, random conformers will be generated. @@ -1634,84 +1720,6 @@ def rdkit_force_field(label: str, return (xyzs, energies) if not return_status else (xyzs, energies, statuses) -def old_rdkit_force_field(label: str, - rd_mol: Optional[RDMol], - mol: Optional[Molecule] = None, - num_confs: Optional[int] = None, - force_field: str = 'MMFF94s', - try_uff: bool = True, - optimize: bool = True, - try_ob: bool = False, - ) -> Tuple[list, list]: - """ - Optimize RDKit conformers using a force field (MMFF94 or MMFF94s are recommended). - For UFF see: https://www.rdkit.org/docs/source/rdkit.Chem.rdForceFieldHelpers.html#rdkit.Chem.rdForceFieldHelpers.UFFOptimizeMoleculeConfs - - Args: - label (str): The species' label. - rd_mol (RDKit RDMol): The RDKit molecule with embedded conformers to optimize. - mol (Molecule, optional): The RMG molecule object with connectivity and bond order information. - num_confs (int, optional): The number of random 3D conformations to generate. - force_field (str, optional): The type of force field to use. - try_uff (bool, optional): Whether to try UFF if MMFF94(s) fails. ``True`` by default. - optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. - try_ob (bool, optional): Whether to try OpenBabel if RDKit fails. ``True`` to try, ``False`` by default. - - Returns: - Tuple[list, list]: - - Entries are optimized xyz's in a dictionary format. - - Entries are float numbers representing the energies. - """ - xyzs, energies = list(), list() - if rd_mol is None: - return xyzs, energies - for i in range(rd_mol.GetNumConformers()): - if optimize: - v, j = 1, 0 - while v == 1 and j < 200: # v == 1: continue, v == 0: enough steps, v == -1: unable to set up - try: - v = Chem.AllChem.MMFFOptimizeMolecule(rd_mol, - mmffVariant=force_field, - confId=i, - maxIters=500, - ignoreInterfragInteractions=False, - ) - except: - pass - else: - j += 1 - mol_properties = Chem.AllChem.MMFFGetMoleculeProperties(rd_mol, mmffVariant=force_field) - if mol_properties is not None: - ff = Chem.AllChem.MMFFGetMoleculeForceField(rd_mol, mol_properties, confId=i) - if optimize: - energies.append(ff.CalcEnergy()) - xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) - if not len(xyzs) and 'MMFF' in force_field and try_uff: - output = None - if optimize: - try: - output = Chem.AllChem.UFFOptimizeMoleculeConfs(rd_mol, - maxIters=200, - ignoreInterfragInteractions=True, - ) - except (RuntimeError, ValueError): - if try_ob: - logger.warning(f'Using OpenBabel (instead of RDKit) as a fall back method to generate conformers ' - f'for {label}. This is often slower.') - xyzs, energies = openbabel_force_field_on_rdkit_conformers(label, - rd_mol, - force_field=force_field, - optimize=optimize, - ) - return xyzs, energies - for i in range(rd_mol.GetNumConformers()): - if output is not None and output[i][0] == 0: # The optimization converged. - energies.append(output[i][1]) - xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) - return xyzs, energies - - - def get_wells(label, angles, blank=WELL_GAP): """ Determine the distinct wells from a list of angles. diff --git a/arc/species/conformers_test.py b/arc/species/conformers_test.py index ff7d028dc3..1a380edbdf 100644 --- a/arc/species/conformers_test.py +++ b/arc/species/conformers_test.py @@ -6,6 +6,7 @@ """ import unittest +import pytest from rdkit.Chem import rdMolTransforms as rdMT @@ -554,7 +555,7 @@ def test_openbabel_force_field(self): method='diverse', ) self.assertEqual(len(xyzs), 1) - self.assertAlmostEqual(energies[0], 2.931930, 3) + self.assertAlmostEqual(energies[0], 2.931930, delta=0.001) def test_openbabel_force_field_on_rdkit_conformers(self): """Test Open Babel force field on RDKit conformers""" @@ -604,6 +605,41 @@ def test_openbabel_force_field_on_rdkit_conformers(self): self.assertEqual(xyzs[0]['symbols'], expected_xyzs[0]['symbols']) self.assertEqual(xyzs[1]['symbols'], expected_xyzs[1]['symbols']) + def test_openbabel_force_field_on_rdkit_conformers_parallel(self): + """Parallel and serial OB-on-RDKit paths give identical results (up to ordering).""" + xyz = converter.str_to_xyz("""C -2.18276 2.03598 0.00028 + C -0.83696 1.34108 -0.05231 + H -2.23808 2.82717 -0.75474 + H -2.33219 2.51406 0.97405 + H -2.99589 1.32546 -0.17267 + O 0.18176 2.30786 0.17821 + H -0.69161 0.88171 -1.03641 + H -0.78712 0.56391 0.71847 + O 1.39175 1.59510 0.11494""") + spc = ARCSpecies(label='CCO[O]', smiles='CCO[O]', xyz=xyz) + rd_mol = conformers.embed_rdkit(label='', mol=spc.mol, num_confs=2, xyz=xyz) + + # serial + xyzs_s, energies_s = conformers.openbabel_force_field_on_rdkit_conformers( + label='', rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=1 + ) + # parallel + xyzs_p, energies_p = conformers.openbabel_force_field_on_rdkit_conformers( + label='', rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=2 + ) + + # same counts + self.assertEqual(len(energies_s), len(energies_p)) + self.assertEqual(len(xyzs_s), len(xyzs_p)) + + # energies equal up to ordering + self.assertEqual(sorted(energies_s), pytest.approx(sorted(energies_p), abs=1e-8)) + + # compare symbols only (OB coords can differ slightly across platforms) + sym_s = sorted([tuple(x['symbols']) for x in xyzs_s]) + sym_p = sorted([tuple(x['symbols']) for x in xyzs_p]) + self.assertEqual(sym_s, sym_p) + def test_embed_rdkit(self): """Test embedding in RDKit""" rd_mol = conformers.embed_rdkit(label='CJ', mol=self.cj_spc.mol, num_confs=1) From a0ce7f67c497fe95c0f1868fdee489b625501ba6 Mon Sep 17 00:00:00 2001 From: Calvin Pieters Date: Wed, 8 Oct 2025 15:53:54 +0300 Subject: [PATCH 3/3] Parallelizes OpenBabel conformer optimization Speeds up conformer generation by parallelizing the OpenBabel force field optimization. This involves refactoring the OpenBabel conformer optimization functions to allow for parallel processing using ProcessPoolExecutor, improving performance for species with many conformers. --- arc/species/conformers.py | 254 +++++++++++++++++---------------- arc/species/conformers_test.py | 38 ++++- 2 files changed, 168 insertions(+), 124 deletions(-) diff --git a/arc/species/conformers.py b/arc/species/conformers.py index 7fdddddce0..c72e34b365 100644 --- a/arc/species/conformers.py +++ b/arc/species/conformers.py @@ -35,13 +35,15 @@ get_lowest_confs """ - +import os import copy import logging import sys import time from itertools import product from typing import Dict, List, Optional, Tuple, Union +from concurrent.futures import ProcessPoolExecutor, as_completed +from itertools import repeat from openbabel import openbabel as ob from openbabel import pybel as pyb @@ -1230,8 +1232,35 @@ def get_force_field_energies(label: str, f'Ghemical, or GAFF. Got: {force_field}.') return xyzs, energies +def _ob_optimize_xyz(xyz_str, force_field, optimize = True): + """ + A helper function to optimize one xyz string with OpenBabel in a child process. + This avoids OpenBabel's internal errors when used in parallel. + + Args: + xyz_str (str): The xyz string to optimize. + force_field (str): The force field to use. + optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. -def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94s', optimize=True): + Returns: + Tuple[str, float]: The optimized xyz (in ARC XYZ format) and its energy. + """ + obconv = ob.OBConversion() + obconv.SetInAndOutFormats('xyz', 'xyz') + obmol = ob.OBMol() + if not obconv.ReadString(obmol, xyz_str): + return None, None + ff = ob.OBForceField.FindForceField(force_field) + if ff is None or not ff.Setup(obmol): + return None, None + if optimize: + ff.ConjugateGradients(2000) + en = ff.Energy() + out_xyz = '\n'.join(obconv.WriteString(obmol).splitlines()[2:]) + return converter.str_to_xyz(out_xyz), en + + +def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94s', optimize=True, nprocs: int = 1): """ Optimize RDKit conformers by OpenBabel using a force field (MMFF94 or MMFF94s are recommended). This is a fallback method when RDKit fails to generate force field optimized conformers. @@ -1241,6 +1270,7 @@ def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94 rd_mol (RDKit RDMol): The RDKit molecule with embedded conformers to optimize. force_field (str, optional): The type of force field to use. optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. + nprocs (int, optional): The number of processors to use. Returns: Tuple[list, list]: @@ -1251,33 +1281,45 @@ def openbabel_force_field_on_rdkit_conformers(label, rd_mol, force_field='MMFF94 if not rd_mol.GetNumConformers(): logger.warning(f'Could not generate conformers for {label} via OpenBabel') return xyzs, energies - # Set up Openbabel input and output format + + # Let's prepare XYZ strings once in parent (avoid pickling issues with OB objects) obconversion = ob.OBConversion() obconversion.SetInAndOutFormats('xyz', 'xyz') - # Set up Openbabel force field - ff = ob.OBForceField.FindForceField(force_field) symbols = [rd_atom.GetSymbol() for rd_atom in rd_mol.GetAtoms()] + + xyz_jobs = list() for i in range(rd_mol.GetNumConformers()): - # Convert RDKit conformer to xyz string conf = rd_mol.GetConformer(i) xyz_str = f'{conf.GetNumAtoms()}\n\n' for j in range(conf.GetNumAtoms()): - xyz_str += symbols[j] + ' ' pt = conf.GetAtomPosition(j) - xyz_str += ' '.join([str(pt.x), str(pt.y), str(pt.z)]) + '\n' - # Build OpenBabel molecule from xyz string + xyz_str += f"{symbols[j]} {pt.x} {pt.y} {pt.z}\n" + xyz_jobs.append((xyz_str, force_field, optimize)) + + if nprocs and nprocs > 1 and len(xyz_jobs) > 1: + os.environ.setdefault('OMP_NUM_THREADS', '1') + with ProcessPoolExecutor(max_workers=nprocs) as executor: + xyz_iter = (s for (s, _, _) in xyz_jobs) + ff_iter = repeat(force_field) + opt_iter = repeat(optimize) + for xyz_dict, en in executor.map(_ob_optimize_xyz, xyz_iter, ff_iter, opt_iter): + if xyz_dict is not None: + xyzs.append(xyz_dict) + energies.append(en) + return xyzs, energies + + ff = ob.OBForceField.FindForceField(force_field) + for xyz_str, _, _ in xyz_jobs: ob_mol = ob.OBMol() obconversion.ReadString(ob_mol, xyz_str) ff.Setup(ob_mol) - # Optimize the molecule if needed if optimize: ff.ConjugateGradients(2000) - # Export xyzs and energies ob_mol.GetCoordinates() ff.GetCoordinates(ob_mol) energies.append(ff.Energy()) - xyz_str = '\n'.join(obconversion.WriteString(ob_mol).splitlines()[2:]) - xyzs.append(converter.str_to_xyz(xyz_str)) + out_xyz = '\n'.join(obconversion.WriteString(ob_mol).splitlines()[2:]) + xyzs.append(converter.str_to_xyz(out_xyz)) return xyzs, energies @@ -1332,8 +1374,34 @@ def mix_rdkit_and_openbabel_force_field(label, energies.extend(energies_) return xyzs, energies +def _optimize_one_conf_with_ob(args): + """ + A helper function for parallelizing the optimization of one conformer with OpenBabel in a child process. + This avoids OB object pickling issues. + """ + xyz_str, force_field = args + obconv = ob.OBConversion() + obconv.SetInAndOutFormats('xyz', 'xyz') + + obmol = ob.OBMol() + if not obconv.ReadString(obmol, xyz_str): + return None, None + + ff = ob.OBForceField.FindForceField(force_field) + if ff is None: + return None, None + + if not ff.Setup(obmol): + return None, None + + en = ff.Energy() + # Export xyz as ARC dict + ob_out = obconv.WriteString(obmol) + xyz_body = '\n'.join(ob_out.splitlines()[2:]) + xyz_dict = converter.str_to_xyz(xyz_body) + return xyz_dict, en -def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAFF', method='diverse'): +def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAFF', method='diverse', nprocs=1): """ Optimize conformers using a force field (GAFF, MMFF94s, MMFF94, UFF, Ghemical). @@ -1345,43 +1413,46 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF force_field (str, optional): The type of force field to use. method (str, optional): The conformer searching method to use in OpenBabel. For method description, see https://openbabel.org/dev-api/group__conformer.shtml + nprocs (int, optional): The number of processors to use. If >1, will parallelize the optimization of conformers. Returns: Tuple[list, list]: - Entries are optimized xyz's in a list format. - Entries are float numbers representing the energies in kJ/mol. - """ + """ xyzs, energies = list(), list() ff = ob.OBForceField.FindForceField(force_field) + if ff is None: + raise RuntimeError(f'Could not find force field {force_field} in OpenBabel for {label}.') + if xyz is not None: obmol = ob.OBMol() atoms = mol.vertices - ob_atom_ids = dict() for i, atom in enumerate(atoms): a = obmol.NewAtom() a.SetAtomicNum(atom.number) - a.SetVector(xyz['coords'][i][0], xyz['coords'][i][1], xyz['coords'][i][2]) + x, y, z = xyz['coords'][i] + a.SetVector(x, y, z) if atom.element.isotope != -1: a.SetIsotope(atom.element.isotope) a.SetFormalCharge(atom.charge) - ob_atom_ids[atom] = a.GetId() orders = {1: 1, 2: 2, 3: 3, 4: 4, 1.5: 5} - for atom1 in mol.vertices: - for atom2, bond in atom1.edges.items(): + for a1 in mol.vertices: + for a2, bond in a1.edges.items(): if bond.is_hydrogen_bond(): continue - index1 = atoms.index(atom1) - index2 = atoms.index(atom2) + index1 = atoms.index(a1) + index2 = atoms.index(a2) if index1 < index2: obmol.AddBond(index1 + 1, index2 + 1, orders[bond.order]) - + # optimize ff.Setup(obmol) ff.SetLogLevel(0) ff.SetVDWCutOff(6.0) # The VDW cut-off distance (default=6.0) ff.SetElectrostaticCutOff(10.0) # The Electrostatic cut-off distance (default=10.0) - ff.SetUpdateFrequency(10) # The frequency to update the non-bonded pairs (default=10) + ff.SetUpdateFrequency(10) # The frequency to update the non-bonded pairs (default=10 ff.EnableCutOff(False) # Use cut-off (default=don't use cut-off) ff.SteepestDescentInitialize() # ConjugateGradientsInitialize v = 1 @@ -1390,14 +1461,16 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF if ff.DetectExplosion(): raise ConformerError(f'Force field {force_field} exploded with method SteepestDescent for {label}') ff.GetCoordinates(obmol) - + elif num_confs is not None: - obmol, ob_atom_ids = to_ob_mol(mol, return_mapping=True) + obmol, _idmap = to_ob_mol(mol, return_mapping=True) pybmol = pyb.Molecule(obmol) pybmol.make3D() obmol = pybmol.OBMol - ff.Setup(obmol) - + + if not ff.Setup(obmol): + return xyzs, energies + if method.lower() == 'weighted': ff.WeightedRotorSearch(num_confs, 2000) elif method.lower() == 'random': @@ -1411,25 +1484,38 @@ def openbabel_force_field(label, mol, num_confs=None, xyz=None, force_field='GAF ff.SystematicRotorSearch(num_confs) else: raise ConformerError(f'Could not identify method {method} for {label}') - else: - raise ConformerError(f'Either num_confs or xyz should be given for {label}') - - ff.GetConformers(obmol) - obconversion = ob.OBConversion() - obconversion.SetOutFormat('xyz') - - for i in range(obmol.NumConformers()): - obmol.SetConformer(i) - ff.Setup(obmol) - xyz_str = '\n'.join(obconversion.WriteString(obmol).splitlines()[2:]) - xyz_dict = converter.str_to_xyz(xyz_str) - xyz_dict['coords'] = tuple(xyz_dict['coords'][ob_atom_ids[mol.atoms[j]]] - for j in range(len(xyz_dict['coords']))) - xyzs.append(xyz_dict) - energies.append(ff.Energy()) + + ff.GetConformers(obmol) + obconversion = ob.OBConversion() + obconversion.SetOutFormat('xyz') + + xyz_jobs = [] + for i in range(obmol.NumConformers()): + obmol.SetConformer(i) + ff.Setup(obmol) + xyz_full = obconversion.WriteString(obmol) + xyz_jobs.append((xyz_full, force_field)) + + if nprocs and nprocs > 1: + # Avoid oversubscription if OB uses any internal threads + os.environ.setdefault('OMP_NUM_THREADS', '1') + with ProcessPoolExecutor(max_workers=nprocs) as executor: + futs = [executor.submit(_optimize_one_conf_with_ob, job) for job in xyz_jobs] + for fut in as_completed(futs): + res = fut.result() + if res is not None: + x, e = res + xyzs.append(x) + energies.append(e) + else: + for job in xyz_jobs: + x, e = _optimize_one_conf_with_ob(job) + if x is not None: + xyzs.append(x) + energies.append(e) + return xyzs, energies - def embed_rdkit(label, mol, num_confs=None, xyz=None): """ Generate unoptimized conformers in RDKit. If ``xyz`` is not given, random conformers will be generated. @@ -1634,84 +1720,6 @@ def rdkit_force_field(label: str, return (xyzs, energies) if not return_status else (xyzs, energies, statuses) -def old_rdkit_force_field(label: str, - rd_mol: Optional[RDMol], - mol: Optional[Molecule] = None, - num_confs: Optional[int] = None, - force_field: str = 'MMFF94s', - try_uff: bool = True, - optimize: bool = True, - try_ob: bool = False, - ) -> Tuple[list, list]: - """ - Optimize RDKit conformers using a force field (MMFF94 or MMFF94s are recommended). - For UFF see: https://www.rdkit.org/docs/source/rdkit.Chem.rdForceFieldHelpers.html#rdkit.Chem.rdForceFieldHelpers.UFFOptimizeMoleculeConfs - - Args: - label (str): The species' label. - rd_mol (RDKit RDMol): The RDKit molecule with embedded conformers to optimize. - mol (Molecule, optional): The RMG molecule object with connectivity and bond order information. - num_confs (int, optional): The number of random 3D conformations to generate. - force_field (str, optional): The type of force field to use. - try_uff (bool, optional): Whether to try UFF if MMFF94(s) fails. ``True`` by default. - optimize (bool, optional): Whether to first optimize the conformer using FF. True to optimize. - try_ob (bool, optional): Whether to try OpenBabel if RDKit fails. ``True`` to try, ``False`` by default. - - Returns: - Tuple[list, list]: - - Entries are optimized xyz's in a dictionary format. - - Entries are float numbers representing the energies. - """ - xyzs, energies = list(), list() - if rd_mol is None: - return xyzs, energies - for i in range(rd_mol.GetNumConformers()): - if optimize: - v, j = 1, 0 - while v == 1 and j < 200: # v == 1: continue, v == 0: enough steps, v == -1: unable to set up - try: - v = Chem.AllChem.MMFFOptimizeMolecule(rd_mol, - mmffVariant=force_field, - confId=i, - maxIters=500, - ignoreInterfragInteractions=False, - ) - except: - pass - else: - j += 1 - mol_properties = Chem.AllChem.MMFFGetMoleculeProperties(rd_mol, mmffVariant=force_field) - if mol_properties is not None: - ff = Chem.AllChem.MMFFGetMoleculeForceField(rd_mol, mol_properties, confId=i) - if optimize: - energies.append(ff.CalcEnergy()) - xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) - if not len(xyzs) and 'MMFF' in force_field and try_uff: - output = None - if optimize: - try: - output = Chem.AllChem.UFFOptimizeMoleculeConfs(rd_mol, - maxIters=200, - ignoreInterfragInteractions=True, - ) - except (RuntimeError, ValueError): - if try_ob: - logger.warning(f'Using OpenBabel (instead of RDKit) as a fall back method to generate conformers ' - f'for {label}. This is often slower.') - xyzs, energies = openbabel_force_field_on_rdkit_conformers(label, - rd_mol, - force_field=force_field, - optimize=optimize, - ) - return xyzs, energies - for i in range(rd_mol.GetNumConformers()): - if output is not None and output[i][0] == 0: # The optimization converged. - energies.append(output[i][1]) - xyzs.append(read_rdkit_embedded_conformer_i(rd_mol, i)) - return xyzs, energies - - - def get_wells(label, angles, blank=WELL_GAP): """ Determine the distinct wells from a list of angles. diff --git a/arc/species/conformers_test.py b/arc/species/conformers_test.py index ff7d028dc3..1a380edbdf 100644 --- a/arc/species/conformers_test.py +++ b/arc/species/conformers_test.py @@ -6,6 +6,7 @@ """ import unittest +import pytest from rdkit.Chem import rdMolTransforms as rdMT @@ -554,7 +555,7 @@ def test_openbabel_force_field(self): method='diverse', ) self.assertEqual(len(xyzs), 1) - self.assertAlmostEqual(energies[0], 2.931930, 3) + self.assertAlmostEqual(energies[0], 2.931930, delta=0.001) def test_openbabel_force_field_on_rdkit_conformers(self): """Test Open Babel force field on RDKit conformers""" @@ -604,6 +605,41 @@ def test_openbabel_force_field_on_rdkit_conformers(self): self.assertEqual(xyzs[0]['symbols'], expected_xyzs[0]['symbols']) self.assertEqual(xyzs[1]['symbols'], expected_xyzs[1]['symbols']) + def test_openbabel_force_field_on_rdkit_conformers_parallel(self): + """Parallel and serial OB-on-RDKit paths give identical results (up to ordering).""" + xyz = converter.str_to_xyz("""C -2.18276 2.03598 0.00028 + C -0.83696 1.34108 -0.05231 + H -2.23808 2.82717 -0.75474 + H -2.33219 2.51406 0.97405 + H -2.99589 1.32546 -0.17267 + O 0.18176 2.30786 0.17821 + H -0.69161 0.88171 -1.03641 + H -0.78712 0.56391 0.71847 + O 1.39175 1.59510 0.11494""") + spc = ARCSpecies(label='CCO[O]', smiles='CCO[O]', xyz=xyz) + rd_mol = conformers.embed_rdkit(label='', mol=spc.mol, num_confs=2, xyz=xyz) + + # serial + xyzs_s, energies_s = conformers.openbabel_force_field_on_rdkit_conformers( + label='', rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=1 + ) + # parallel + xyzs_p, energies_p = conformers.openbabel_force_field_on_rdkit_conformers( + label='', rd_mol=rd_mol, force_field='MMFF94s', optimize=True, nprocs=2 + ) + + # same counts + self.assertEqual(len(energies_s), len(energies_p)) + self.assertEqual(len(xyzs_s), len(xyzs_p)) + + # energies equal up to ordering + self.assertEqual(sorted(energies_s), pytest.approx(sorted(energies_p), abs=1e-8)) + + # compare symbols only (OB coords can differ slightly across platforms) + sym_s = sorted([tuple(x['symbols']) for x in xyzs_s]) + sym_p = sorted([tuple(x['symbols']) for x in xyzs_p]) + self.assertEqual(sym_s, sym_p) + def test_embed_rdkit(self): """Test embedding in RDKit""" rd_mol = conformers.embed_rdkit(label='CJ', mol=self.cj_spc.mol, num_confs=1)