Skip to content

Commit a011463

Browse files
authored
Make failures more visible (#179)
- *Category*: feature
- *JIRA issue*: [MIC-4265](https://jira.ihme.washington.edu/browse/MIC-4265)

Changes and notes
- Adds a warning at psimulate epilogue if there were failed jobs
- Updates final job completion log message to include status (finished/total)
1 parent 47831f6 commit a011463

File tree

2 files changed

+21
-5
lines changed

2 files changed

+21
-5
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
**1.3.11 - 09/07/23**
2+
3+
- Made job failures more prominent in end of jobs logging
4+
15
**1.3.10 - 07/12/23**
26

37
- Allow for specifying random seeds and draws in branches file

src/vivarium_cluster_tools/psimulate/runner.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
The main process loop for `psimulate` runs.
77
88
"""
9-
import atexit
9+
from collections import defaultdict
1010
from pathlib import Path
1111
from time import sleep, time
1212

@@ -34,10 +34,11 @@ def process_job_results(
3434
existing_outputs: pd.DataFrame,
3535
output_directory: Path,
3636
no_batch: bool,
37-
) -> None:
37+
) -> defaultdict:
3838
written_results = existing_outputs
3939
unwritten_results = []
4040
batch_size = 0 if no_batch else 200
41+
status = defaultdict(int)
4142

4243
logger.info("Entering main processing loop.")
4344
start_time = time()
@@ -54,7 +55,7 @@ def process_job_results(
5455
batch_size,
5556
)
5657

57-
registry_manager.update_and_report()
58+
status = registry_manager.update_and_report()
5859
logger.info(f"Unwritten results: {len(unwritten_results)}")
5960
logger.info(f"Elapsed time: {(time() - start_time)/60:.1f} minutes.")
6061
finally:
@@ -68,6 +69,7 @@ def process_job_results(
6869
)
6970
logger.info(f"Unwritten results: {len(unwritten_results)}")
7071
logger.info(f"Elapsed time: {(time() - start_time) / 60:.1f} minutes.")
72+
return status
7173

7274

7375
def load_existing_outputs(result_path: Path, restart: bool) -> pd.DataFrame:
@@ -238,7 +240,7 @@ def main(
238240
# Enter the main monitoring and processing loop, which will check on
239241
# all the queues periodically, report status updates, and gather
240242
# and write results when they are available.
241-
process_job_results(
243+
status = process_job_results(
242244
registry_manager=registry_manager,
243245
existing_outputs=existing_outputs,
244246
output_directory=output_paths.root,
@@ -248,4 +250,14 @@ def main(
248250
# Spit out a performance report for the workers.
249251
try_run_vipin(output_paths.worker_logging_root)
250252

251-
logger.info(f"Jobs completed. Results written to: {str(output_paths.root)}")
253+
# Emit warning if any jobs failed
254+
if status["failed"] > 0:
255+
logger.warning(
256+
f"*** NOTE: There {'was' if status['failed'] == 1 else 'were'} "
257+
f"{status['failed']} failed job{'' if status['failed'] == 1 else 's'}. ***"
258+
)
259+
260+
logger.info(
261+
f"{status['finished']} of {status['total']} jobs completed successfully. "
262+
f"Results written to: {str(output_paths.root)}"
263+
)

0 commit comments

Comments
 (0)