Skip to content

Commit 047bd7b

Browse files
committed
Replace dump.sh by dump.py that supports pagination, update docs
1 parent 9e8ca76 commit 047bd7b

File tree

6 files changed

+142
-32
lines changed

6 files changed

+142
-32
lines changed

README.md

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,20 @@ While you can perform backup of the Docker volumes,
176176
for larger upgrades of CWL Viewer it is recommended instead to do a JSON dump
177177
and re-load, which will force CWL Viewer to fetch and parse again.
178178

179-
The script `dump.sh` can be used for regular backups, it will store the full
180-
output of /workflows as a timestamped gzip-compressed JSON file:
181-
182-
$ ./dump.sh https://view.commonwl.org/ /var/backups/cwl
183-
/var/backups/cwl/2018-06-06T135133+0000.json.gz
179+
The script `dump.py` can be used for regular backups; it will store the full
180+
output of /workflows as one or multiple timestamped JSON files (you can use
181+
`gzip` to compress them):
182+
183+
$ python dump.py --viewer https://view.commonwl.org/ --output /var/backups --page 0 --size 100
184+
INFO:Viewer URL: https://view.commonwl.org/
185+
INFO:Output: /var/backups
186+
INFO:Dumping workflows from https://view.commonwl.org/, page 0, size 100 to /var/backups
187+
188+
$ python dump.py -o /var/backups -a
189+
INFO:Viewer URL: https://view.commonwl.org/
190+
INFO:Output: /var/backups
191+
INFO:Dumping all the workflows from https://view.commonwl.org/ to /var/backups
192+
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [04:39<00:00, 17.49s/it]
184193

185194
The script `load.py` (requires Python 3) can be used to restore from such JSON dumps:
186195

docs/mongo-to-postgres/README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ what was done, or to modify it for other use cases. The Python script with
3232
the same name (but `.py` extension) was created based on the Notebook, to
3333
be used in the command line.
3434

35-
To install the dependencies, use `pip install -r requirements.txt
35+
To install the dependencies, use `pip install -r requirements.txt`
3636
in a virtual environment to get the dependencies to run both the
3737
Notebook and the Python script.
38+
39+
There is also a replacement for `dump.sh`, the `dump.py` script that can
40+
paginate the requests to retrieve the complete database in a single command.

docs/mongo-to-postgres/mongo_to_pg.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import numpy as np
1212
import pandas as pd
13-
from tqdm import tqdm
1413

1514

1615
def _to_camel_case(snake_str):
@@ -62,7 +61,7 @@ def mongo_to_pg(file, out):
6261
# from: https://stackoverflow.com/a/68258386
6362
chunks = np.array_split(df.index, 100) # chunks of 100 rows
6463

65-
for chunk, subset in enumerate(tqdm(chunks)):
64+
for chunk, subset in enumerate(chunks):
6665
if chunk == 0: # first row
6766
df.loc[subset].to_csv(out, mode='w', index=True)
6867
else:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
jupyterlab
22
numpy
33
pandas
4-
tqdm
4+
requests==2.27.*

dump.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import argparse
2+
import logging
3+
from datetime import datetime
4+
from pathlib import Path
5+
from urllib.parse import urljoin
6+
7+
import requests
8+
from math import ceil
9+
10+
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
11+
logger = logging.getLogger(__name__)
12+
13+
DEFAULT_PAGE = 0
14+
DEFAULT_SIZE = 10
15+
MAX_PAGE_SIZE = 2000
16+
17+
18+
def _get_total_elements(viewer: str) -> int:
    """
    Determine how many workflow entries exist in a CWL Viewer instance.

    The REST API has no dedicated count endpoint, so we request a single
    workflow and read the ``totalElements`` field from the paginated
    listing response.

    :param viewer: CWL Viewer instance URL
    :return: number of total elements in the CWL Viewer instance DB
    """
    listing: dict = _fetch_workflows_data(viewer, 0, 1).json()
    return int(listing['totalElements'])
29+
30+
31+
def _dump_all_workflows(viewer: str, output: Path) -> None:
    """
    Dump every workflow in the database, one page at a time.

    Pages of ``MAX_PAGE_SIZE`` entries are fetched until the total
    reported by the server has been covered.

    :param viewer: CWL Viewer instance URL
    :param output: Local existing directory
    :return: None
    """
    total = _get_total_elements(viewer)
    page_count = ceil(total / MAX_PAGE_SIZE)
    for page_number in range(page_count):
        _dump_workflows(viewer, output, page_number, MAX_PAGE_SIZE)
42+
43+
44+
def _dump_workflows(viewer: str, output: Path, page: int, size: int) -> None:
    """
    Dump a single page of workflows to a timestamped JSON file.

    The page number is part of the file name because the timestamp only
    has second resolution: dumping several pages in quick succession (as
    ``_dump_all_workflows`` does) would otherwise silently overwrite the
    earlier pages' files.

    :param viewer: CWL Viewer instance URL
    :param output: Local existing directory
    :param page: Page number (first is zero)
    :param size: Number of elements to retrieve
    :return: None
    """
    response = _fetch_workflows_data(viewer, page, size)
    # .astimezone() yields an aware local datetime, so %z actually
    # produces a UTC offset (it is empty for naive datetime.now()).
    timestamp = datetime.now().astimezone().strftime('%Y-%m-%dT%H%M%S%z')
    file_output = output / f'{timestamp}-page{page}.json'
    logger.debug(f'Dumping page {page}, size {size}, to {file_output}')
    with file_output.open('w', encoding='utf-8') as f:
        f.write(response.text)
60+
61+
62+
def _fetch_workflows_data(viewer: str, page: int, size: int) -> requests.Response:
    """
    Fetch one page of the /workflows listing.

    The returned ``requests.Response`` can be turned into JSON with a
    simple ``response.json()``, or into text via ``response.text``.

    :param viewer: CWL Viewer instance URL
    :param page: Page number (first is zero)
    :param size: Number of elements to retrieve
    :raises requests.HTTPError: if the server answers with an error status
    :return: ``requests.Response`` instance
    """
    logger.debug(f'Fetching page {page}, size {size}')
    url = urljoin(viewer, f'/workflows?page={page}&size={size}')
    logger.debug(f'URL: {url}')
    response = requests.get(url, headers={
        'accept': 'application/json'
    })
    # Fail loudly on HTTP errors instead of letting the caller silently
    # write an error body into a backup file.
    response.raise_for_status()
    return response
79+
80+
81+
def main():
    """Command-line entry point: parse arguments, validate them, dump workflows."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--viewer", help="server base URL", default='https://view.commonwl.org/')
    parser.add_argument("-o", "--output", help="output directory", required=True)
    parser.add_argument("-p", "--page", help="what workflows page to retrieve", type=int, default=0)
    parser.add_argument("-s", "--size", help="how many workflows to retrieve (capped at 2000)", type=int, default=10)
    parser.add_argument("-a", "--all", help="dump all the workflows", action='store_true')
    parser.add_argument("-d", "--debug", help="set logging level to debug", action='store_true')
    args = parser.parse_args()

    # Reject contradictory or out-of-range combinations before doing any I/O.
    if args.all and (args.page > 0 or args.size != 10):
        raise ValueError('You must not specify page or size with all.')
    if args.page < 0:
        raise ValueError('Page must be 0 or greater.')
    if args.size < 1:
        raise ValueError('Size must be at least 1.')
    if args.size > MAX_PAGE_SIZE:
        raise ValueError(f'Size must not be greater than {MAX_PAGE_SIZE}')

    out_path = Path(args.output)
    if not (out_path.exists() and out_path.is_dir()):
        raise ValueError(f'Invalid output directory (not a directory, or does not exist): {args.output}')

    if args.debug:
        logger.setLevel(logging.DEBUG)
    logger.info(f'Viewer URL: {args.viewer}')
    logger.info(f'Output: {args.output}')

    if args.all:
        logger.info(f'Dumping all the workflows from {args.viewer} to {out_path}')
        _dump_all_workflows(viewer=args.viewer, output=out_path)
    else:
        logger.info(f'Dumping workflows from {args.viewer}, page {args.page}, size {args.size} to {out_path}')
        _dump_workflows(viewer=args.viewer, output=out_path, page=args.page, size=args.size)
119+
120+
121+
if __name__ == '__main__':  # script entry point
    main()

dump.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

0 commit comments

Comments
 (0)