From fc23cfa58b80b733d2d05c47c126684caaf076ce Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:51:48 -0600 Subject: [PATCH 1/8] Create download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 61 ++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 pubmlst_client/download_pubmlst.py diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py new file mode 100644 index 0000000..f30ad7e --- /dev/null +++ b/pubmlst_client/download_pubmlst.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python + +import argparse +import json +import os +import re +import urllib.request +import sys +import time +import datetime + +from pubmlst_client.util import get + + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument("--outdir", "-o", dest="outdir", default='./mlstdb', help="output directory") + parser.add_argument("--base-url", "-b", dest="base_url", default='http://rest.pubmlst.org/db', help="Base URL for the API. 
Suggested values are: http://rest.pubmlst.org/db (default), https://bigsdb.pasteur.fr/api/db") + args = parser.parse_args() + + api_url_base = args.base_url + + url_base_response = json.loads(get(api_url_base)) + + if not os.path.exists(args.outdir): + os.mkdir(args.outdir) + + for db in url_base_response: + databases = db['databases'] + for database in databases: + if '_seqdef' in database['name']: + db_download_path = '%s/%s' % (args.outdir,database['name'].split('_')[1]) + os.mkdir(db_download_path) + plaintext_header = {'Content-Type': 'text/plain'} + types_tsv = get(''.join([database['href'],'/schemes/1/profiles_csv']), headers=plaintext_header).decode('utf-8') + output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '.txt') + with open(output_filename, 'w') as f: + f.write(types_tsv) + log_msg = { + 'timestamp': str(datetime.datetime.now().isoformat()), + 'event': 'file_downloaded', + 'filename': output_filename, + } + print(json.dumps(log_msg), file=sys.stderr) + db_res = json.loads(get(''.join([database['href'],'/schemes/1']))) + for locus_url in db_res['loci']: + locus = json.loads(get(locus_url)) + alleles_fasta = get(locus['alleles_fasta'], headers=plaintext_header).decode('utf-8') + output_filename = os.path.join(db_download_path, locus['id'] + '.fasta') + with open(output_filename, 'w') as f: + f.write(alleles_fasta) + log_msg = { + 'timestamp': str(datetime.datetime.now().isoformat()), + 'event': 'file_downloaded', + 'filename': output_filename, + } + print(json.dumps(log_msg), file=sys.stderr) + +if __name__ == '__main__': + main() From 7093f2f2a0d67d91c4a511eb1d13f81c0893b95a Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:38:22 -0600 Subject: [PATCH 2/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pubmlst_client/download_pubmlst.py 
b/pubmlst_client/download_pubmlst.py index f30ad7e..6d58c5d 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -32,8 +32,13 @@ def main(): if '_seqdef' in database['name']: db_download_path = '%s/%s' % (args.outdir,database['name'].split('_')[1]) os.mkdir(db_download_path) + # Find MLST Scheme + schemes = json.loads(get(''.join([database['href'],'/schemes']))) + for scheme in schemes['schemes']: + if scheme['description'] == 'MLST': + mlst_scheme = scheme['scheme'].split('/')[-1] plaintext_header = {'Content-Type': 'text/plain'} - types_tsv = get(''.join([database['href'],'/schemes/1/profiles_csv']), headers=plaintext_header).decode('utf-8') + types_tsv = get(''.join([database['href'],'/schemes/%s/profiles_csv' % mlst_scheme]), headers=plaintext_header).decode('utf-8') output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '.txt') with open(output_filename, 'w') as f: f.write(types_tsv) @@ -43,7 +48,7 @@ def main(): 'filename': output_filename, } print(json.dumps(log_msg), file=sys.stderr) - db_res = json.loads(get(''.join([database['href'],'/schemes/1']))) + db_res = json.loads(get(''.join([database['href'],'/schemes/%s' % mlst_scheme]))) for locus_url in db_res['loci']: locus = json.loads(get(locus_url)) alleles_fasta = get(locus['alleles_fasta'], headers=plaintext_header).decode('utf-8') From 0cca619d092b67adaaa70cb62bf70db1660e7005 Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Fri, 11 Oct 2024 13:33:58 -0600 Subject: [PATCH 3/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 63 ++++++++++++++++++------------ 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py index 6d58c5d..f10430f 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -30,37 +30,48 @@ def main(): databases = db['databases'] for database in 
databases: if '_seqdef' in database['name']: - db_download_path = '%s/%s' % (args.outdir,database['name'].split('_')[1]) - os.mkdir(db_download_path) - # Find MLST Scheme + db_download_path_1 = '%s/%s' % (args.outdir,database['name'].split('_')[1]) + # Find MLST Schemes schemes = json.loads(get(''.join([database['href'],'/schemes']))) + mlst_schemes = [] for scheme in schemes['schemes']: - if scheme['description'] == 'MLST': - mlst_scheme = scheme['scheme'].split('/')[-1] - plaintext_header = {'Content-Type': 'text/plain'} - types_tsv = get(''.join([database['href'],'/schemes/%s/profiles_csv' % mlst_scheme]), headers=plaintext_header).decode('utf-8') - output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '.txt') - with open(output_filename, 'w') as f: - f.write(types_tsv) - log_msg = { - 'timestamp': str(datetime.datetime.now().isoformat()), - 'event': 'file_downloaded', - 'filename': output_filename, - } - print(json.dumps(log_msg), file=sys.stderr) - db_res = json.loads(get(''.join([database['href'],'/schemes/%s' % mlst_scheme]))) - for locus_url in db_res['loci']: - locus = json.loads(get(locus_url)) - alleles_fasta = get(locus['alleles_fasta'], headers=plaintext_header).decode('utf-8') - output_filename = os.path.join(db_download_path, locus['id'] + '.fasta') + if 'MLST' in scheme['description']: + mlst_schemes.append(scheme['scheme'].split('/')[-1]) + mlst_schemes.sort() + for i in range(len(mlst_schemes)): + if i > 0: + db_download_path = db_download_path_1 + "_%s" % (i+1) + os.mkdir(db_download_path) + else: + db_download_path = db_download_path_1 + os.mkdir(db_download_path) + plaintext_header = {'Content-Type': 'text/plain'} + types_tsv = get(''.join([database['href'],'/schemes/%s/profiles_csv' % mlst_schemes[i]]), headers=plaintext_header).decode('utf-8') + if i > 0: + output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '_%s'% i +'.txt') + else: + output_filename = os.path.join( 
db_download_path , database['name'].split('_')[1] + '.txt') with open(output_filename, 'w') as f: - f.write(alleles_fasta) + f.write(types_tsv) log_msg = { - 'timestamp': str(datetime.datetime.now().isoformat()), - 'event': 'file_downloaded', - 'filename': output_filename, - } + 'timestamp': str(datetime.datetime.now().isoformat()), + 'event': 'file_downloaded', + 'filename': output_filename, + } print(json.dumps(log_msg), file=sys.stderr) + db_res = json.loads(get(''.join([database['href'],'/schemes/%s' % mlst_schemes[i]]))) + for locus_url in db_res['loci']: + locus = json.loads(get(locus_url)) + alleles_fasta = get(locus['alleles_fasta'], headers=plaintext_header).decode('utf-8') + output_filename = os.path.join(db_download_path, locus['id'] + '.fasta') + with open(output_filename, 'w') as f: + f.write(alleles_fasta) + log_msg = { + 'timestamp': str(datetime.datetime.now().isoformat()), + 'event': 'file_downloaded', + 'filename': output_filename, + } + print(json.dumps(log_msg), file=sys.stderr) if __name__ == '__main__': main() From c3afd71b3c94113e6e3b35b62e66866301146354 Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Fri, 11 Oct 2024 13:49:21 -0600 Subject: [PATCH 4/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py index f10430f..3052de3 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -48,7 +48,7 @@ def main(): plaintext_header = {'Content-Type': 'text/plain'} types_tsv = get(''.join([database['href'],'/schemes/%s/profiles_csv' % mlst_schemes[i]]), headers=plaintext_header).decode('utf-8') if i > 0: - output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '_%s'% i +'.txt') + output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '_%s'% (i+1) +'.txt') else: 
output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '.txt') with open(output_filename, 'w') as f: From 546e3fd68411119894ebf5467435e3871ce3cae6 Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Sat, 12 Oct 2024 08:56:03 -0600 Subject: [PATCH 5/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py index 3052de3..c79d489 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -30,7 +30,7 @@ def main(): databases = db['databases'] for database in databases: if '_seqdef' in database['name']: - db_download_path_1 = '%s/%s' % (args.outdir,database['name'].split('_')[1]) + db_download_path_1 = '%s/%s' % (args.outdir,database['name'][8:-7]) # Find MLST Schemes schemes = json.loads(get(''.join([database['href'],'/schemes']))) mlst_schemes = [] @@ -48,9 +48,9 @@ def main(): plaintext_header = {'Content-Type': 'text/plain'} types_tsv = get(''.join([database['href'],'/schemes/%s/profiles_csv' % mlst_schemes[i]]), headers=plaintext_header).decode('utf-8') if i > 0: - output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '_%s'% (i+1) +'.txt') + output_filename = os.path.join( db_download_path , database['name'][8:-7] + '_%s'% (i+1) +'.txt') else: - output_filename = os.path.join( db_download_path , database['name'].split('_')[1] + '.txt') + output_filename = os.path.join( db_download_path , database['name'][8:-7] + '.txt') with open(output_filename, 'w') as f: f.write(types_tsv) log_msg = { From c0ea7b895b688b0844f18611fe1428b2ee2eecd1 Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Sat, 12 Oct 2024 10:43:41 -0600 Subject: [PATCH 6/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py index c79d489..516e5d5 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -35,7 +35,7 @@ def main(): schemes = json.loads(get(''.join([database['href'],'/schemes']))) mlst_schemes = [] for scheme in schemes['schemes']: - if 'MLST' in scheme['description']: + if 'MLST' in scheme['description'] and 'cgMLST' not in scheme['description'] and 'eMLST' not in scheme['description'] : mlst_schemes.append(scheme['scheme'].split('/')[-1]) mlst_schemes.sort() for i in range(len(mlst_schemes)): From 223d8c5a39528a7f18d23848641d235d44d549d1 Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Sat, 12 Oct 2024 11:15:01 -0600 Subject: [PATCH 7/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py index 516e5d5..3fede9c 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -39,16 +39,16 @@ def main(): mlst_schemes.append(scheme['scheme'].split('/')[-1]) mlst_schemes.sort() for i in range(len(mlst_schemes)): - if i > 0: - db_download_path = db_download_path_1 + "_%s" % (i+1) + if int(mlst_schemes[i])> 1: + db_download_path = db_download_path_1 + "_%s" % (mlst_schemes[i]) os.mkdir(db_download_path) else: db_download_path = db_download_path_1 os.mkdir(db_download_path) plaintext_header = {'Content-Type': 'text/plain'} types_tsv = get(''.join([database['href'],'/schemes/%s/profiles_csv' % mlst_schemes[i]]), headers=plaintext_header).decode('utf-8') - if i > 0: - output_filename = os.path.join( db_download_path , database['name'][8:-7] + '_%s'% (i+1) +'.txt') + if int(mlst_schemes[i])> 1: + output_filename = os.path.join( db_download_path , database['name'][8:-7] + '_%s'% (mlst_schemes[i]) +'.txt') else: output_filename 
= os.path.join( db_download_path , database['name'][8:-7] + '.txt') with open(output_filename, 'w') as f: From adecc5648c742d955edb339ef255a6d2cb8ad866 Mon Sep 17 00:00:00 2001 From: John Arnn <72882088+jwarnn@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:30:58 -0600 Subject: [PATCH 8/8] Update download_pubmlst.py --- pubmlst_client/download_pubmlst.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pubmlst_client/download_pubmlst.py b/pubmlst_client/download_pubmlst.py index 3fede9c..1742ba8 100644 --- a/pubmlst_client/download_pubmlst.py +++ b/pubmlst_client/download_pubmlst.py @@ -29,16 +29,23 @@ def main(): for db in url_base_response: databases = db['databases'] for database in databases: + # rmlst is a restricted database + if 'rmlst' in database['name'] or 'test' in database['name']: + continue if '_seqdef' in database['name']: db_download_path_1 = '%s/%s' % (args.outdir,database['name'][8:-7]) # Find MLST Schemes schemes = json.loads(get(''.join([database['href'],'/schemes']))) mlst_schemes = [] for scheme in schemes['schemes']: - if 'MLST' in scheme['description'] and 'cgMLST' not in scheme['description'] and 'eMLST' not in scheme['description'] : + # The description element has some inconsistencies; this condition is to navigate those. + if 'MLST' == scheme['description'] or "MLST" in scheme['description'].split(' ') and not "Extended MLST" == scheme['description']: + if 'MLST (Pla-Díaz)' == scheme['description'] and database['name'][8:-7] == 'tpallidum': + continue mlst_schemes.append(scheme['scheme'].split('/')[-1]) mlst_schemes.sort() for i in range(len(mlst_schemes)): + # Folders in MLST script folders are named after the organism if the scheme number is 1. If the scheme number is larger than 1, then the name is the organism with the scheme number appended on. if int(mlst_schemes[i])> 1: db_download_path = db_download_path_1 + "_%s" % (mlst_schemes[i]) os.mkdir(db_download_path)