Description
Thank you for chembl_webresource_client; it is very convenient and spares me from having to learn SQL and the REST API directly. My goal is to gather all IC50 values from chemical cytotoxicity experiments on human cell lines. After mining the ChEMBL IDs of 1,483 human cell lines, I want to collect the ChEMBL IDs of the corresponding 241,385 assays. Accelerating this step by processing the cell lines in parallel batches resulted in Error 1 (process_assay.py below). It seems that the SQLite cache behind the assay object does not tolerate simultaneous access from several processes. I therefore gave up batch processing and tried mining with a single process. Although I then succeeded in collecting the assay IDs, Error 2 occurred while gathering the IC50 values (process_ic50.py below), even though I could not observe any shortage of RAM or storage.
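For reference, Cells.csv holds the cell_chembl_id values of the 1,483 human cell lines. It was built with a query roughly along the following lines; this is a simplified sketch, and the cell_source_tax_id filter and the .only() projection on the cell_line endpoint are my approximation rather than the exact query.

import pandas as pd
from chembl_webresource_client.new_client import new_client

# Simplified sketch of how the human cell-line list was assembled.
# cell_source_tax_id=9606 (Homo sapiens) is an assumption about the
# filterable fields of the cell_line endpoint.
cell_line = new_client.cell_line
human_cells = cell_line.filter(cell_source_tax_id=9606).only(["cell_chembl_id", "cell_name"])
pd.DataFrame(human_cells).to_csv("Cells.csv", index=False)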
process_assay.py
import sys
import argparse

import pandas as pd
from chembl_webresource_client.new_client import new_client

batch_size = 10
assay_total = new_client.assay

parser = argparse.ArgumentParser()
parser.add_argument("-nth", type=int, default=0)
args = parser.parse_args()

# Cell-line ChEMBL IDs mined beforehand
Cells = pd.read_csv("Cells.csv")
cells = Cells.cell_chembl_id.values.tolist()
num_cells = len(Cells)

# Slice boundaries for the nth batch of cell lines
idx1 = batch_size * args.nth
idx2 = batch_size * (args.nth + 1)
print("### Batch {} [{}, {})".format(args.nth, idx1, idx2))
if idx1 >= num_cells:
    print("# Batch start is {}, beyond the {} cell lines...".format(idx1, num_cells))
    sys.exit()
idx2 = num_cells if idx2 > num_cells else idx2  # clamp the batch end

# Query all assays for this batch of cell lines and write them to CSV
assay_temp = assay_total.filter(cell_chembl_id__in=cells[idx1:idx2])
Assay_Temp = pd.DataFrame(assay_temp)
Assay_Temp.to_csv("Assay_{}.csv".format(args.nth), index=False)
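The batches are launched as independent concurrent processes; the sketch below only illustrates that concurrency (the worker count and the way the workers are spawned are illustrative, not my exact launcher).

import subprocess
from concurrent.futures import ThreadPoolExecutor

def run_batch(nth):
    # Each worker starts its own process_assay.py for one batch of 10 cell lines
    subprocess.run(["python", "process_assay.py", "-nth", str(nth)], check=True)

# 1,483 cell lines / batch_size 10 -> 149 batches; 8 workers is an arbitrary choice
with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(run_batch, range(149)))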
Error 1
Traceback (most recent call last):
File "process_assay.py", line 31, in <module>
Assay_Temp = pd.DataFrame(assay_temp)
File "[Python Directory]/python3.8/site-packages/pandas/core/frame.py", line 563, in __init__
data = list(data)
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/query_set.py", line 97, in __len__
return len(self.query)
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/url_query.py", line 144, in __len__
self.get_page()
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/url_query.py", line 389, in get_page
res = session.post(self.base_url + '.' + self.frmt, json=data, timeout=self.timeout)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests/sessions.py", line 637, in post
return self.request("POST", url, data=data, json=json, **kwargs)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/session.py", line 101, in request
return super().request(
File "[Anacoda Directory]/lib/python3.8/site-packages/requests/sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/session.py", line 132, in send
response = self._send_and_cache(request, actions, **kwargs)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/session.py", line 160, in _send_and_cache
self.cache.save_response(response, actions.cache_key, actions.expires)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/backends/base.py", line 59, in save_response
self.responses[cache_key] = cached_response
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/backends/sqlite.py", line 211, in __setitem__
super().__setitem__(key, serialized_value)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/backends/sqlite.py", line 164, in __setitem__
con.execute(
File "[Anacoda Directory]/lib/python3.8/contextlib.py", line 120, in __exit__
next(self.gen)
File "[Anacoda Directory]/lib/python3.8/site-packages/requests_cache/backends/sqlite.py", line 117, in connection
self._local_context.con.commit()
sqlite3.OperationalError: database is locked
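The "database is locked" error comes from the requests_cache SQLite backend that all batch processes share. A workaround I am considering is to turn the response cache off, or to point each process at its own cache file, before any queries are issued; this is only a sketch, assuming the CACHING and CACHE_NAME attributes on chembl_webresource_client's Settings are the right knobs.

from chembl_webresource_client.settings import Settings

# Option A: disable the response cache so no shared SQLite file is touched.
Settings.Instance().CACHING = False

# Option B (CACHE_NAME is my assumption about the Settings attribute): give
# each batch process its own cache file so batches never share a database.
# import os
# Settings.Instance().CACHE_NAME = "chembl_cache_{}".format(os.getpid())

# Settings are adjusted before the client is used to issue any queries.
from chembl_webresource_client.new_client import new_client
assay_total = new_client.assay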
process_ic50.py
import pandas as pd
from chembl_webresource_client.new_client import new_client

# DATA_UPLOAD_MAX_MEMORY_SIZE = 100000000
# (DATA_UPLOAD_MAX_MEMORY_SIZE is a Django setting enforced on the ChEMBL
# server, so raising it here on the client side has no effect.)
activity_total = new_client.activity

# Assay ChEMBL IDs collected in the previous step
Assay = pd.read_csv("Assay.csv", low_memory=False)
assay_total = Assay.assay_chembl_id.values.tolist()

# Request all IC50 activities for every assay in a single filter call
activities = activity_total.filter(assay_chembl_id__in=assay_total, type="IC50")
IC50 = pd.DataFrame(activities)
IC50.to_csv("IC50.csv", index=False)
Error 2
Traceback (most recent call last):
File "process_ic50.py", line 30, in <module>
IC50 = pd.DataFrame(activities)
File "[Python Directory]/python3.8/site-packages/pandas/core/frame.py", line 563, in __init__
data = list(data)
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/query_set.py", line 97, in __len__
return len(self.query)
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/url_query.py", line 144, in __len__
self.get_page()
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/url_query.py", line 394, in get_page
handle_http_error(res)
File "[Anacoda Directory]/lib/python3.8/site-packages/chembl_webresource_client/http_errors.py", line 113, in handle_http_error
raise exception_class(request.url, request.text)
chembl_webresource_client.http_errors.HttpApplicationError: Error for url https://www.ebi.ac.uk/chembl/api/data/activity.json, server response: {"error_message": "Request body exceeded settings.DATA_UPLOAD_MAX_MEMORY_SIZE.", "traceback": "Traceback (most recent call last):\n\n File \"/chembl_ws_py3/src/chembl_webservices/core/resource.py\", line 260, in wrapper\n elif request.body:\n\n File \"/usr/local/lib/python3.9/site-packages/django/http/request.py\", line 286, in body\n raise RequestDataTooBig('Request body exceeded settings.DATA_UPLOAD_MAX_MEMORY_SIZE.')\n\ndjango.core.exceptions.RequestDataTooBig: Request body exceeded settings.DATA_UPLOAD_MAX_MEMORY_SIZE.\n"}
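Since the error shows the request body exceeding the server's DATA_UPLOAD_MAX_MEMORY_SIZE, my current plan is to split the 241,385 assay IDs into smaller chunks and concatenate the per-chunk results. A minimal sketch, assuming the same filter accepts shorter ID lists (the chunk size of 1,000 is a guess):

import pandas as pd
from chembl_webresource_client.new_client import new_client

activity_total = new_client.activity
Assay = pd.read_csv("Assay.csv", low_memory=False)
assay_ids = Assay.assay_chembl_id.values.tolist()

chunk_size = 1000  # guessed value; small enough to keep the POST body under the server limit
frames = []
for i in range(0, len(assay_ids), chunk_size):
    chunk = assay_ids[i:i + chunk_size]
    # Same IC50 filter as before, but limited to one chunk of assay IDs
    activities = activity_total.filter(assay_chembl_id__in=chunk, type="IC50")
    frames.append(pd.DataFrame(activities))

IC50 = pd.concat(frames, ignore_index=True)
IC50.to_csv("IC50.csv", index=False)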