
Commit ab12fab

Clean according to flake8
1 parent 47a4039 commit ab12fab

File tree

1 file changed: +48 -42 lines changed


arxivcollector.py

Lines changed: 48 additions & 42 deletions
@@ -1,37 +1,39 @@
 # Inspired by: Fatima, R., Yasin, A., Liu, L., Wang, J., & Afzal, W. (2023). Retrieving arXiv, SocArXiv, and SSRN metadata for initial review screening. Information and Software Technology, 161, 107251. https://doi.org/10.1016/j.infsof.2023.107251

-import httpx
-from bs4 import BeautifulSoup
-from bibtexparser.bwriter import BibTexWriter
-from bibtexparser.bibdatabase import BibDatabase
-import pandas as pd
+import argparse
 import datetime
-import urllib.parse
+import logging
 import sys
-import argparse
-import logging
+import urllib.parse
+
+import httpx
+import pandas as pd
+from bibtexparser.bibdatabase import BibDatabase
+from bibtexparser.bwriter import BibTexWriter
+from bs4 import BeautifulSoup

 MAX_RETRIES = 3

+
 class ArXivCollector():
-    def __init__(self,
+    def __init__(self,
                  user_agent="Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
                  num_abstracts=50,
                  arxiv_doi_prefix="https://doi.org/10.48550",
                  default_item_type="ARTICLE",
-                 verbose=False,
+                 verbose=False,
                  mode="bibtex") -> None:
         self.user_agent = user_agent
         self.num_abstracts = num_abstracts
         self.arxiv_doi_prefix = arxiv_doi_prefix
         self.default_item_type = default_item_type
         self.verbose = verbose
-        self.client = httpx.Client(headers={"User-Agent": self.user_agent,})
+        self.client = httpx.Client(headers={"User-Agent": self.user_agent})
         self.title = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
         self.mode = mode

         logging.basicConfig(level=logging.INFO,
-                            force = True, handlers=[logging.StreamHandler(sys.stdout)])
+                            force=True, handlers=[logging.StreamHandler(sys.stdout)])

         # Error handling for the mode parameter
         if self.mode not in ["bibtex", "csv"]:
@@ -55,11 +57,11 @@ def send_request(self, url, method="GET"):
             else:
                 logging.error(f"Failed to send request after {MAX_RETRIES} attempts.")
                 return None
-
-    def extract_text(self,soup:BeautifulSoup,selector):
+
+    def extract_text(self, soup: BeautifulSoup, selector):
         try:
             text = soup.select_one(selector).getText(strip=True)
-        except AttributeError as err:
+        except AttributeError:
             text = None
         return text

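The try/except in extract_text guards against selectors that match nothing: BeautifulSoup's select_one returns None when no element matches the CSS selector, so the chained getText call raises AttributeError. A minimal sketch of the same pattern outside the class (the HTML snippet is invented for illustration):

from bs4 import BeautifulSoup

html = "<li class='arxiv-result'><p class='title'>An example title</p></li>"  # made-up snippet
li = BeautifulSoup(html, "html.parser")

try:
    comment = li.select_one("p.comments > span:nth-of-type(2)").getText(strip=True)
except AttributeError:
    # select_one() found nothing and returned None, so .getText() raised AttributeError
    comment = None

print(comment)  # prints None: the snippet has no p.comments element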
@@ -74,17 +76,18 @@ def find_data(self, soup: BeautifulSoup, keyword) -> str:
                 sub = datetime.datetime.strptime(sub, "%d %B, %Y")
                 break
         return sub, ann
-
-    def parse_html(self,response:httpx.Response):
-        soup = BeautifulSoup(response.content,'html.parser')
+
+    def parse_html(self, response: httpx.Response):
+        soup = BeautifulSoup(response.content, 'html.parser')

         lis = soup.select('li.arxiv-result')
-        if len(lis) == 0: return []
-        for i,li in enumerate(lis,start=1):
-            title =self.extract_text(li,'p.title')
+        if len(lis) == 0:
+            return []
+        for i, li in enumerate(lis, start=1):
+            title = self.extract_text(li, 'p.title')
             if self.verbose:
-                print(i,title)
-
+                print(i, title)
+
             temp_authors = li.select('p.authors>a')
             authors = ' AND '.join([', '.join(j.getText(strip=True).split()[::-1]) for j in temp_authors])

@@ -94,10 +97,10 @@ def parse_html(self,response:httpx.Response):
             else:
                 Abstract = ''

-            extracted_text = self.extract_text(li,'p.comments > span:nth-of-type(2)')
+            extracted_text = self.extract_text(li, 'p.comments > span:nth-of-type(2)')
             note = extracted_text if extracted_text else ""

-            sub,ann = self.find_data(li,'Submitted')
+            sub, ann = self.find_data(li, 'Submitted')

             # Construct ID from first author's last name and year of submission
             id = authors.split(',')[0] + str(sub.year)
@@ -107,18 +110,18 @@ def parse_html(self,response:httpx.Response):
                 pdf = li.select_one('p.list-title > span > a[href*="pdf"]')['href']
             except TypeError:
                 pdf = ""
-
+
             month_abbr = ["", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

-            yield { # BibTeX-friendly format
-                "title":title,
-                "author":authors,
-                "abstract":Abstract,
-                "note":note,
-                "year":str(sub.year),
+            yield {  # BibTeX-friendly format
+                "title": title,
+                "author": authors,
+                "abstract": Abstract,
+                "note": note,
+                "year": str(sub.year),
                 "month": month_abbr[sub.month],
-                "doi": f"{self.arxiv_doi_prefix}/arXiv.{link.split('/')[-1]}", # Construct the DOI from the arXiv ID
-                "howpublished" : fr"\url{{{pdf}}}",
+                "doi": f"{self.arxiv_doi_prefix}/arXiv.{link.split('/')[-1]}",  # Construct the DOI from the arXiv ID
+                "howpublished": fr"\url{{{pdf}}}",
                 "ENTRYTYPE": self.default_item_type,
                 "ID": id
             }
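For orientation, each yielded dictionary is one BibTeX entry: the ID field is the first author's last name concatenated with the submission year, and the DOI is the arXiv DOI prefix joined with the identifier taken from the end of the abstract link. A hypothetical entry (paper, authors, and arXiv ID are all invented) would look like:

# Hypothetical output of parse_html for a single search result; every value is made up.
entry = {
    "title": "An Example Paper on Metadata Harvesting",
    "author": "Doe, Jane AND Smith, John",
    "abstract": "We describe an example approach to ...",
    "note": "12 pages, 3 figures",
    "year": "2023",
    "month": "mar",
    "doi": "https://doi.org/10.48550/arXiv.2303.01234",  # arxiv_doi_prefix + "/arXiv." + arXiv ID
    "howpublished": r"\url{https://arxiv.org/pdf/2303.01234}",
    "ENTRYTYPE": "ARTICLE",
    "ID": "Doe2023",
}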
@@ -130,10 +133,10 @@ def run(self, url):
             # Parse the URL and its parameters
             parsed_url = urllib.parse.urlparse(url)
             params = urllib.parse.parse_qs(parsed_url.query)
-
+
             # Update the 'start' parameter
-            params['start'] = [page*self.num_abstracts]
-
+            params['start'] = [page * self.num_abstracts]
+
             # Construct the new URL
             new_query = urllib.parse.urlencode(params, doseq=True)
             if 'advanced' not in params:
@@ -143,25 +146,27 @@ def run(self, url):
             results = list(self.parse_html(res))
             self.mainLIST.extend(results)
             logging.info(f"Scraped abstracts {page*self.num_abstracts} - {len(self.mainLIST)}")
-
+
             if self.mode == 'bibtex':
                 # Create a BibDatabase
                 db = BibDatabase()
                 db.entries = self.mainLIST
-
+
                 # Write the BibDatabase to a BibTeX file
                 writer = BibTexWriter()
                 with open(f'{self.title}.bib', 'w') as bibfile:
                     bibfile.write(writer.write(db))
             elif self.mode == 'csv':
                 # Convert the list of dictionaries to a DataFrame
                 df = pd.DataFrame(self.mainLIST)
-
+
                 # Write the DataFrame to a CSV file
                 df.to_csv(f'{self.title}.csv', index=False)

             page += 1
-            if len(results) < self.num_abstracts: break
+            if len(results) < self.num_abstracts:
+                break
+

 def main():
     parser = argparse.ArgumentParser(description='Retrieve arXiv metadata.')
@@ -175,5 +180,6 @@ def main():
     arxiv.set_mode(args.mode)
     arxiv.run(args.url)

+
 if __name__ == '__main__':
-    main()
+    main()
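The argparse-based main() above drives the script from the command line; the same collector can also be used directly as a library. A minimal usage sketch, assuming arxivcollector.py is importable from the working directory (the search URL is a placeholder for a real arXiv search query):

from arxivcollector import ArXivCollector

collector = ArXivCollector(verbose=True)  # mode defaults to "bibtex"
collector.set_mode("csv")                 # write <timestamp>.csv instead of <timestamp>.bib
collector.run("https://arxiv.org/search/?searchtype=all&query=metadata+screening")

run() keeps bumping the query's start parameter by num_abstracts per page and stops once a page returns fewer than num_abstracts results, writing the accumulated entries after each page.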
