@@ -1,37 +1,39 @@
# Inspired by: Fatima, R., Yasin, A., Liu, L., Wang, J., & Afzal, W. (2023). Retrieving arXiv, SocArXiv, and SSRN metadata for initial review screening. Information and Software Technology, 161, 107251. https://doi.org/10.1016/j.infsof.2023.107251

- import httpx
- from bs4 import BeautifulSoup
- from bibtexparser.bwriter import BibTexWriter
- from bibtexparser.bibdatabase import BibDatabase
- import pandas as pd
+ import argparse
import datetime
- import urllib.parse
+ import logging
import sys
- import argparse
- import logging
+ import urllib.parse
+
+ import httpx
+ import pandas as pd
+ from bibtexparser.bibdatabase import BibDatabase
+ from bibtexparser.bwriter import BibTexWriter
+ from bs4 import BeautifulSoup

MAX_RETRIES = 3

+
class ArXivCollector():
-     def __init__(self,
+     def __init__(self,
            user_agent="Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
            num_abstracts=50,
            arxiv_doi_prefix="https://doi.org/10.48550",
            default_item_type="ARTICLE",
-             verbose=False,
+             verbose=False,
            mode="bibtex") -> None:
        self.user_agent = user_agent
        self.num_abstracts = num_abstracts
        self.arxiv_doi_prefix = arxiv_doi_prefix
        self.default_item_type = default_item_type
        self.verbose = verbose
-         self.client = httpx.Client(headers={"User-Agent": self.user_agent, })
+         self.client = httpx.Client(headers={"User-Agent": self.user_agent})
        self.title = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
        self.mode = mode

        logging.basicConfig(level=logging.INFO,
-                             force=True, handlers=[logging.StreamHandler(sys.stdout)])
+                             force=True, handlers=[logging.StreamHandler(sys.stdout)])

        # Error handling for the mode parameter
        if self.mode not in ["bibtex", "csv"]:
@@ -55,11 +57,11 @@ def send_request(self, url, method="GET"):
        else:
            logging.error(f"Failed to send request after {MAX_RETRIES} attempts.")
            return None
-
-     def extract_text(self,soup:BeautifulSoup,selector):
+
+     def extract_text(self, soup: BeautifulSoup, selector):
        try:
            text = soup.select_one(selector).getText(strip=True)
-         except AttributeError as err:
+         except AttributeError:
            text = None
        return text

@@ -74,17 +76,18 @@ def find_data(self, soup: BeautifulSoup, keyword) -> str:
            sub = datetime.datetime.strptime(sub, "%d %B, %Y")
            break
        return sub, ann
-
-     def parse_html(self,response:httpx.Response):
-         soup = BeautifulSoup(response.content,'html.parser')
+
+     def parse_html(self, response: httpx.Response):
+         soup = BeautifulSoup(response.content, 'html.parser')

        lis = soup.select('li.arxiv-result')
-         if len(lis) == 0: return []
-         for i,li in enumerate(lis,start=1):
-             title = self.extract_text(li,'p.title')
+         if len(lis) == 0:
+             return []
+         for i, li in enumerate(lis, start=1):
+             title = self.extract_text(li, 'p.title')
            if self.verbose:
-                 print(i,title)
-
+                 print(i, title)
+
            temp_authors = li.select('p.authors>a')
            authors = ' AND '.join([', '.join(j.getText(strip=True).split()[::-1]) for j in temp_authors])

@@ -94,10 +97,10 @@ def parse_html(self,response:httpx.Response):
            else:
                Abstract = ''

-             extracted_text = self.extract_text(li,'p.comments > span:nth-of-type(2)')
+             extracted_text = self.extract_text(li, 'p.comments > span:nth-of-type(2)')
            note = extracted_text if extracted_text else ""

-             sub,ann = self.find_data(li,'Submitted')
+             sub, ann = self.find_data(li, 'Submitted')

            # Construct ID from first author's last name and year of submission
            id = authors.split(',')[0] + str(sub.year)
@@ -107,18 +110,18 @@ def parse_html(self,response:httpx.Response):
                pdf = li.select_one('p.list-title > span > a[href*="pdf"]')['href']
            except TypeError:
                pdf = ""
-
+
            month_abbr = ["", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

-             yield {  # BibTeX-friendly format
-                 "title":title,
-                 "author":authors,
-                 "abstract":Abstract,
-                 "note":note,
-                 "year":str(sub.year),
+             yield {  # BibTeX-friendly format
+                 "title": title,
+                 "author": authors,
+                 "abstract": Abstract,
+                 "note": note,
+                 "year": str(sub.year),
                "month": month_abbr[sub.month],
-                 "doi": f"{self.arxiv_doi_prefix}/arXiv.{link.split('/')[-1]}",  # Construct the DOI from the arXiv ID
-                 "howpublished": fr"\url{{{pdf}}}",
+                 "doi": f"{self.arxiv_doi_prefix}/arXiv.{link.split('/')[-1]}",  # Construct the DOI from the arXiv ID
+                 "howpublished": fr"\url{{{pdf}}}",
                "ENTRYTYPE": self.default_item_type,
                "ID": id
            }
@@ -130,10 +133,10 @@ def run(self, url):
            # Parse the URL and its parameters
            parsed_url = urllib.parse.urlparse(url)
            params = urllib.parse.parse_qs(parsed_url.query)
-
+
            # Update the 'start' parameter
-             params['start'] = [page * self.num_abstracts]
-
+             params['start'] = [page * self.num_abstracts]
+
            # Construct the new URL
            new_query = urllib.parse.urlencode(params, doseq=True)
            if 'advanced' not in params:
@@ -143,25 +146,27 @@ def run(self, url):
            results = list(self.parse_html(res))
            self.mainLIST.extend(results)
            logging.info(f"Scraped abstracts {page * self.num_abstracts} - {len(self.mainLIST)}")
-
+
            if self.mode == 'bibtex':
                # Create a BibDatabase
                db = BibDatabase()
                db.entries = self.mainLIST
-
+
                # Write the BibDatabase to a BibTeX file
                writer = BibTexWriter()
                with open(f'{self.title}.bib', 'w') as bibfile:
                    bibfile.write(writer.write(db))
            elif self.mode == 'csv':
                # Convert the list of dictionaries to a DataFrame
                df = pd.DataFrame(self.mainLIST)
-
+
                # Write the DataFrame to a CSV file
                df.to_csv(f'{self.title}.csv', index=False)

            page += 1
-             if len(results) < self.num_abstracts: break
+             if len(results) < self.num_abstracts:
+                 break
+

def main():
    parser = argparse.ArgumentParser(description='Retrieve arXiv metadata.')
@@ -175,5 +180,6 @@ def main():
    arxiv.set_mode(args.mode)
    arxiv.run(args.url)

+
if __name__ == '__main__':
-     main()
+     main()
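
For orientation, a minimal usage sketch of the collector this file defines, not part of the commit itself. It assumes the script is importable as a module named arxiv_collector (a hypothetical name) and uses a placeholder arXiv search URL; only the constructor arguments and the run() call are taken from the diff above.

# Hedged usage sketch: assumed module name "arxiv_collector", placeholder search URL.
from arxiv_collector import ArXivCollector

# Collect metadata page by page and write it to a timestamped CSV file,
# roughly what main() does via argparse.
collector = ArXivCollector(num_abstracts=50, verbose=True, mode="csv")
collector.run("https://arxiv.org/search/?searchtype=all&query=requirements+engineering&start=0")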