Commit 2383b60

Merge pull request #358 from scholarly-python-package/develop

Releasing 1.4.4

2 parents 2e57d56 + a60090c
File tree

6 files changed: +146 −52 lines changed
scholarly/_navigator.py

Lines changed: 59 additions & 26 deletions
@@ -55,9 +55,13 @@ def __init__(self):
         self.logger = logging.getLogger('scholarly')
         self._TIMEOUT = 5
         self._max_retries = 5
-        self._session = None
-        self.pm = ProxyGenerator()
-        self._session = self.pm.get_session()
+        # A Navigator instance has two proxy managers, each with its own session.
+        # `pm1` manages the primary, premium proxy.
+        # `pm2` manages the secondary, inexpensive proxy.
+        self.pm1 = ProxyGenerator()
+        self.pm2 = ProxyGenerator()
+        self._session1 = self.pm1.get_session()
+        self._session2 = self.pm2.get_session()
         self.got_403 = False

@@ -71,60 +75,83 @@ def set_timeout(self, timeout: int):
         if timeout >= 0:
             self._TIMEOUT = timeout

-    def use_proxy(self, pg: ProxyGenerator):
-        if pg is not None:
-            self.pm = pg
+    def use_proxy(self, pg1: ProxyGenerator, pg2: ProxyGenerator = None):
+        if pg1 is not None:
+            self.pm1 = pg1
+
+        if pg2 is not None:
+            self.pm2 = pg2
         else:
-            self.pm = ProxyGenerator()
-        self._session = self.pm.get_session()
+            self.pm2 = ProxyGenerator()
+            proxy_works = self.pm2.FreeProxies()
+            if not proxy_works:
+                self.logger.info("FreeProxy as a secondary proxy is not working. "
+                                 "Using the primary proxy for all requests")
+                self.pm2 = pg1
+
+        self._session1 = self.pm1.get_session()
+        self._session2 = self.pm2.get_session()

-    def _new_session(self):
+    def _new_session(self, premium=True):
         self.got_403 = False
-        self._session = self.pm._new_session()
+        if premium:
+            self._session1 = self.pm1._new_session()
+        else:
+            self._session2 = self.pm2._new_session()


-    def _get_page(self, pagerequest: str) -> str:
+    def _get_page(self, pagerequest: str, premium: bool = False) -> str:
         """Return the data from a webpage

         :param pagerequest: the page url
         :type pagerequest: str
+        :param premium: whether or not to use the premium proxy right away
+        :type premium: bool
         :returns: the text from a webpage
         :rtype: {str}
         :raises: MaxTriesExceededException, DOSException
         """
         self.logger.info("Getting %s", pagerequest)
         resp = None
         tries = 0
-        if self.pm._use_scraperapi:
+        if ("citations?" in pagerequest) and (not premium):
+            pm = self.pm2
+            session = self._session2
+            premium = False
+        else:
+            pm = self.pm1
+            session = self._session1
+            premium = True
+        if pm._use_scraperapi:
             self.set_timeout(60)
         timeout = self._TIMEOUT
         while tries < self._max_retries:
             try:
                 w = random.uniform(1,2)
                 time.sleep(w)
-                resp = self._session.get(pagerequest, timeout=timeout)
-                self.logger.debug("Session proxy config is {}".format(self._session.proxies))
+                resp = session.get(pagerequest, timeout=timeout)
+                self.logger.debug("Session proxy config is {}".format(session.proxies))

                 has_captcha = self._requests_has_captcha(resp.text)

                 if resp.status_code == 200 and not has_captcha:
                     return resp.text
                 elif has_captcha:
                     self.logger.info("Got a captcha request.")
-                    self._session = self.pm._handle_captcha2(pagerequest)
-                    continue # Retry request within same session
+                    session = pm._handle_captcha2(pagerequest)
+                    continue # Retry request within same session
                 elif resp.status_code == 403:
-                    self.logger.info(f"Got an access denied error (403).")
-                    if not self.pm.has_proxy():
+                    self.logger.info("Got an access denied error (403).")
+                    if not pm.has_proxy():
                         self.logger.info("No other connections possible.")
                         if not self.got_403:
                             self.logger.info("Retrying immediately with another session.")
                         else:
-                            if not self.pm._use_luminati:
+                            if not pm._use_luminati:
                                 w = random.uniform(60, 2*60)
                                 self.logger.info("Will retry after {} seconds (with another session).".format(w))
                                 time.sleep(w)
-                        self._new_session()
+                        self._new_session(premium=premium)
                         self.got_403 = True

                         continue # Retry request within same session
@@ -135,7 +162,7 @@ def _get_page(self, pagerequest: str) -> str:
                                      Retrying...""")

             except DOSException:
-                if not self.pm.has_proxy():
+                if not pm.has_proxy():
                     self.logger.info("No other connections possible.")
                     w = random.uniform(60, 2*60)
                     self.logger.info("Will retry after {} seconds (with the same session).".format(w))
@@ -155,8 +182,13 @@ def _get_page(self, pagerequest: str) -> str:
                 self.logger.info("Retrying with a new session.")

             tries += 1
-            self._session, timeout = self.pm.get_next_proxy(num_tries = tries, old_timeout = timeout)
-        raise MaxTriesExceededException("Cannot Fetch from Google Scholar.")
+            session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout)
+
+        # If the secondary proxy does not work, try again with the primary proxy.
+        if not premium:
+            return self._get_page(pagerequest, True)
+        else:
+            raise MaxTriesExceededException("Cannot Fetch from Google Scholar.")


     def _set_retries(self, num_retries: int) -> None:
@@ -178,15 +210,16 @@ def _requests_has_captcha(self, text) -> bool:
             lambda c : f'class="{c}"' in text,
         )

-    def _webdriver_has_captcha(self) -> bool:
+    def _webdriver_has_captcha(self, premium=True) -> bool:
         """Tests whether the current webdriver page contains a captcha.

         :returns: whether or not the site contains a captcha
         :rtype: {bool}
         """
+        pm = self.pm1 if premium else self.pm2
         return self._has_captcha(
-            lambda i : len(self.pm._get_webdriver().find_elements(By.ID, i)) > 0,
-            lambda c : len(self.pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0,
+            lambda i : len(pm._get_webdriver().find_elements(By.ID, i)) > 0,
+            lambda c : len(pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0,
         )

     def _has_captcha(self, got_id, got_class) -> bool:

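Taken together, these changes split traffic by URL: author-citation pages (URLs containing "citations?") are tried on the secondary, inexpensive proxy first, while everything else, including any citation page whose secondary attempt exhausts its retries, goes through the premium proxy. A minimal standalone sketch of that routing rule (the function and parameter names below are illustrative, not part of scholarly):

import requests

# Illustrative sketch of the routing rule in the patched _get_page;
# `premium_session` and `cheap_session` stand in for Navigator._session1
# and Navigator._session2.
def route_request(url: str,
                  premium_session: requests.Session,
                  cheap_session: requests.Session,
                  premium: bool = False):
    """Pick a session the way the patched _get_page does."""
    if ("citations?" in url) and not premium:
        # Citation pages have weaker anti-bot protection, so the
        # inexpensive proxy is tried first.
        return cheap_session, False
    # Everything else goes straight to the premium proxy; a failed
    # secondary attempt re-enters here with premium=True.
    return premium_session, True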
scholarly/_proxy_generator.py

Lines changed: 59 additions & 19 deletions
@@ -62,15 +62,22 @@ def __del__(self):
     def get_session(self):
         return self._session

-    def Luminati(self, usr , passwd, proxy_port):
+    def Luminati(self, usr , passwd, proxy_port, skip_checking_proxy=False):
         """Sets up a Luminati proxy without refreshing capabilities.

+        Note: ``skip_checking_proxy`` is meant to be set to `True` only in
+        unit tests. Applications using this library must always use the default
+        value of `False`.
+
         :param usr: scholarly username, optional by default None
         :type usr: string
         :param passwd: scholarly password, optional by default None
         :type passwd: string
         :param proxy_port: port for the proxy, optional by default None
         :type proxy_port: integer
+        :param skip_checking_proxy: skip checking if the proxy works,
+            optional by default False
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
@@ -83,28 +90,36 @@ def Luminati(self, usr , passwd, proxy_port):
             password = passwd
             port = proxy_port
         else:
-            self.logger.info("Not enough parameters were provided for the Luminati proxy. Reverting to a local connection.")
+            self.logger.warning("Not enough parameters were provided for the Luminati proxy. Reverting to a local connection.")
             return
         session_id = random.random()
         proxy = f"http://{username}-session-{session_id}:{password}@zproxy.lum-superproxy.io:{port}"
-        proxy_works = self._use_proxy(http=proxy, https=proxy)
+        proxy_works = self._use_proxy(http=proxy, https=proxy, skip_checking_proxy=skip_checking_proxy)
         return proxy_works

-    def SingleProxy(self, http = None, https = None):
+    def SingleProxy(self, http = None, https = None, skip_checking_proxy=False):
         """
         Use proxy of your choice
+
+        Note: ``skip_checking_proxy`` is meant to be set to `True` only in
+        unit tests. Applications using this library must always use the default
+        value of `False`.
+
         :param http: http proxy address
         :type http: string
         :param https: https proxy address
         :type https: string
+        :param skip_checking_proxy: skip checking if the proxy works,
+            optional by default False
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}

         :Example::
             pg = ProxyGenerator()
             success = pg.SingleProxy(http = <http proxy address>, https = <https proxy address>)
         """
-        proxy_works = self._use_proxy(http=http,https=https)
+        proxy_works = self._use_proxy(http=http, https=https, skip_checking_proxy=skip_checking_proxy)
         return proxy_works

     def _check_proxy(self, proxies) -> bool:
@@ -152,25 +167,33 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
             self.logger.info(err)
             return (False, None)

-    def _use_proxy(self, http: str, https: str = None) -> bool:
+    def _use_proxy(self, http: str, https: str = None, skip_checking_proxy: bool = False) -> bool:
         """Allows user to set their own proxy for the connection session.
         Sets the proxy, and checks if it works.

         :param http: the http proxy
         :type http: str
         :param https: the https proxy (defaults to the same as http)
         :type https: str
+        :param skip_checking_proxy: Skip checking if the proxy works (defaults to False)
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
         """
         if https is None:
             https = http

         proxies = {'http': http, 'https': https}
-        self._proxy_works = self._check_proxy(proxies)
+        if skip_checking_proxy:
+            self._proxy_works = True
+        else:
+            self._proxy_works = self._check_proxy(proxies)
         # check if the proxy url contains luminati or scraperapi
-        has_luminati = (True if "lum" in http else False)
-        has_scraperapi = (True if "scraperapi" in http else False)
+        if http is not None:
+            has_luminati = (True if "lum" in http else False)
+            has_scraperapi = (True if "scraperapi" in http else False)
+        else:
+            has_luminati, has_scraperapi = False, False
         if self._proxy_works:
             if has_luminati:
                 self.logger.info("Enabling Luminati proxy")
@@ -412,30 +435,42 @@ def _close_session(self):
         if self._webdriver:
             self._webdriver.quit()

-    def FreeProxies(self):
+    def FreeProxies(self, timeout=1):
         """
         Sets up a proxy from the free-proxy library

+        :param timeout: Timeout for the proxy in seconds, optional
+        :type timeout: float
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}

         :Example::
             pg = ProxyGenerator()
             success = pg.FreeProxies()
         """
-        while True:
-            proxy = FreeProxy(rand=True, timeout=1).get()
+        freeproxy = FreeProxy(rand=True, timeout=timeout)
+        # Looping 60000 times gives us an 85% chance that we try each proxy
+        # at least once.
+        for _ in range(60000):
+            proxy = freeproxy.get()
             proxy_works = self._use_proxy(http=proxy, https=proxy)
             if proxy_works:
                 return proxy_works

-    def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
+        self.logger.info("None of the free proxies are working at the moment. "
+                         "Try again after a few minutes.")
+
+    def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False, skip_checking_proxy=False):
         """
         Sets up a proxy using ScraperAPI

         The optional parameters are only for Business and Enterprise plans with
         ScraperAPI. For more details, https://www.scraperapi.com/documentation/

+        Note: ``skip_checking_proxy`` is meant to be set to `True` only in
+        unit tests. Applications using this library must always use the default
+        value of `False`.
+
         :Example::
             pg = ProxyGenerator()
             success = pg.ScraperAPI(API_KEY)
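The "85% chance" in the new FreeProxies comment can be sanity-checked with a coupon-collector estimate. If FreeProxy draws uniformly at random from a pool of N proxies, the probability that 60000 draws try every proxy at least once is roughly (1 - e^(-60000/N))^N, which lands near 85% for a pool of five to six thousand proxies. The pool size is an assumption here, since the patch does not state it:

import math

# Rough coupon-collector estimate behind the "85% chance" comment above.
# Assumes uniform random draws from a pool of `pool_size` proxies; the
# actual pool size used by the free-proxy library is not stated in the patch.
def coverage_probability(pool_size: int, draws: int = 60000) -> float:
    return (1.0 - math.exp(-draws / pool_size)) ** pool_size

for n in (1000, 3000, 5000, 7000):
    print(f"pool={n}: P(every proxy tried) ~ {coverage_probability(n):.2f}")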
@@ -445,6 +480,9 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
         :type country_code: string, optional by default None
         :type premium: bool, optional by default False
         :type render: bool, optional by default False
+        :param skip_checking_proxy: skip checking if the proxy works,
+            optional by default False
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
         """
@@ -460,9 +498,6 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
             r["requestLimit"] = int(r["requestLimit"])
             self.logger.info("Successful ScraperAPI requests %d / %d",
                              r["requestCount"], r["requestLimit"])
-            if r["requestCount"] == r["requestLimit"]:
-                self.logger.warning("ScraperAPI account limit reached.")
-                return False

         # ScraperAPI documentation recommends setting the timeout to 60 seconds
         # so it has had a chance to try out all the retries.
@@ -478,12 +513,17 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
             prefix += ".render=true"

         for _ in range(3):
-            proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001')
+            proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001',
+                                          skip_checking_proxy=skip_checking_proxy)
             if proxy_works:
                 return proxy_works

-        self.logger.warning("ScraperAPI does not seem to work")
-        return proxy_works
+        if (r["requestCount"] >= r["requestLimit"]):
+            self.logger.warning("ScraperAPI account limit reached.")
+        else:
+            self.logger.warning("ScraperAPI does not seem to work. Reason unknown.")
+
+        return False

     def has_proxy(self)-> bool:
         return self._proxy_gen or self._can_refresh_tor

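The new skip_checking_proxy flag threads from the public setup methods (Luminati, SingleProxy, ScraperAPI) down into _use_proxy, so unit tests can configure a proxy without a live connectivity check. A sketch of how a test might use it, assuming a local placeholder proxy address and that _use_proxy returns the stored _proxy_works flag (per the docstrings above, the flag is meant for tests only):

import unittest

from scholarly import ProxyGenerator

class ProxySetupTest(unittest.TestCase):
    # Sketch only: the proxy address is a placeholder and is never
    # contacted, because skip_checking_proxy=True bypasses _check_proxy.
    def test_single_proxy_setup_without_network(self):
        pg = ProxyGenerator()
        works = pg.SingleProxy(http="http://127.0.0.1:3128",
                               https="http://127.0.0.1:3128",
                               skip_checking_proxy=True)
        self.assertTrue(works)

if __name__ == "__main__":
    unittest.main()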
scholarly/_scholarly.py

Lines changed: 21 additions & 3 deletions
@@ -37,14 +37,32 @@ def set_retries(self, num_retries: int)->None:
         return self.__nav._set_retries(num_retries)


-    def use_proxy(self, proxy_generator: ProxyGenerator)->None:
+    def use_proxy(self, proxy_generator: ProxyGenerator,
+                  secondary_proxy_generator: ProxyGenerator = None) -> None:
         """Select which proxy method to use.
+
         See the available ProxyGenerator methods.

-        :param proxy_generator: proxy generator objects
-        :type proxy_generator: ProxyGenerator
+        ``proxy_generator`` is used to fetch pages that have strong anti-bot
+        protection. ``secondary_proxy_generator`` is used for the other pages,
+        which do not have strong anti-bot protection. If it is not set, free
+        proxies are used.
+
+        :param proxy_generator: a proxy generator object, typically set up
+            with a premium proxy service (ScraperAPI or Luminati)
+        :type proxy_generator: ProxyGenerator
+        :param secondary_proxy_generator: a second proxy generator object, optional
+        :type secondary_proxy_generator: ProxyGenerator
+
+        :Example::
+
+        .. testcode::
+
+            pg = ProxyGenerator()
+            pg.ScraperAPI(YOUR_SCRAPER_API_KEY)
+            scholarly.use_proxy(pg)
+
         """
-        self.__nav.use_proxy(proxy_generator)
+        self.__nav.use_proxy(proxy_generator, secondary_proxy_generator)


     def set_logger(self, enable: bool):

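With the new signature, an application can pair a premium generator with an explicit secondary one instead of relying on the FreeProxies fallback inside use_proxy. A short sketch, with a placeholder ScraperAPI key:

from scholarly import scholarly, ProxyGenerator

# Primary: premium proxy for pages with strong anti-bot protection.
pg1 = ProxyGenerator()
pg1.ScraperAPI("YOUR_SCRAPER_API_KEY")  # placeholder key

# Secondary: free proxies for the less protected citation pages.
pg2 = ProxyGenerator()
pg2.FreeProxies(timeout=2)

# If pg2 were omitted, use_proxy would set up FreeProxies itself and
# fall back to pg1 when no free proxy works.
scholarly.use_proxy(pg1, pg2)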
scholarly/author_parser.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def _get_coauthors_long(self, author):
         Opens the dialog box to get the complete list of coauthors.
         To be called by _fill_coauthors method.
         """
-        wd = self.nav.pm._get_webdriver()
+        wd = self.nav.pm2._get_webdriver()
         try:
             wd.get(_COAUTH.format(author['scholar_id']))
             # Wait up to 30 seconds for the various elements to be available.
