Commit 2383b60

Merge pull request #358 from scholarly-python-package/develop

Releasing 1.4.4

2 parents 2e57d56 + a60090c
File tree

6 files changed: +146 −52 lines changed
scholarly/_navigator.py

Lines changed: 59 additions & 26 deletions
@@ -55,9 +55,13 @@ def __init__(self):
         self.logger = logging.getLogger('scholarly')
         self._TIMEOUT = 5
         self._max_retries = 5
-        self._session = None
-        self.pm = ProxyGenerator()
-        self._session = self.pm.get_session()
+        # A Navigator instance has two proxy managers, each with its own session.
+        # `pm1` manages the primary, premium proxy.
+        # `pm2` manages the secondary, inexpensive proxy.
+        self.pm1 = ProxyGenerator()
+        self.pm2 = ProxyGenerator()
+        self._session1 = self.pm1.get_session()
+        self._session2 = self.pm2.get_session()
         self.got_403 = False

@@ -71,60 +75,83 @@ def set_timeout(self, timeout: int):
         if timeout >= 0:
             self._TIMEOUT = timeout

-    def use_proxy(self, pg: ProxyGenerator):
-        if pg is not None:
-            self.pm = pg
+    def use_proxy(self, pg1: ProxyGenerator, pg2: ProxyGenerator = None):
+        if pg1 is not None:
+            self.pm1 = pg1
+
+        if pg2 is not None:
+            self.pm2 = pg2
         else:
-            self.pm = ProxyGenerator()
-        self._session = self.pm.get_session()
+            self.pm2 = ProxyGenerator()
+            proxy_works = self.pm2.FreeProxies()
+            if not proxy_works:
+                self.logger.info("FreeProxy as a secondary proxy is not working. "
+                                 "Using the primary proxy for all requests")
+                self.pm2 = pg1
+
+        self._session1 = self.pm1.get_session()
+        self._session2 = self.pm2.get_session()

-    def _new_session(self):
+    def _new_session(self, premium=True):
         self.got_403 = False
-        self._session = self.pm._new_session()
+        if premium:
+            self._session1 = self.pm1._new_session()
+        else:
+            self._session2 = self.pm2._new_session()


-    def _get_page(self, pagerequest: str) -> str:
+    def _get_page(self, pagerequest: str, premium: bool = False) -> str:
         """Return the data from a webpage

         :param pagerequest: the page url
         :type pagerequest: str
+        :param premium: whether or not to use the premium proxy right away
+        :type premium: bool
         :returns: the text from a webpage
         :rtype: {str}
         :raises: MaxTriesExceededException, DOSException
         """
         self.logger.info("Getting %s", pagerequest)
         resp = None
         tries = 0
-        if self.pm._use_scraperapi:
+        if ("citations?" in pagerequest) and (not premium):
+            pm = self.pm2
+            session = self._session2
+            premium = False
+        else:
+            pm = self.pm1
+            session = self._session1
+            premium = True
+        if pm._use_scraperapi:
             self.set_timeout(60)
         timeout = self._TIMEOUT
         while tries < self._max_retries:
             try:
                 w = random.uniform(1,2)
                 time.sleep(w)
-                resp = self._session.get(pagerequest, timeout=timeout)
-                self.logger.debug("Session proxy config is {}".format(self._session.proxies))
+                resp = session.get(pagerequest, timeout=timeout)
+                self.logger.debug("Session proxy config is {}".format(session.proxies))

                 has_captcha = self._requests_has_captcha(resp.text)

                 if resp.status_code == 200 and not has_captcha:
                     return resp.text
                 elif has_captcha:
                     self.logger.info("Got a captcha request.")
-                    self._session = self.pm._handle_captcha2(pagerequest)
-                    continue # Retry request within same session
+                    session = pm._handle_captcha2(pagerequest)
+                    continue # Retry request within same session
                 elif resp.status_code == 403:
-                    self.logger.info(f"Got an access denied error (403).")
-                    if not self.pm.has_proxy():
+                    self.logger.info("Got an access denied error (403).")
+                    if not pm.has_proxy():
                         self.logger.info("No other connections possible.")
                         if not self.got_403:
                             self.logger.info("Retrying immediately with another session.")
                         else:
-                            if not self.pm._use_luminati:
+                            if not pm._use_luminati:
                                 w = random.uniform(60, 2*60)
                                 self.logger.info("Will retry after {} seconds (with another session).".format(w))
                                 time.sleep(w)
-                        self._new_session()
+                        self._new_session(premium=premium)
                         self.got_403 = True

                         continue # Retry request within same session
@@ -135,7 +162,7 @@ def _get_page(self, pagerequest: str) -> str:
                                      Retrying...""")

             except DOSException:
-                if not self.pm.has_proxy():
+                if not pm.has_proxy():
                     self.logger.info("No other connections possible.")
                     w = random.uniform(60, 2*60)
                     self.logger.info("Will retry after {} seconds (with the same session).".format(w))
@@ -155,8 +182,13 @@ def _get_page(self, pagerequest: str) -> str:
                 self.logger.info("Retrying with a new session.")

             tries += 1
-            self._session, timeout = self.pm.get_next_proxy(num_tries = tries, old_timeout = timeout)
-        raise MaxTriesExceededException("Cannot Fetch from Google Scholar.")
+            session, timeout = pm.get_next_proxy(num_tries = tries, old_timeout = timeout)
+
+        # If the secondary proxy does not work, try again with the primary proxy.
+        if not premium:
+            return self._get_page(pagerequest, True)
+        else:
+            raise MaxTriesExceededException("Cannot Fetch from Google Scholar.")


     def _set_retries(self, num_retries: int) -> None:
@@ -178,15 +210,16 @@ def _requests_has_captcha(self, text) -> bool:
             lambda c : f'class="{c}"' in text,
         )

-    def _webdriver_has_captcha(self) -> bool:
+    def _webdriver_has_captcha(self, premium=True) -> bool:
         """Tests whether the current webdriver page contains a captcha.

         :returns: whether or not the site contains a captcha
         :rtype: {bool}
         """
+        pm = self.pm1 if premium else self.pm2
         return self._has_captcha(
-            lambda i : len(self.pm._get_webdriver().find_elements(By.ID, i)) > 0,
-            lambda c : len(self.pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0,
+            lambda i : len(pm._get_webdriver().find_elements(By.ID, i)) > 0,
+            lambda c : len(pm._get_webdriver().find_elements(By.CLASS_NAME, c)) > 0,
         )

     def _has_captcha(self, got_id, got_class) -> bool:

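Taken together, these changes split traffic by URL: author-citation pages (URLs containing "citations?") are tried on the secondary, inexpensive proxy first, while everything else, including any citation page whose secondary attempt exhausts its retries, goes through the premium proxy. A minimal standalone sketch of that routing rule (the function and parameter names below are illustrative, not part of scholarly):

import requests

# Illustrative sketch of the routing rule in the patched _get_page;
# `premium_session` and `cheap_session` stand in for Navigator._session1
# and Navigator._session2.
def route_request(url: str,
                  premium_session: requests.Session,
                  cheap_session: requests.Session,
                  premium: bool = False):
    """Pick a session the way the patched _get_page does."""
    if ("citations?" in url) and not premium:
        # Citation pages have weaker anti-bot protection, so the
        # inexpensive proxy is tried first.
        return cheap_session, False
    # Everything else goes straight to the premium proxy; a failed
    # secondary attempt re-enters here with premium=True.
    return premium_session, True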
scholarly/_proxy_generator.py

Lines changed: 59 additions & 19 deletions
@@ -62,15 +62,22 @@ def __del__(self):
     def get_session(self):
         return self._session

-    def Luminati(self, usr , passwd, proxy_port):
+    def Luminati(self, usr , passwd, proxy_port, skip_checking_proxy=False):
         """Sets up a Luminati proxy without refreshing capabilities.

+        Note: ``skip_checking_proxy`` is meant to be set to `True` only in
+        unit tests. Applications using this library must always use the default
+        value of `False`.
+
         :param usr: scholarly username, optional by default None
         :type usr: string
         :param passwd: scholarly password, optional by default None
         :type passwd: string
         :param proxy_port: port for the proxy, optional by default None
         :type proxy_port: integer
+        :param skip_checking_proxy: skip checking if the proxy works,
+            optional by default False
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
@@ -83,28 +90,36 @@ def Luminati(self, usr , passwd, proxy_port):
             password = passwd
             port = proxy_port
         else:
-            self.logger.info("Not enough parameters were provided for the Luminati proxy. Reverting to a local connection.")
+            self.logger.warning("Not enough parameters were provided for the Luminati proxy. Reverting to a local connection.")
             return
         session_id = random.random()
         proxy = f"http://{username}-session-{session_id}:{password}@zproxy.lum-superproxy.io:{port}"
-        proxy_works = self._use_proxy(http=proxy, https=proxy)
+        proxy_works = self._use_proxy(http=proxy, https=proxy, skip_checking_proxy=skip_checking_proxy)
         return proxy_works

-    def SingleProxy(self, http = None, https = None):
+    def SingleProxy(self, http = None, https = None, skip_checking_proxy=False):
         """
         Use proxy of your choice
+
+        Note: ``skip_checking_proxy`` is meant to be set to `True` only in
+        unit tests. Applications using this library must always use the default
+        value of `False`.
+
         :param http: http proxy address
         :type http: string
         :param https: https proxy address
         :type https: string
+        :param skip_checking_proxy: skip checking if the proxy works,
+            optional by default False
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}

         :Example::
             pg = ProxyGenerator()
             success = pg.SingleProxy(http = <http proxy address>, https = <https proxy address>)
         """
-        proxy_works = self._use_proxy(http=http,https=https)
+        proxy_works = self._use_proxy(http=http, https=https, skip_checking_proxy=skip_checking_proxy)
         return proxy_works

     def _check_proxy(self, proxies) -> bool:
@@ -152,25 +167,33 @@ def _refresh_tor_id(self, tor_control_port: int, password: str) -> bool:
             self.logger.info(err)
             return (False, None)

-    def _use_proxy(self, http: str, https: str = None) -> bool:
+    def _use_proxy(self, http: str, https: str = None, skip_checking_proxy: bool = False) -> bool:
         """Allows user to set their own proxy for the connection session.
         Sets the proxy, and checks if it works.

         :param http: the http proxy
         :type http: str
         :param https: the https proxy (defaults to the same as http)
         :type https: str
+        :param skip_checking_proxy: Skip checking if the proxy works (defaults to False)
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
         """
         if https is None:
             https = http

         proxies = {'http': http, 'https': https}
-        self._proxy_works = self._check_proxy(proxies)
+        if skip_checking_proxy:
+            self._proxy_works = True
+        else:
+            self._proxy_works = self._check_proxy(proxies)
         # check if the proxy url contains luminati or scraperapi
-        has_luminati = (True if "lum" in http else False)
-        has_scraperapi = (True if "scraperapi" in http else False)
+        if http is not None:
+            has_luminati = (True if "lum" in http else False)
+            has_scraperapi = (True if "scraperapi" in http else False)
+        else:
+            has_luminati, has_scraperapi = False, False
         if self._proxy_works:
             if has_luminati:
                 self.logger.info("Enabling Luminati proxy")
@@ -412,30 +435,42 @@ def _close_session(self):
         if self._webdriver:
             self._webdriver.quit()

-    def FreeProxies(self):
+    def FreeProxies(self, timeout=1):
         """
         Sets up a proxy from the free-proxy library

+        :param timeout: Timeout for the proxy in seconds, optional
+        :type timeout: float
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}

         :Example::
             pg = ProxyGenerator()
             success = pg.FreeProxies()
         """
-        while True:
-            proxy = FreeProxy(rand=True, timeout=1).get()
+        freeproxy = FreeProxy(rand=True, timeout=timeout)
+        # Looping 60000 times gives us an 85% chance that we try each proxy
+        # at least once.
+        for _ in range(60000):
+            proxy = freeproxy.get()
             proxy_works = self._use_proxy(http=proxy, https=proxy)
             if proxy_works:
                 return proxy_works

-    def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
+        self.logger.info("None of the free proxies are working at the moment. "
+                         "Try again after a few minutes.")
+
+    def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False, skip_checking_proxy=False):
         """
         Sets up a proxy using ScraperAPI

         The optional parameters are only for Business and Enterprise plans with
         ScraperAPI. For more details, https://www.scraperapi.com/documentation/

+        Note: ``skip_checking_proxy`` is meant to be set to `True` only in
+        unit tests. Applications using this library must always use the default
+        value of `False`.
+
         :Example::
             pg = ProxyGenerator()
             success = pg.ScraperAPI(API_KEY)
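The "85% chance" in the new FreeProxies comment can be sanity-checked with a coupon-collector estimate. If FreeProxy draws uniformly at random from a pool of N proxies, the probability that 60000 draws try every proxy at least once is roughly (1 - e^(-60000/N))^N, which lands near 85% for a pool of five to six thousand proxies. The pool size is an assumption here, since the patch does not state it:

import math

# Rough coupon-collector estimate behind the "85% chance" comment above.
# Assumes uniform random draws from a pool of `pool_size` proxies; the
# actual pool size used by the free-proxy library is not stated in the patch.
def coverage_probability(pool_size: int, draws: int = 60000) -> float:
    return (1.0 - math.exp(-draws / pool_size)) ** pool_size

for n in (1000, 3000, 5000, 7000):
    print(f"pool={n}: P(every proxy tried) ~ {coverage_probability(n):.2f}")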
@@ -445,6 +480,9 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
         :type country_code: string, optional by default None
         :type premium: bool, optional by default False
         :type render: bool, optional by default False
+        :param skip_checking_proxy: skip checking if the proxy works,
+            optional by default False
+        :type skip_checking_proxy: bool
         :returns: whether or not the proxy was set up successfully
         :rtype: {bool}
         """
@@ -460,9 +498,6 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
             r["requestLimit"] = int(r["requestLimit"])
             self.logger.info("Successful ScraperAPI requests %d / %d",
                              r["requestCount"], r["requestLimit"])
-            if r["requestCount"] == r["requestLimit"]:
-                self.logger.warning("ScraperAPI account limit reached.")
-                return False

         # ScraperAPI documentation recommends setting the timeout to 60 seconds
         # so it has had a chance to try out all the retries.
@@ -478,12 +513,17 @@ def ScraperAPI(self, API_KEY, country_code=None, premium=False, render=False):
             prefix += ".render=true"

         for _ in range(3):
-            proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001')
+            proxy_works = self._use_proxy(http=f'{prefix}:{API_KEY}@proxy-server.scraperapi.com:8001',
+                                          skip_checking_proxy=skip_checking_proxy)
             if proxy_works:
                 return proxy_works

-        self.logger.warning("ScraperAPI does not seem to work")
-        return proxy_works
+        if (r["requestCount"] >= r["requestLimit"]):
+            self.logger.warning("ScraperAPI account limit reached.")
+        else:
+            self.logger.warning("ScraperAPI does not seem to work. Reason unknown.")
+
+        return False

     def has_proxy(self)-> bool:
         return self._proxy_gen or self._can_refresh_tor

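The new skip_checking_proxy flag threads from the public setup methods (Luminati, SingleProxy, ScraperAPI) down into _use_proxy, so unit tests can configure a proxy without a live connectivity check. A sketch of how a test might use it, assuming a local placeholder proxy address and that _use_proxy returns the stored _proxy_works flag (per the docstrings above, the flag is meant for tests only):

import unittest

from scholarly import ProxyGenerator

class ProxySetupTest(unittest.TestCase):
    # Sketch only: the proxy address is a placeholder and is never
    # contacted, because skip_checking_proxy=True bypasses _check_proxy.
    def test_single_proxy_setup_without_network(self):
        pg = ProxyGenerator()
        works = pg.SingleProxy(http="http://127.0.0.1:3128",
                               https="http://127.0.0.1:3128",
                               skip_checking_proxy=True)
        self.assertTrue(works)

if __name__ == "__main__":
    unittest.main()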
scholarly/_scholarly.py

Lines changed: 21 additions & 3 deletions
@@ -37,14 +37,32 @@ def set_retries(self, num_retries: int)->None:
         return self.__nav._set_retries(num_retries)


-    def use_proxy(self, proxy_generator: ProxyGenerator)->None:
+    def use_proxy(self, proxy_generator: ProxyGenerator,
+                  secondary_proxy_generator: ProxyGenerator = None) -> None:
         """Select which proxy method to use.
+
         See the available ProxyGenerator methods.

-        :param proxy_generator: proxy generator objects
-        :type proxy_generator: ProxyGenerator
+        ``proxy_generator`` is used to fetch pages that have strong anti-bot
+        protection. ``secondary_proxy_generator`` is used for the other pages,
+        which do not have strong anti-bot protection. If it is not set, free
+        proxies are used.
+
+        :param proxy_generator: a proxy generator object, typically set up
+            with a premium proxy service (ScraperAPI or Luminati)
+        :type proxy_generator: ProxyGenerator
+        :param secondary_proxy_generator: a second proxy generator object, optional
+        :type secondary_proxy_generator: ProxyGenerator
+
+        :Example::
+
+        .. testcode::
+
+            pg = ProxyGenerator()
+            pg.ScraperAPI(YOUR_SCRAPER_API_KEY)
+            scholarly.use_proxy(pg)
+
         """
-        self.__nav.use_proxy(proxy_generator)
+        self.__nav.use_proxy(proxy_generator, secondary_proxy_generator)


     def set_logger(self, enable: bool):

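With the new signature, an application can pair a premium generator with an explicit secondary one instead of relying on the FreeProxies fallback inside use_proxy. A short sketch, with a placeholder ScraperAPI key:

from scholarly import scholarly, ProxyGenerator

# Primary: premium proxy for pages with strong anti-bot protection.
pg1 = ProxyGenerator()
pg1.ScraperAPI("YOUR_SCRAPER_API_KEY")  # placeholder key

# Secondary: free proxies for the less protected citation pages.
pg2 = ProxyGenerator()
pg2.FreeProxies(timeout=2)

# If pg2 were omitted, use_proxy would set up FreeProxies itself and
# fall back to pg1 when no free proxy works.
scholarly.use_proxy(pg1, pg2)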
scholarly/author_parser.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def _get_coauthors_long(self, author):
         Opens the dialog box to get the complete list of coauthors.
         To be called by _fill_coauthors method.
         """
-        wd = self.nav.pm._get_webdriver()
+        wd = self.nav.pm2._get_webdriver()
         try:
             wd.get(_COAUTH.format(author['scholar_id']))
             # Wait up to 30 seconds for the various elements to be available.
