 # Custom Python libraries.

-__version__ = "1.6.1"
+__version__ = "1.7.0"

 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -86,8 +86,8 @@ def __init__(
         verify_ssl=True,
         verbosity=5,
         verbose_output=False,
+        google_exemption=None,
     ):
-
         """
         SearchClient
         :param str query: Query string. Must NOT be url-encoded.
@@ -118,6 +118,8 @@ def __init__(
             This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
         :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
+        :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow
+            certain Google searches. Defaults to None.

         :rtype: List of str
         :return: List of URLs found or list of {"rank", "title", "description", "url"}
@@ -142,6 +144,7 @@ def __init__(
         self.verify_ssl = verify_ssl
         self.verbosity = verbosity
         self.verbose_output = verbose_output
+        self.google_exemption = google_exemption

         # Assign log level.
         ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
@@ -151,8 +154,12 @@ def __init__(
             ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.")
             self.num = 100

-        # Initialize cookies to None, will be updated with each request in get_page().
-        self.cookies = None
+        # Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None.
+        # It will be updated with each request in get_page().
+        if self.google_exemption:
+            self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption}
+        else:
+            self.cookies = None

         # Used later to ensure there are not any URL parameter collisions.
         self.url_parameters = (
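For context, a minimal usage sketch of the new parameter. The cookie value below is a made-up placeholder; a real GOOGLE_ABUSE_EXEMPTION value has to be captured from a browser session that has already cleared Google's abuse check:

```python
import yagooglesearch

# google_exemption seeds the client's cookie jar with the GOOGLE_ABUSE_EXEMPTION
# cookie before the first request, per the __init__ change above.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",
    google_exemption="ID=PLACEHOLDER:TM=0:C=r:IP=0.0.0.0-:S=PLACEHOLDER",  # placeholder, not a real value
)
urls = client.search()
```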
@@ -178,7 +185,6 @@ def __init__(

         # Update proxy_dict if a proxy is provided.
         if proxy:
-
             # Standardize case since the scheme will be checked against a hard-coded list.
             self.proxy = proxy.lower()

@@ -321,7 +327,12 @@ def get_page(self, url):

         ROOT_LOGGER.info(f"Requesting URL: {url}")
         response = requests.get(
-            url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl
+            url,
+            proxies=self.proxy_dict,
+            headers=headers,
+            cookies=self.cookies,
+            timeout=15,
+            verify=self.verify_ssl,
         )

         # Update the cookies.
@@ -341,7 +352,6 @@ def get_page(self, url):
         # See https://github.com/benbusby/whoogle-search/issues/311
         try:
             if response.cookies["CONSENT"].startswith("PENDING+"):
-
                 ROOT_LOGGER.warning(
                     "Looks like your IP address is sourcing from a European Union location...your search results may "
                     "vary, but I'll try and work around this by updating the cookie."
@@ -381,7 +391,6 @@ def get_page(self, url):
             html = response.text

         elif http_response_code == 429:
-
             ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.")

             # Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
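When the calling script opts out of the built-in cool-off, it can detect the block itself. A sketch, assuming the yagooglesearch_manages_http_429s flag and the "HTTP_429_DETECTED" sentinel string already exposed by this library:

```python
import yagooglesearch

# Disable the library's own HTTP 429 handling so search() returns immediately
# with a sentinel instead of sleeping through Google's cool-off window.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",
    yagooglesearch_manages_http_429s=False,
)
urls = client.search()

if "HTTP_429_DETECTED" in urls:
    # Back off, rotate the proxy, or retry with a google_exemption cookie.
    print("Google is rate limiting this IP.")
```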
@@ -431,7 +440,6 @@ def search(self):
         # Loop until we reach the maximum number of results found or there are no more search results left to reach
         # max_search_result_urls_to_return.
         while total_valid_links_found <= self.max_search_result_urls_to_return:
-
             ROOT_LOGGER.info(
                 f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / "
                 f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}"
@@ -484,7 +492,6 @@ def search(self):

             # Process every anchored URL.
             for a in anchors:
-
                 # Get the URL from the anchor tag.
                 try:
                     link = a["href"]
@@ -498,7 +505,6 @@ def search(self):
                     continue

                 if self.verbose_output:
-
                     # Extract the URL title.
                     try:
                         title = a.get_text()
@@ -520,7 +526,6 @@ def search(self):

                 # Check if URL has already been found.
                 if link not in self.search_result_list:
-
                     # Increase the counters.
                     valid_links_found_in_this_search += 1
                     total_valid_links_found += 1