Skip to content

Commit 985487d

Browse files
authored
Merge pull request #21 from ArshansGithub/patch-1
Add GOOGLE_ABUSE_EXEMPTION cookie
2 parents d1c7935 + c4cac70 commit 985487d

File tree

3 files changed

+24
-13
lines changed

3 files changed

+24
-13
lines changed

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,11 @@ for search_query in search_queries:
242242
proxy_rotation_index += 1
243243
```
244244

245+
## GOOGLE_ABUSE_EXEMPTION cookie
246+
247+
If you have a `GOOGLE_ABUSE_EXEMPTION` cookie value, it can be passed into `google_exemption` when instantiating the
248+
`SearchClient` object.
249+
245250
## &tbs= URL filter clarification
246251

247252
The `&tbs=` parameter is used to specify either verbatim or time-based filters.
@@ -291,3 +296,4 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/ops
291296
## Contributors
292297

293298
* [KennBro](https://github.com/KennBro) - <https://github.com/opsdisk/yagooglesearch/pull/9>
299+
* [ArshansGithub](https://github.com/ArshansGithub) - <https://github.com/opsdisk/yagooglesearch/pull/21>

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
beautifulsoup4>=4.9.3
2-
requests>=2.26.0
2+
requests>=2.31.0
33
requests[socks]

yagooglesearch/__init__.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
# Custom Python libraries.
1414

15-
__version__ = "1.6.1"
15+
__version__ = "1.7.0"
1616

1717
# Logging
1818
ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -86,8 +86,8 @@ def __init__(
8686
verify_ssl=True,
8787
verbosity=5,
8888
verbose_output=False,
89+
google_exemption=None,
8990
):
90-
9191
"""
9292
SearchClient
9393
:param str query: Query string. Must NOT be url-encoded.
@@ -118,6 +118,8 @@ def __init__(
118118
This may need to be disabled in some HTTPS proxy instances.
119119
:param int verbosity: Logging and console output verbosity.
120120
:param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
121+
:param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow certain
122+
Google searches. Defaults to None.
121123
122124
:rtype: List of str
123125
:return: List of URLs found or list of {"rank", "title", "description", "url"}
@@ -142,6 +144,7 @@ def __init__(
142144
self.verify_ssl = verify_ssl
143145
self.verbosity = verbosity
144146
self.verbose_output = verbose_output
147+
self.google_exemption = google_exemption
145148

146149
# Assign log level.
147150
ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
@@ -151,8 +154,12 @@ def __init__(
151154
ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.")
152155
self.num = 100
153156

154-
# Initialize cookies to None, will be updated with each request in get_page().
155-
self.cookies = None
157+
# Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None.
158+
# It will be updated with each request in get_page().
159+
if self.google_exemption:
160+
self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption}
161+
else:
162+
self.cookies = None
156163

157164
# Used later to ensure there are not any URL parameter collisions.
158165
self.url_parameters = (
@@ -178,7 +185,6 @@ def __init__(
178185

179186
# Update proxy_dict if a proxy is provided.
180187
if proxy:
181-
182188
# Standardize case since the scheme will be checked against a hard-coded list.
183189
self.proxy = proxy.lower()
184190

@@ -321,7 +327,12 @@ def get_page(self, url):
321327

322328
ROOT_LOGGER.info(f"Requesting URL: {url}")
323329
response = requests.get(
324-
url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl
330+
url,
331+
proxies=self.proxy_dict,
332+
headers=headers,
333+
cookies=self.cookies,
334+
timeout=15,
335+
verify=self.verify_ssl,
325336
)
326337

327338
# Update the cookies.
@@ -341,7 +352,6 @@ def get_page(self, url):
341352
# See https://github.com/benbusby/whoogle-search/issues/311
342353
try:
343354
if response.cookies["CONSENT"].startswith("PENDING+"):
344-
345355
ROOT_LOGGER.warning(
346356
"Looks like your IP address is sourcing from a European Union location...your search results may "
347357
"vary, but I'll try and work around this by updating the cookie."
@@ -381,7 +391,6 @@ def get_page(self, url):
381391
html = response.text
382392

383393
elif http_response_code == 429:
384-
385394
ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.")
386395

387396
# Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
@@ -431,7 +440,6 @@ def search(self):
431440
# Loop until we reach the maximum result results found or there are no more search results found to reach
432441
# max_search_result_urls_to_return.
433442
while total_valid_links_found <= self.max_search_result_urls_to_return:
434-
435443
ROOT_LOGGER.info(
436444
f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / "
437445
f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}"
@@ -484,7 +492,6 @@ def search(self):
484492

485493
# Process every anchored URL.
486494
for a in anchors:
487-
488495
# Get the URL from the anchor tag.
489496
try:
490497
link = a["href"]
@@ -498,7 +505,6 @@ def search(self):
498505
continue
499506

500507
if self.verbose_output:
501-
502508
# Extract the URL title.
503509
try:
504510
title = a.get_text()
@@ -520,7 +526,6 @@ def search(self):
520526

521527
# Check if URL has already been found.
522528
if link not in self.search_result_list:
523-
524529
# Increase the counters.
525530
valid_links_found_in_this_search += 1
526531
total_valid_links_found += 1

0 commit comments

Comments (0)