 # Custom Python libraries.

-__version__ = "1.6.1"
+__version__ = "1.7.0"

 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -86,8 +86,8 @@ def __init__(
         verify_ssl=True,
         verbosity=5,
         verbose_output=False,
+        google_exemption=None,
     ):
-
         """
         SearchClient
         :param str query: Query string. Must NOT be url-encoded.
@@ -118,6 +118,8 @@ def __init__(
             This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
         :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
+        :param str google_exemption: Google cookie exemption string. This is a string that Google uses to allow
+            certain Google searches. Defaults to None.

         :rtype: List of str
         :return: List of URLs found or list of {"rank", "title", "description", "url"}
@@ -142,6 +144,7 @@ def __init__(
         self.verify_ssl = verify_ssl
         self.verbosity = verbosity
         self.verbose_output = verbose_output
+        self.google_exemption = google_exemption

         # Assign log level.
         ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
@@ -151,8 +154,12 @@ def __init__(
             ROOT_LOGGER.warning("The largest value allowed by Google for num is 100. Setting num to 100.")
             self.num = 100

-        # Initialize cookies to None, will be updated with each request in get_page().
-        self.cookies = None
+        # Populate cookies with GOOGLE_ABUSE_EXEMPTION if it is provided. Otherwise, initialize cookies to None.
+        # It will be updated with each request in get_page().
+        if self.google_exemption:
+            self.cookies = {"GOOGLE_ABUSE_EXEMPTION": self.google_exemption}
+        else:
+            self.cookies = None

         # Used later to ensure there are not any URL parameter collisions.
         self.url_parameters = (
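For context, a minimal usage sketch of the new parameter. The cookie value below is a made-up placeholder; a real GOOGLE_ABUSE_EXEMPTION value has to be captured from a browser session that has already cleared Google's abuse check:

```python
import yagooglesearch

# google_exemption seeds the client's cookie jar with the GOOGLE_ABUSE_EXEMPTION
# cookie before the first request, per the __init__ change above.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",
    google_exemption="ID=PLACEHOLDER:TM=0:C=r:IP=0.0.0.0-:S=PLACEHOLDER",  # placeholder, not a real value
)
urls = client.search()
```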
@@ -178,7 +185,6 @@ def __init__(

         # Update proxy_dict if a proxy is provided.
         if proxy:
-
             # Standardize case since the scheme will be checked against a hard-coded list.
             self.proxy = proxy.lower()

@@ -321,7 +327,12 @@ def get_page(self, url):

         ROOT_LOGGER.info(f"Requesting URL: {url}")
         response = requests.get(
-            url, proxies=self.proxy_dict, headers=headers, cookies=self.cookies, timeout=15, verify=self.verify_ssl
+            url,
+            proxies=self.proxy_dict,
+            headers=headers,
+            cookies=self.cookies,
+            timeout=15,
+            verify=self.verify_ssl,
         )

         # Update the cookies.
@@ -341,7 +352,6 @@ def get_page(self, url):
         # See https://github.com/benbusby/whoogle-search/issues/311
         try:
             if response.cookies["CONSENT"].startswith("PENDING+"):
-
                 ROOT_LOGGER.warning(
                     "Looks like your IP address is sourcing from a European Union location...your search results may "
                     "vary, but I'll try and work around this by updating the cookie."
@@ -381,7 +391,6 @@ def get_page(self, url):
             html = response.text

         elif http_response_code == 429:
-
             ROOT_LOGGER.warning("Google is blocking your IP for making too many requests in a specific time period.")

             # Calling script does not want yagooglesearch to handle HTTP 429 cool off and retry. Just return a
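When the calling script opts out of the built-in cool-off, it can detect the block itself. A sketch, assuming the yagooglesearch_manages_http_429s flag and the "HTTP_429_DETECTED" sentinel string already exposed by this library:

```python
import yagooglesearch

# Disable the library's own HTTP 429 handling so search() returns immediately
# with a sentinel instead of sleeping through Google's cool-off window.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",
    yagooglesearch_manages_http_429s=False,
)
urls = client.search()

if "HTTP_429_DETECTED" in urls:
    # Back off, rotate the proxy, or retry with a google_exemption cookie.
    print("Google is rate limiting this IP.")
```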
@@ -431,7 +440,6 @@ def search(self):
         # Loop until we reach the maximum number of results found or there are no more search results left to reach
         # max_search_result_urls_to_return.
         while total_valid_links_found <= self.max_search_result_urls_to_return:
-
             ROOT_LOGGER.info(
                 f"Stats: start={self.start}, num={self.num}, total_valid_links_found={total_valid_links_found} / "
                 f"max_search_result_urls_to_return={self.max_search_result_urls_to_return}"
@@ -484,7 +492,6 @@ def search(self):

             # Process every anchored URL.
             for a in anchors:
-
                 # Get the URL from the anchor tag.
                 try:
                     link = a["href"]
@@ -498,7 +505,6 @@ def search(self):
                     continue

                 if self.verbose_output:
-
                     # Extract the URL title.
                     try:
                         title = a.get_text()
@@ -520,7 +526,6 @@ def search(self):

                 # Check if URL has already been found.
                 if link not in self.search_result_list:
-
                     # Increase the counters.
                     valid_links_found_in_this_search += 1
                     total_valid_links_found += 1