12
12
13
13
# Custom Python libraries.
14
14
15
- __version__ = "1.7 .0"
15
+ __version__ = "1.8 .0"
16
16
17
17
# Logging
18
18
ROOT_LOGGER = logging .getLogger ("yagooglesearch" )
38
38
39
39
# Build user_agents_list from user_agents.txt in the install folder, one user agent per line. This is best-effort:
# any failure to read the file, or a file with no usable entries, falls back to the single default USER_AGENT.
try:
    user_agents_file = os.path.join(install_folder, "user_agents.txt")
    # The encoding is explicit so that decoding does not depend on the platform's default locale encoding.
    with open(user_agents_file, "r", encoding="utf-8") as fh:
        # Strip surrounding whitespace and drop blank lines so that an empty string is never offered as a user agent.
        stripped_lines = (line.strip() for line in fh)
        user_agents_list = [user_agent for user_agent in stripped_lines if user_agent]

except Exception:
    user_agents_list = []

# An empty or whitespace-only file would otherwise leave nothing to choose from.
if not user_agents_list:
    user_agents_list = [USER_AGENT]
46
46
47
47
48
# Load the list of result languages. Compiled by viewing the source code at https://www.google.com/advanced_search for
# the supported languages.
#
# result_languages_list ends up holding the text before the first "=" on each line of result_languages.txt
# (presumably the text after "=" is a human-readable language name -- confirm against the data file). If the file
# cannot be read, the list is empty and the problem is reported through ROOT_LOGGER.
try:
    result_languages_file = os.path.join(install_folder, "result_languages.txt")
    # Only the code before "=" is used, so undecodable bytes elsewhere on a line are replaced rather than allowed to
    # discard the whole list.
    with open(result_languages_file, "r", encoding="utf-8", errors="replace") as fh:
        # The lang_result argument is lower-cased before it is checked against this list, so the codes are stored
        # lower-cased as well; otherwise a mixed-case code in the file could never match.
        language_codes = (line.split("=", 1)[0].strip().lower() for line in fh)
        # Blank lines (or lines starting with "=") yield an empty code; skip them so "" is never a valid language.
        result_languages_list = [code for code in language_codes if code]

except Exception as e:
    # Report through the module logger instead of print() so that importing the library does not write to stdout.
    ROOT_LOGGER.error(f"There was an issue loading the result languages file. Exception: {e}")
    result_languages_list = []
58
+
59
+
48
60
def get_tbs (from_date , to_date ):
49
61
"""Helper function to format the tbs parameter dates. Note that verbatim mode also uses the &tbs= parameter, but
50
62
this function is just for customized search periods.
@@ -69,7 +81,8 @@ def __init__(
69
81
self ,
70
82
query ,
71
83
tld = "com" ,
72
- lang = "en" ,
84
+ lang_html_ui = "en" ,
85
+ lang_result = "lang_en" ,
73
86
tbs = "0" ,
74
87
safe = "off" ,
75
88
start = 0 ,
@@ -92,7 +105,8 @@ def __init__(
92
105
SearchClient
93
106
:param str query: Query string. Must NOT be url-encoded.
94
107
:param str tld: Top level domain.
95
- :param str lang: Language.
108
+ :param str lang_html_ui: HTML User Interface language.
109
+ :param str lang_result: Search result language.
96
110
:param str tbs: Verbatim search or time limits (e.g., "qdr:h" => last hour, "qdr:d" => last 24 hours, "qdr:m"
97
111
=> last month).
98
112
:param str safe: Safe search.
@@ -127,7 +141,8 @@ def __init__(
127
141
128
142
self .query = urllib .parse .quote_plus (query )
129
143
self .tld = tld
130
- self .lang = lang
144
+ self .lang_html_ui = lang_html_ui
145
+ self .lang_result = lang_result .lower ()
131
146
self .tbs = tbs
132
147
self .safe = safe
133
148
self .start = start
@@ -150,6 +165,13 @@ def __init__(
150
165
ROOT_LOGGER .setLevel ((6 - self .verbosity ) * 10 )
151
166
152
167
# Argument checks.
168
+ if self .lang_result not in result_languages_list :
169
+ ROOT_LOGGER .error (
170
+ f"{ self .lang_result } is not a valid language result. See { result_languages_file } for the list of valid "
171
+ 'languages. Setting lang_result to "lang_en".'
172
+ )
173
+ self .lang_result = "lang_en"
174
+
153
175
if self .num > 100 :
154
176
ROOT_LOGGER .warning ("The largest value allowed by Google for num is 100. Setting num to 100." )
155
177
self .num = 100
@@ -171,6 +193,7 @@ def __init__(
171
193
"safe" ,
172
194
"start" ,
173
195
"tbs" ,
196
+ "lr" ,
174
197
)
175
198
176
199
# Default user agent, unless instructed by the user to change it.
@@ -215,28 +238,28 @@ def update_urls(self):
215
238
216
239
# First search requesting the default 10 search results.
217
240
self .url_search = (
218
- f"https://www.google.{ self .tld } /search?hl={ self .lang } &"
241
+ f"https://www.google.{ self .tld } /search?hl={ self .lang_html_ui } &lr= { self . lang_result } &"
219
242
f"q={ self .query } &btnG=Google+Search&tbs={ self .tbs } &safe={ self .safe } &"
220
243
f"cr={ self .country } &filter=0"
221
244
)
222
245
223
246
# Subsequent searches starting at &start= and retrieving 10 search results at a time.
224
247
self .url_next_page = (
225
- f"https://www.google.{ self .tld } /search?hl={ self .lang } &"
248
+ f"https://www.google.{ self .tld } /search?hl={ self .lang_html_ui } &lr= { self . lang_result } &"
226
249
f"q={ self .query } &start={ self .start } &tbs={ self .tbs } &safe={ self .safe } &"
227
250
f"cr={ self .country } &filter=0"
228
251
)
229
252
230
253
# First search requesting more than the default 10 search results.
231
254
self .url_search_num = (
232
- f"https://www.google.{ self .tld } /search?hl={ self .lang } &"
255
+ f"https://www.google.{ self .tld } /search?hl={ self .lang_html_ui } &lr= { self . lang_result } &"
233
256
f"q={ self .query } &num={ self .num } &btnG=Google+Search&tbs={ self .tbs } &"
234
257
f"safe={ self .safe } &cr={ self .country } &filter=0"
235
258
)
236
259
237
260
# Subsequent searches starting at &start= and retrieving &num= search results at a time.
238
261
self .url_next_page_num = (
239
- f"https://www.google.{ self .tld } /search?hl={ self .lang } &"
262
+ f"https://www.google.{ self .tld } /search?hl={ self .lang_html_ui } &lr= { self . lang_result } &"
240
263
f"q={ self .query } &start={ self .start } &num={ self .num } &tbs={ self .tbs } &"
241
264
f"safe={ self .safe } &cr={ self .country } &filter=0"
242
265
)
@@ -458,10 +481,8 @@ def search(self):
458
481
url = self .url_search_num
459
482
460
483
# Append extra GET parameters to the URL. This is done on every iteration because we're rebuilding the
461
- # entire URL at the end of this loop.
484
+ # entire URL at the end of this loop. The keys and values are not URL encoded.
462
485
for key , value in self .extra_params .items ():
463
- key = urllib .parse .quote_plus (key )
464
- value = urllib .parse .quote_plus (value )
465
486
url += f"&{ key } ={ value } "
466
487
467
488
# Request Google search results.
0 commit comments