@@ -42,25 +42,25 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=
42
42
base_url = "https://www.google.com/search?tbm=isch&hl=en"
43
43
keywords_str = "&q=" + quote (keywords )
44
44
query_url = base_url + keywords_str
45
-
45
+
46
46
if safe_mode is True :
47
47
query_url += "&safe=on"
48
48
else :
49
49
query_url += "&safe=off"
50
-
50
+
51
51
filter_url = "&tbs="
52
52
53
53
if color is not None :
54
54
if color == "bw" :
55
55
filter_url += "ic:gray%2C"
56
56
else :
57
57
filter_url += "ic:specific%2Cisc:{}%2C" .format (color .lower ())
58
-
58
+
59
59
if image_type is not None :
60
60
if image_type .lower () == "linedrawing" :
61
61
image_type = "lineart"
62
62
filter_url += "itp:{}" .format (image_type )
63
-
63
+
64
64
if face_only is True :
65
65
filter_url += "itp:face"
66
66
@@ -73,7 +73,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
73
73
thumb_elements = []
74
74
while True :
75
75
try :
76
- thumb_elements = driver .find_elements (By .CLASS_NAME , "rg_i" )
76
+ # old way to get thumb_elements
77
+ # thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
78
+ # Adapt to the updated Google image search page
79
+ thumb_elements = driver .find_elements (By .CSS_SELECTOR , ".H8Rx8c > g-img > img" )
77
80
my_print ("Find {} images." .format (len (thumb_elements )), quiet )
78
81
if len (thumb_elements ) >= max_number :
79
82
break
@@ -90,7 +93,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
90
93
except Exception as e :
91
94
print ("Exception " , e )
92
95
pass
93
-
96
+
94
97
if len (thumb_elements ) == 0 :
95
98
return []
96
99
@@ -109,16 +112,17 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
109
112
print ("Error while clicking in thumbnail:" , e )
110
113
retry_click .append (elem )
111
114
112
- if len (retry_click ) > 0 :
115
+ if len (retry_click ) > 0 :
113
116
my_print ("Retry some failed clicks ..." , quiet )
114
117
for elem in retry_click :
115
118
try :
116
119
if elem .is_displayed () and elem .is_enabled ():
117
120
elem .click ()
118
121
except Exception as e :
119
122
print ("Error while retrying click:" , e )
120
-
121
- image_elements = driver .find_elements (By .CLASS_NAME , "islib" )
123
+
124
+ # image_elements = driver.find_elements(By.CLASS_NAME, "islib")
125
+ image_elements = driver .find_elements (By .CSS_SELECTOR , ".ob5Hkd > a" )
122
126
image_urls = list ()
123
127
url_pattern = r"imgurl=\S*&imgrefurl"
124
128
@@ -138,10 +142,10 @@ def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=No
138
142
filter_url = "&qft="
139
143
if face_only is True :
140
144
filter_url += "+filterui:face-face"
141
-
145
+
142
146
if image_type is not None :
143
147
filter_url += "+filterui:photo-{}" .format (image_type )
144
-
148
+
145
149
if color is not None :
146
150
if color == "bw" or color == "color" :
147
151
filter_url += "+filterui:color2-{}" .format (color .lower ())
@@ -183,7 +187,7 @@ def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False,
183
187
proxies = None
184
188
if proxy and proxy_type :
185
189
proxies = {"http" : "{}://{}" .format (proxy_type , proxy ),
186
- "https" : "{}://{}" .format (proxy_type , proxy )}
190
+ "https" : "{}://{}" .format (proxy_type , proxy )}
187
191
start = 1
188
192
image_urls = []
189
193
while start <= max_number :
@@ -309,7 +313,7 @@ def process_batch(batch_no, batch_size):
309
313
310
314
311
315
def crawl_image_urls (keywords , engine = "Google" , max_number = 10000 ,
312
- face_only = False , safe_mode = False , proxy = None ,
316
+ face_only = False , safe_mode = False , proxy = None ,
313
317
proxy_type = "http" , quiet = False , browser = "chrome_headless" , image_type = None , color = None ):
314
318
"""
315
319
Scrape image urls of keywords from Google Image Search
0 commit comments