Skip to content

Commit de560c8

Browse files
author
qingmu.li
committed
feat: Adapt to Google page updates
1 parent 2ff2ce0 commit de560c8

File tree

3 files changed

+21
-16
lines changed

3 files changed

+21
-16
lines changed

crawler.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -42,25 +42,25 @@ def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=
4242
base_url = "https://www.google.com/search?tbm=isch&hl=en"
4343
keywords_str = "&q=" + quote(keywords)
4444
query_url = base_url + keywords_str
45-
45+
4646
if safe_mode is True:
4747
query_url += "&safe=on"
4848
else:
4949
query_url += "&safe=off"
50-
50+
5151
filter_url = "&tbs="
5252

5353
if color is not None:
5454
if color == "bw":
5555
filter_url += "ic:gray%2C"
5656
else:
5757
filter_url += "ic:specific%2Cisc:{}%2C".format(color.lower())
58-
58+
5959
if image_type is not None:
6060
if image_type.lower() == "linedrawing":
6161
image_type = "lineart"
6262
filter_url += "itp:{}".format(image_type)
63-
63+
6464
if face_only is True:
6565
filter_url += "itp:face"
6666

@@ -73,7 +73,10 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
7373
thumb_elements = []
7474
while True:
7575
try:
76-
thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
76+
# old way to get thumb_elements
77+
# thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
78+
# Adapt to the updated Google image search page
79+
thumb_elements = driver.find_elements(By.CSS_SELECTOR, ".H8Rx8c > g-img > img")
7780
my_print("Find {} images.".format(len(thumb_elements)), quiet)
7881
if len(thumb_elements) >= max_number:
7982
break
@@ -90,7 +93,7 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
9093
except Exception as e:
9194
print("Exception ", e)
9295
pass
93-
96+
9497
if len(thumb_elements) == 0:
9598
return []
9699

@@ -109,16 +112,17 @@ def google_image_url_from_webpage(driver, max_number, quiet=False):
109112
print("Error while clicking in thumbnail:", e)
110113
retry_click.append(elem)
111114

112-
if len(retry_click) > 0:
115+
if len(retry_click) > 0:
113116
my_print("Retry some failed clicks ...", quiet)
114117
for elem in retry_click:
115118
try:
116119
if elem.is_displayed() and elem.is_enabled():
117120
elem.click()
118121
except Exception as e:
119122
print("Error while retrying click:", e)
120-
121-
image_elements = driver.find_elements(By.CLASS_NAME, "islib")
123+
124+
# image_elements = driver.find_elements(By.CLASS_NAME, "islib")
125+
image_elements = driver.find_elements(By.CSS_SELECTOR, ".ob5Hkd > a")
122126
image_urls = list()
123127
url_pattern = r"imgurl=\S*&imgrefurl"
124128

@@ -138,10 +142,10 @@ def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=No
138142
filter_url = "&qft="
139143
if face_only is True:
140144
filter_url += "+filterui:face-face"
141-
145+
142146
if image_type is not None:
143147
filter_url += "+filterui:photo-{}".format(image_type)
144-
148+
145149
if color is not None:
146150
if color == "bw" or color == "color":
147151
filter_url += "+filterui:color2-{}".format(color.lower())
@@ -183,7 +187,7 @@ def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False,
183187
proxies = None
184188
if proxy and proxy_type:
185189
proxies = {"http": "{}://{}".format(proxy_type, proxy),
186-
"https": "{}://{}".format(proxy_type, proxy)}
190+
"https": "{}://{}".format(proxy_type, proxy)}
187191
start = 1
188192
image_urls = []
189193
while start <= max_number:
@@ -309,7 +313,7 @@ def process_batch(batch_no, batch_size):
309313

310314

311315
def crawl_image_urls(keywords, engine="Google", max_number=10000,
312-
face_only=False, safe_mode=False, proxy=None,
316+
face_only=False, safe_mode=False, proxy=None,
313317
proxy_type="http", quiet=False, browser="chrome_headless", image_type=None, color=None):
314318
"""
315319
Scrape image urls of keywords from Google Image Search

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
chromedriver-autoinstaller==0.4.0
22
pyinstaller==5.9.0
3-
PyQt5==5.15.9
3+
PyQt5==5.15.10
44
requests==2.31.0
55
selenium==4.8.3

utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def gen_valid_dir_name_for_keywords(keywords):
1313
class AppConfig(object):
1414
def __init__(self):
1515
self.engine = "Google"
16-
16+
1717
self.driver = "chrome_headless"
1818

1919
self.keywords = ""
@@ -33,7 +33,7 @@ def __init__(self):
3333

3434
def to_command_paras(self):
3535
str_paras = ""
36-
36+
3737
str_paras += ' -e ' + self.engine
3838

3939
str_paras += ' -d ' + self.driver
@@ -72,6 +72,7 @@ def gen_keywords_list_from_file(filepath):
7272
def resolve_dependencies(driver=str):
7373
if "chrome" in driver:
7474
print("Checking Google Chrome and chromedriver ...")
75+
# If you have installed Chromium/Chrome and a chromedriver of the same version and still get an error, you can try commenting out the following three lines.
7576
driver_path = chromedriver_autoinstaller.install()
7677
if not driver_path:
7778
return False

0 commit comments

Comments
 (0)