Skip to content

Commit 2bf1db3

Browse files
Update scrap retries in search jobs formula
Signed-off-by: GuillaumeFalourd <guillaume.falourd@zup.com.br>
1 parent 83555af commit 2bf1db3

File tree

1 file changed

+58
-52
lines changed
  • linkedin/search/jobs/src/classes

1 file changed

+58
-52
lines changed

linkedin/search/jobs/src/classes/scrap.py

Lines changed: 58 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from bs4 import BeautifulSoup as soup
77

88
def get_datas(job, city, job_link):
9+
my_data = [job_link]
910
try:
1011
for retry in range(5):
1112
time.sleep(5)
@@ -19,59 +20,64 @@ def get_datas(job, city, job_link):
1920
continue
2021
else:
2122
page_req.raise_for_status()
22-
break
23-
24-
# Parse HTML
25-
job_soup = soup(page_req.text, 'html.parser')
26-
my_data = [job_link]
27-
28-
# Topcard scraping
29-
for content in job_soup.findAll('div', {'class': 'topcard__content-left'})[0:]:
30-
31-
# Scraping Organization Names
32-
orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
33-
'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
34-
35-
if orgs['Default-Org'] == []:
36-
org = orgs['Flavor-Org'][0]
37-
my_data.append(org)
38-
else:
39-
for org in orgs['Default-Org']:
23+
# Parse HTML
24+
job_soup = soup(page_req.text, 'html.parser')
25+
contents = job_soup.findAll('div', {'class': 'topcard__content-left'})[0:]
26+
if len(contents) == 0:
27+
time.sleep(3)
28+
continue
29+
else:
30+
print(f"⚠️ Couldn't retrieve all datas for the job link: {job_link}")
31+
break
32+
33+
if len(contents) != 0:
34+
# Topcard scraping
35+
for content in contents:
36+
37+
# Scraping Organization Names
38+
orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
39+
'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
40+
41+
if orgs['Default-Org'] == []:
42+
org = orgs['Flavor-Org'][0]
4043
my_data.append(org)
41-
42-
# Scraping Job Title
43-
for title in content.findAll('h1', {'class': 'topcard__title'})[0:]:
44-
print(f'\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
45-
my_data.append(title.text.replace(',', '.'))
46-
47-
for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'})[0:]:
48-
my_data.append(location.text.replace(',', '.'))
49-
50-
# Scraping Job Time Posted
51-
posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
52-
'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
53-
54-
if posts['New'] == []:
55-
for text in posts['Old']:
56-
my_data.append(text)
57-
else:
58-
for text in posts['New']:
59-
my_data.append(text)
60-
61-
# Scraping Number of Applicants Hired
62-
applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
63-
'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
64-
65-
if applicants['Current'] == []:
66-
for applicant in applicants['More-Than']:
67-
my_data.append(f'{get_nums(applicant)}+ Applicants')
68-
else:
69-
for applicant in applicants['Current']:
70-
my_data.append(f'{get_nums(applicant)} Applicants')
71-
72-
# Criteria scraping
73-
for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
74-
my_data.append(criteria.text)
44+
else:
45+
for org in orgs['Default-Org']:
46+
my_data.append(org)
47+
48+
# Scraping Job Title
49+
for title in content.findAll('h1', {'class': 'topcard__title'})[0:]:
50+
print(f'\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
51+
my_data.append(title.text.replace(',', '.'))
52+
53+
for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'})[0:]:
54+
my_data.append(location.text.replace(',', '.'))
55+
56+
# Scraping Job Time Posted
57+
posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
58+
'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
59+
60+
if posts['New'] == []:
61+
for text in posts['Old']:
62+
my_data.append(text)
63+
else:
64+
for text in posts['New']:
65+
my_data.append(text)
66+
67+
# Scraping Number of Applicants Hired
68+
applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
69+
'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
70+
71+
if applicants['Current'] == []:
72+
for applicant in applicants['More-Than']:
73+
my_data.append(f'{get_nums(applicant)}+ Applicants')
74+
else:
75+
for applicant in applicants['Current']:
76+
my_data.append(f'{get_nums(applicant)} Applicants')
77+
78+
# Criteria scraping
79+
for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
80+
my_data.append(criteria.text)
7581

7682
print("Datas:", my_data)
7783

0 commit comments

Comments
 (0)