
Commit 955f186

refactor search jobs formula
Signed-off-by: GuillaumeFalourd <guillaume.falourd@zup.com.br>
1 parent adf2b0a commit 955f186

3 files changed: +88 −88 lines

linkedin/search/jobs/src/classes/csv.py

Lines changed: 8 additions & 15 deletions

@@ -16,21 +16,14 @@ def filename(job, city):
     return filename
 
 def generate_file(csv_filename, job, city, links):
-    try:
-        with open(csv_filename, 'w', encoding='utf-8') as f:
-            headers = ['Source', 'Organization', 'Job Title', 'Location', 'Posted', 'Applicants Hired', 'Seniority Level', 'Employment Type', 'Job Function', 'Industry']
-            write = csv.writer(f, dialect='excel')
-            write.writerow(headers)
-
-            for job_link in links:
-                job_datas = scrap.get_datas(job, city, job_link)
-                write.writerows([job_datas])
-
-        print(f'\033[1;33m\n🕵️ Written all information in: {csv_filename}\033[0m')
-
-    except requests.HTTPError as err:
-        print(f'\033[0;31m❌ Something went wrong!\033[0m', err)
-
+    with open(csv_filename, 'w', encoding='utf-8') as f:
+        headers = ['Source', 'Organization', 'Job Title', 'Location', 'Posted', 'Applicants Hired', 'Seniority Level', 'Employment Type', 'Job Function', 'Industry']
+        write = csv.writer(f, dialect='excel')
+        write.writerow(headers)
+
+        for job_link in links:
+            job_datas = scrap.get_datas(job, city, job_link)
+            write.writerows([job_datas])
 
 def check_file(filename):
     for root, dirs, files in os.walk(f'{os.getcwd()}'):
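
With this change, generate_file is only responsible for writing the CSV: the requests.HTTPError handling it used to contain moves into scrap.get_datas (next file), and the success message moves into formula.run (last file). For reference, a minimal standalone sketch of the same csv.writer pattern, with a placeholder row standing in for a scrap.get_datas result (the filename and row values below are illustrative, not from the commit):

import csv

headers = ['Source', 'Organization', 'Job Title', 'Location', 'Posted',
           'Applicants Hired', 'Seniority Level', 'Employment Type',
           'Job Function', 'Industry']

# Placeholder row simulating one scrap.get_datas result (illustrative only)
row = ['https://example.com/job/1', 'Acme', 'Data Engineer', 'Recife',
       '2 days ago', '25 Applicants', 'Entry level', 'Full-time',
       'Engineering', 'Software']

with open('example_jobs.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, dialect='excel')
    writer.writerow(headers)   # header line first
    writer.writerows([row])    # one row per scraped job link, as in generate_file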

linkedin/search/jobs/src/classes/scrap.py

Lines changed: 77 additions & 73 deletions

@@ -6,80 +6,84 @@
 from bs4 import BeautifulSoup as soup
 
 def get_datas(job, city, job_link):
-    for retry in range(5):
-        time.sleep(5)
-        page_req = requests.get(
-            url = job_link,
-            headers = {'User-agent': f'{job}_{city} bot'}
-        )
-        if page_req.status_code == "429":
-            change_ip(random.randint(1, 30))
-            time.sleep(3)
-            continue
-        else:
-            page_req.raise_for_status()
-            break
-
-    # Parse HTML
-    job_soup = soup(page_req.text, 'html.parser')
-    my_data = [job_link]
-
-    # Topcard scraping
-    for content in job_soup.findAll('div', {'class': 'topcard__content-left'})[0:]:
-
-        # Scraping Organization Names
-        orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
-                'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
-
-        if orgs['Default-Org'] == []:
-            org = orgs['Flavor-Org'][0]
-            my_data.append(org)
-        else:
-            for org in orgs['Default-Org']:
+    try:
+        for retry in range(5):
+            time.sleep(5)
+            page_req = requests.get(
+                url = job_link,
+                headers = {'User-agent': f'{job}_{city} bot'}
+            )
+            if page_req.status_code == "429":
+                change_ip(random.randint(1, 30))
+                time.sleep(3)
+                continue
+            else:
+                page_req.raise_for_status()
+                break
+
+        # Parse HTML
+        job_soup = soup(page_req.text, 'html.parser')
+        my_data = [job_link]
+
+        # Topcard scraping
+        for content in job_soup.findAll('div', {'class': 'topcard__content-left'})[0:]:
+
+            # Scraping Organization Names
+            orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
+                    'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
+
+            if orgs['Default-Org'] == []:
+                org = orgs['Flavor-Org'][0]
                 my_data.append(org)
-
-        # Scraping Job Title
-        for title in content.findAll('h1', {'class': 'topcard__title'})[0:]:
-            print(f'\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
-            my_data.append(title.text.replace(',', '.'))
-
-        for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'})[0:]:
-            my_data.append(location.text.replace(',', '.'))
-
-        # Scraping Job Time Posted
-        posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
-                 'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
-
-        if posts['New'] == []:
-            for text in posts['Old']:
-                my_data.append(text)
-        else:
-            for text in posts['New']:
-                my_data.append(text)
-
-        # Scraping Number of Applicants Hired
-        applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
-                      'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
-
-        if applicants['Current'] == []:
-            for applicant in applicants['More-Than']:
-                my_data.append(f'{get_nums(applicant)}+ Applicants')
-        else:
-            for applicant in applicants['Current']:
-                my_data.append(f'{get_nums(applicant)} Applicants')
-
-    # Criteria scraping
-    for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
-        my_data.append(criteria.text)
-
-    print("Datas:", my_data)
-
-    if len(my_data) < 10:
-        fill_number = 10 - len(my_data)
-        for i in range(0, fill_number):
-            my_data.append('')
-            i += 1
-
+            else:
+                for org in orgs['Default-Org']:
+                    my_data.append(org)
+
+            # Scraping Job Title
+            for title in content.findAll('h1', {'class': 'topcard__title'})[0:]:
+                print(f'\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
+                my_data.append(title.text.replace(',', '.'))
+
+            for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'})[0:]:
+                my_data.append(location.text.replace(',', '.'))
+
+            # Scraping Job Time Posted
+            posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
+                     'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
+
+            if posts['New'] == []:
+                for text in posts['Old']:
+                    my_data.append(text)
+            else:
+                for text in posts['New']:
+                    my_data.append(text)
+
+            # Scraping Number of Applicants Hired
+            applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
+                          'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
+
+            if applicants['Current'] == []:
+                for applicant in applicants['More-Than']:
+                    my_data.append(f'{get_nums(applicant)}+ Applicants')
+            else:
+                for applicant in applicants['Current']:
+                    my_data.append(f'{get_nums(applicant)} Applicants')
+
+        # Criteria scraping
+        for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
+            my_data.append(criteria.text)
+
+        print("Datas:", my_data)
+
+        if len(my_data) < 10:
+            fill_number = 10 - len(my_data)
+            for i in range(0, fill_number):
+                my_data.append('')
+                i += 1
+
+    except requests.HTTPError as err:
+        print(f'\033[0;31m❌ Something went wrong!\033[0m', err)
+
     return my_data
 
 def get_nums(string):
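
For context, the retry loop that get_datas now wraps in its try/except follows a common rate-limit pattern: back off and retry on HTTP 429, raise for any other error status so the except clause can report it. A minimal standalone sketch of that pattern, assuming nothing beyond the requests library (fetch_with_retry, the backoff values, and the example user agent are illustrative, not part of the commit; requests exposes status_code as an int, so the sketch compares against the number 429 rather than the string "429"):

import time
import requests

def fetch_with_retry(url, user_agent, retries=5, backoff=3):
    """Fetch url, sleeping and retrying while the server answers HTTP 429."""
    for attempt in range(retries):
        response = requests.get(url, headers={'User-agent': user_agent})
        if response.status_code == 429:   # rate limited: wait, then try again
            time.sleep(backoff)
            continue
        response.raise_for_status()       # raises requests.HTTPError on other 4xx/5xx
        return response
    raise requests.HTTPError(f'Still rate-limited after {retries} attempts: {url}')

# Usage (illustrative values):
# page = fetch_with_retry('https://www.linkedin.com/jobs/view/123', 'python_recife bot')

Centralizing the try/except here keeps csv.generate_file free of network concerns: a failed link now prints the ❌ message from inside get_datas instead of aborting the whole CSV write.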

linkedin/search/jobs/src/formula/formula.py

Lines changed: 3 additions & 0 deletions

@@ -30,10 +30,13 @@ def run(city, profession, send_email, email_receiver, sendgrid_api_key, sendgrid
         print(f"\033[1;36m\n⚠️ Couldn't extract job links list from LinkedIn, try again later!\033[0m")
     else:
         print(f'\033[1;33m\n🕵️ There are {len(links)} available {job} jobs in {city.capitalize()}.\n\033[0m')
+
         # Extract Datas into a CSV file
         csv_filename = csv.filename(job, city)
         csv.generate_file(csv_filename, job, city, job_links)
         csv.check_file(csv_filename)
+
+        print(f'\033[1;33m\n🕵️ Written all information in: {csv_filename}\033[0m')
 
     if send_email == "yes":
         if sendgrid_api_key is not None:
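
Taken together, the refactor splits responsibilities cleanly: formula.run orchestrates and reports, csv.generate_file only writes the file, and scrap.get_datas owns the networking and its error handling. A condensed sketch of the resulting flow in run's else branch (names as in the diff, surrounding code elided):

# Condensed from the diff above (not the full function body):
csv_filename = csv.filename(job, city)                 # build the output filename
csv.generate_file(csv_filename, job, city, job_links)  # header + one row per job link
csv.check_file(csv_filename)                           # verify the file exists on disk
print(f'\033[1;33m\n🕵️ Written all information in: {csv_filename}\033[0m')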
