 from bs4 import BeautifulSoup as soup

 def get_datas(job, city, job_link):
+    my_data = [job_link]
     try:
         for retry in range(5):
             time.sleep(5)
@@ -19,59 +20,64 @@ def get_datas(job, city, job_link):
                 continue
             else:
                 page_req.raise_for_status()
-                break
-
-        # Parse HTML
-        job_soup = soup(page_req.text, 'html.parser')
-        my_data = [job_link]
-
-        # Topcard scraping
-        for content in job_soup.findAll('div', {'class': 'topcard__content-left'})[0:]:
-
-            # Scraping Organization Names
-            orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
-                    'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
-
-            if orgs['Default-Org'] == []:
-                org = orgs['Flavor-Org'][0]
-                my_data.append(org)
-            else:
-                for org in orgs['Default-Org']:
+                # Parse HTML
+                job_soup = soup(page_req.text, 'html.parser')
+                contents = job_soup.findAll('div', {'class': 'topcard__content-left'})[0:]
+                if len(contents) == 0:
+                    # Topcard not in the response yet: warn, wait and retry
+                    print(f"⚠️ Couldn't retrieve all datas for the job link: {job_link}")
+                    time.sleep(3)
+                    continue
+                else:
+                    break
+
+        if len(contents) != 0:
+            # Topcard scraping
+            for content in contents:
+
+                # Scraping Organization Names
+                orgs = {'Default-Org': [org.text for org in content.findAll('a', {'class': 'topcard__org-name-link topcard__flavor--black-link'})],
+                        'Flavor-Org': [org.text for org in content.findAll('span', {'class': 'topcard__flavor'})]}
+
+                if orgs['Default-Org'] == []:
+                    org = orgs['Flavor-Org'][0]
                     my_data.append(org)
-
-            # Scraping Job Title
-            for title in content.findAll('h1', {'class': 'topcard__title'})[0:]:
-                print(f'\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
-                my_data.append(title.text.replace(',', '.'))
-
-            for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'})[0:]:
-                my_data.append(location.text.replace(',', '.'))
-
-            # Scraping Job Time Posted
-            posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
-                     'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
-
-            if posts['New'] == []:
-                for text in posts['Old']:
-                    my_data.append(text)
-            else:
-                for text in posts['New']:
-                    my_data.append(text)
-
-            # Scraping Number of Applicants Hired
-            applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
-                          'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
-
-            if applicants['Current'] == []:
-                for applicant in applicants['More-Than']:
-                    my_data.append(f'{get_nums(applicant)}+ Applicants')
-            else:
-                for applicant in applicants['Current']:
-                    my_data.append(f'{get_nums(applicant)} Applicants')
-
-        # Criteria scraping
-        for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
-            my_data.append(criteria.text)
+                else:
+                    for org in orgs['Default-Org']:
+                        my_data.append(org)
+
+                # Scraping Job Title
+                for title in content.findAll('h1', {'class': 'topcard__title'})[0:]:
+                    print(f'\033[0;32m📌 {title.text}\033[0m', f'\033[1;33m- {org}\033[0m')
+                    my_data.append(title.text.replace(',', '.'))
+
+                for location in content.findAll('span', {'class': 'topcard__flavor topcard__flavor--bullet'})[0:]:
+                    my_data.append(location.text.replace(',', '.'))
+
+                # Scraping Job Time Posted
+                posts = {'Old': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
+                         'New': [posted.text for posted in content.findAll('span', {'class': 'topcard__flavor--metadata posted-time-ago__text posted-time-ago__text--new'})]}
+
+                if posts['New'] == []:
+                    for text in posts['Old']:
+                        my_data.append(text)
+                else:
+                    for text in posts['New']:
+                        my_data.append(text)
+
+                # Scraping Number of Applicants Hired
+                applicants = {'More-Than': [applicant.text for applicant in content.findAll('figcaption', {'class': 'num-applicants__caption'})],
+                              'Current': [applicant.text for applicant in content.findAll('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'})]}
+
+                if applicants['Current'] == []:
+                    for applicant in applicants['More-Than']:
+                        my_data.append(f'{get_nums(applicant)}+ Applicants')
+                else:
+                    for applicant in applicants['Current']:
+                        my_data.append(f'{get_nums(applicant)} Applicants')
+
+            # Criteria scraping
+            for criteria in job_soup.findAll('span', {'class': 'job-criteria__text job-criteria__text--criteria'})[:4]:
+                my_data.append(criteria.text)

         print("Datas:", my_data)