1111
1212
# Inclusive (min, max) bounds used to sanity-check salaries parsed from posts
# tagged as India. Amounts are presumably in INR -- confirm against the parser.
BASE_SALARY_RANGE_INDIA = (200_000, 10_000_000)  # yearly base pay: 2 lakh .. 100 lakh
TOTAL_SALARY_RANGE_INDIA = (200_000, 20_000_000)  # total compensation: 2 lakh .. 200 lakh
# A parsed total compensation larger than this multiple of the base pay is
# treated as invalid.
TOTAL_TO_BASE_MAX_RATIO = 2.5
INTERN_SALARY_RANGE_INDIA = (10_000, 200_000)  # monthly internship pay: 10k .. 2 lakh
1517
1618LABEL_SPECIFICATION = {
2426 "RE_SALARY" : re .compile (r"(salary|base|base pay)\s?[:-]-?\s?(?P<label>[\w\,\₹\$\.\/\-\(\)\`\\u20b9₹\~ ]+)" ),
2527 "RE_LOCATION" : re .compile (r"location\s?[:-]-?\s?(?P<label>[\w\,\` ]+)" ),
2628 "RE_SALARY_TOTAL" : re .compile (
27- r"\ntot?al (1st year\s)?(comp[e|a]nsation|comp|ctc)(\sfor 1st year)?(\s?\(\s?(salary|base).+?\))?(?P<label>.+)"
29+ r"\\ ntot?al (1st year\s)?(comp[e|a]nsation|comp|ctc)(\sfor 1st year)?(\s?\(\s?(salary|base).+?\))?(?P<label>.+)"
2830 ),
2931 "RE_SALARY_CLEAN_LPA" : re .compile (r"(\d{1,3}(\.\d{1,2})?)\s?(lpa|lakh|lac|l)" ),
3032}
@@ -63,16 +65,17 @@ def _find_matches(regex_pattern: Pattern[str], content: str) -> List[str]:
6365
6466
6567def _get_info_as_flat_list (
66- companies : List [str ], titles : List [str ], yoes : List [str ], salaries : List [str ], info : Dict [str , Any ]
68+ companies : List [str ], roles : List [str ], yoes : List [str ], pays : List [ str ], pays_t : List [str ], info : Dict [str , Any ]
6769) -> List [Dict [str , Any ]]:
68- n_info = min ([len (companies ), len (titles ), len (yoes ), len (salaries )])
70+ n_info = min ([len (companies ), len (roles ), len (yoes ), len (pays )])
6971 expanded_info = []
7072 for _ in range (n_info ):
7173 _info = info .copy ()
7274 _info ["company" ] = companies [0 ]
73- _info ["role" ] = titles [0 ]
75+ _info ["role" ] = roles [0 ]
7476 _info ["yoe" ] = yoes [0 ]
75- _info ["salary" ] = salaries [0 ]
77+ _info ["salary" ] = pays [0 ]
78+ _info ["salaryTotal" ] = pays_t [0 ] if pays_t else ""
7679 expanded_info .append (_info )
7780 return expanded_info
7881
@@ -133,6 +136,9 @@ def _report(raw_info: List[Dict[str, Any]]) -> None:
133136 logger .info (f"Posts with Location: { len ([r for r in raw_info if 'country' in r ])} " )
134137 logger .info (f"Posts with YOE: { len ([r for r in raw_info if r ['cleanYoe' ] >= 0 ])} " )
135138 logger .info (f"Posts from India: { len ([r for r in raw_info if 'country' in r and r ['country' ] == 'india' ])} " )
139+ logger .info (
140+ f"Posts with Total Comp: { len ([r for r in raw_info if 'cleanSalaryTotal' in r and r ['cleanSalaryTotal' ] != - 1.0 ])} "
141+ )
136142
137143
138144def _is_valid_yearly_base_pay_from_india (base_pay : float ):
@@ -143,13 +149,23 @@ def _is_valid_monthly_internship_pay_from_india(base_pay: float):
143149 return base_pay >= INTERN_SALARY_RANGE_INDIA [0 ] and base_pay <= INTERN_SALARY_RANGE_INDIA [1 ]
144150
145151
def _is_valid_monthly_total_pay_from_india(base_pay: float):
    """Return True when ``base_pay`` lies inside ``TOTAL_SALARY_RANGE_INDIA``.

    Both bounds are inclusive.
    NOTE(review): despite the "monthly" in the name and the ``base_pay``
    parameter name, this is called with ``cleanSalaryTotal`` values, which
    look like yearly total compensation -- confirm.
    """
    lowest, highest = TOTAL_SALARY_RANGE_INDIA[0], TOTAL_SALARY_RANGE_INDIA[1]
    return lowest <= base_pay <= highest
154+
155+
146156def _filter_invalid_salaries (raw_info : List [Dict [str , Any ]]) -> List [Dict [str , Any ]]:
147157 n_india = 0
148158 n_dropped = 0
149159 filtered_info = []
150160 for r in raw_info :
151161 if "country" in r and r ["country" ] == "india" :
152162 n_india += 1
163+ if "cleanSalaryTotal" in r and r ["cleanSalaryTotal" ] != - 1 :
164+ if not _is_valid_monthly_total_pay_from_india (r ["cleanSalaryTotal" ]):
165+ r ["cleanSalaryTotal" ] = - 1.0
166+ elif r ["cleanSalaryTotal" ] / r ["cleanSalary" ] > TOTAL_TO_BASE_MAX_RATIO :
167+ r ["cleanSalaryTotal" ] = - 1.0
168+
153169 if r ["yrOrPm" ] == "yearly" and not _is_valid_yearly_base_pay_from_india (r ["cleanSalary" ]):
154170 n_dropped += 1
155171 continue
@@ -162,6 +178,28 @@ def _filter_invalid_salaries(raw_info: List[Dict[str, Any]]) -> List[Dict[str, A
162178 return filtered_info
163179
164180
def _add_clean_yoe_and_salaries(expanded_info: List[Dict[str, Any]], info: Dict[str, Any], title: str) -> None:
    """Add cleaned experience and salary fields to every record, in place.

    Every record gets ``cleanYoe``. Records whose ``country`` is ``"india"``
    also get ``cleanSalary`` and ``yrOrPm``; when the base pay is yearly they
    additionally get ``cleanSalaryTotal``, which is ``-1`` unless the parsed
    total is strictly greater than a successfully parsed base pay.

    Args:
        expanded_info: Records of a single post; mutated in place.
        info: Unused; kept so existing callers keep working.
        title: Raw post title, forwarded (pre-processed and lower-cased) to
            the experience parser.
    """
    for offer in expanded_info:
        offer["cleanYoe"] = _get_clean_yoe(
            offer["yoe"].lower(), _preprocess_text(title).lower(), offer["role"].lower()
        )
        if offer.get("country") != "india":
            continue

        # Keep only the text before the first literal backslash-n sequence;
        # split() returns the whole string when that sequence is absent, so no
        # separate membership check is needed.
        base_text = offer["salary"].replace(",", "").lower().split("\\n")[0]
        offer["cleanSalary"], offer["yrOrPm"] = _get_clean_salary_for_india(base_text)
        if offer["yrOrPm"] != "yearly":
            continue

        total_text = offer["salaryTotal"].replace(",", "").lower().split("\\n")[0]
        total_salary, _ = _get_clean_salary_for_india(total_text)
        # A total that does not exceed the base pay (or a base pay that failed
        # to parse) is recorded as missing.
        if offer["cleanSalary"] != -1 and total_salary > offer["cleanSalary"]:
            offer["cleanSalaryTotal"] = total_salary
        else:
            offer["cleanSalaryTotal"] = -1
201+
202+
165203def _get_clean_company_text (company : str ) -> str :
166204 return " " .join (re .findall (r"\w+" , company .lower ()))
167205
@@ -179,6 +217,14 @@ def _add_clean_companies(raw_info: List[Dict[str, Any]]) -> None:
179217 r ["cleanCompany" ] = " " .join ([txt .capitalize () for txt in clean_company .split (" " )])
180218
181219
220+ def _drop_info (raw_info : List [Dict [str , Any ]]) -> None :
221+ for r in raw_info :
222+ try :
223+ del r ["title" ], r ["yoe" ], r ["salary" ], r ["salaryTotal" ], r ["city" ], r ["country" ]
224+ except KeyError :
225+ continue
226+
227+
182228def _save_raw_info (raw_info : List [Dict [str , Any ]]) -> None :
183229 with open ("data/posts_info.json" , "w" ) as f :
184230 json .dump (raw_info , f )
@@ -197,6 +243,9 @@ def _save_meta_info(total_posts: int, raw_info: List[Dict[str, Any]]) -> Dict[st
197243 meta_info = {
198244 "totalPosts" : total_posts ,
199245 "totalPostsFromIndia" : len ([r for r in raw_info if "country" in r and r ["country" ] == "india" ]),
246+ "totalPostsWithTotalComp" : len (
247+ [r for r in raw_info if "cleanSalaryTotal" in r and r ["cleanSalaryTotal" ] != - 1.0 ]
248+ ),
200249 "lastUpdated" : datetime .now ().strftime ("%Y/%m/%d %H:%M:%S" ),
201250 "top20Companies" : top_20 ,
202251 "mostOffersInLastMonth" : most_offers ,
@@ -229,39 +278,28 @@ def parse_posts_and_save_tagged_info() -> None:
229278 roles = _find_matches (LABEL_SPECIFICATION ["RE_ROLE" ], clean_content )
230279 yoes = _find_matches (LABEL_SPECIFICATION ["RE_YOE" ], clean_content )
231280 salaries = _find_matches (LABEL_SPECIFICATION ["RE_SALARY" ], clean_content )
281+ total_salaies = _find_matches (LABEL_SPECIFICATION ["RE_SALARY_TOTAL" ], clean_content )
232282 if companies and roles and yoes and salaries :
233- expanded_info = _get_info_as_flat_list (companies , roles , yoes , salaries , info )
283+ expanded_info = _get_info_as_flat_list (companies , roles , yoes , salaries , total_salaies , info )
234284 location = _get_clean_location (_preprocess_text (r .title ), clean_content )
235285 if location [1 ]:
236286 for info in expanded_info :
237287 info ["city" ] = location [0 ]; info ["country" ] = location [1 ]
238- for info in expanded_info :
239- info ["cleanYoe" ] = _get_clean_yoe (
240- info ["yoe" ].lower (), _preprocess_text (r .title ).lower (), info ["role" ].lower ()
241- )
242- if "country" in info and info ["country" ] == "india" :
243- if "\\ n" in info ["salary" ].replace ("," , "" ).lower ():
244- info ["cleanSalary" ], info ["yrOrPm" ] = _get_clean_salary_for_india (
245- info ["salary" ].replace ("," , "" ).lower ().split ("\\ n" )[0 ]
246- )
247- else :
248- info ["cleanSalary" ], info ["yrOrPm" ] = _get_clean_salary_for_india (
249- info ["salary" ].replace ("," , "" ).lower ()
250- )
288+ _add_clean_yoe_and_salaries (expanded_info , info , r .title )
251289 raw_info += expanded_info
252290 else :
253291 n_dropped += 1
254292 # fmt: on
255-
256293 logger .info (f"Total posts: { total_posts } " )
257294 logger .info (f"N posts dropped (missing data): { n_dropped } " )
258295 _report (raw_info )
259296 raw_info = _filter_invalid_salaries (raw_info )
260297
261298 _add_clean_companies (raw_info )
262299 raw_info = sorted (raw_info , key = lambda x : x ["date" ], reverse = True )
263- _save_raw_info (raw_info )
264300 meta_info = _save_meta_info (total_posts , raw_info )
301+ _drop_info (raw_info )
302+ _save_raw_info (raw_info )
265303 _update_data_in_js (raw_info , meta_info )
266304
267305
0 commit comments