From 91ec16f92f1ff7fa16f13049cd574a23d4eb1993 Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 11:39:23 +0530 Subject: [PATCH 1/9] added fetch_qualifications module --- cvscan/__init__.py | 4 +++- cvscan/data/qualifications/degree | 4 ++++ cvscan/details_parser.py | 25 +++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 cvscan/data/qualifications/degree diff --git a/cvscan/__init__.py b/cvscan/__init__.py index 43f4836..e3bc88c 100644 --- a/cvscan/__init__.py +++ b/cvscan/__init__.py @@ -46,6 +46,7 @@ def parse(self): self.experience = dp.calculate_experience(self.raw_text) self.cleaned_resume = lp.clean_resume(self.raw_text) self.skills = lp.fetch_skills(self.cleaned_resume) + self.qualifications = dp.fetch_qualifications(self.raw_text) self.job_positions, self.category = dp.fetch_jobs(self.cleaned_resume) self.current_employers,self.employers = lp.fetch_employers( self.raw_text,self.job_positions) @@ -63,5 +64,6 @@ def show(self): "jobs" : self.job_positions, "job category" : self.category, "employers" : self.employers, - "current_employers" : self.current_employers + "current_employers" : self.current_employers, + "qualifications" : self.qualifications } \ No newline at end of file diff --git a/cvscan/data/qualifications/degree b/cvscan/data/qualifications/degree new file mode 100644 index 0000000..8ee14da --- /dev/null +++ b/cvscan/data/qualifications/degree @@ -0,0 +1,4 @@ +(lp0 +S'B.Tech' +p1 +a. \ No newline at end of file diff --git a/cvscan/details_parser.py b/cvscan/details_parser.py index cce8d75..5803600 100644 --- a/cvscan/details_parser.py +++ b/cvscan/details_parser.py @@ -214,6 +214,7 @@ def get_month_index(month): logging.error('Issue calculating experience: '+str(exception_instance)) return None + """ Utility function that fetches Job Position from the resume. 
@@ -252,3 +253,27 @@ def fetch_jobs(cleaned_resume): hash_jobs['Other'] = -1 return (job_positions,max(hash_jobs,key=hash_jobs.get).capitalize()) + + +""" + +Utility function that fetches degree from the resume. +Params: resume_text Type: string +returns: degree Type: List of strings + +""" +def fetch_qualifications(resume_text): + degree_path = dirpath.PKGPATH + '/data/qualifications/degree' + with open(degree_path, 'rb') as fp: + qualifications = pickle.load(fp) + + degree = [] + for qualification in qualifications: + qual_regex = r'[^a-zA-Z]'+qualification+r'[^a-zA-Z]' + regular_expression = re.compile(qual_regex,re.IGNORECASE) + regex_result = re.search(regular_expression,resume_text) + while regex_result: + degree.append(qualification) + resume_text = resume_text[regex_result.end():] + regex_result = re.search(regular_expression,resume_text) + return degree \ No newline at end of file From b0f9a87f1ad1d32cff17febef6e31a018fec7b0a Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 12:00:45 +0530 Subject: [PATCH 2/9] added data operations and entry points for qualifications --- README.md | 12 ++++++++ cvscan/cli/cli.py | 28 +++++++++++++----- cvscan/data/qualifications/degree | 4 +++ cvscan/data_operations.py | 48 +++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7fde374..cd69c84 100644 --- a/README.md +++ b/README.md @@ -50,3 +50,15 @@ cvscan add --org "Skcript" ``` cvscan remove -o "Skcript" ``` +## Qualifications +Note: +* Qualifications are case-sensitive. 
+* Puntuations before the first and after the last alphabet should be excluded +### add +``` +cvscan add -q "B.S,B.Tech,B.Arch" +``` +### remove +``` +cvscan remove --qual "B.Arch" +``` \ No newline at end of file diff --git a/cvscan/cli/cli.py b/cvscan/cli/cli.py index 1b4589c..6c67ef1 100644 --- a/cvscan/cli/cli.py +++ b/cvscan/cli/cli.py @@ -42,7 +42,8 @@ def parse(name): @click.option('--org','-o',help='Explicitly add organizations') @click.option('--skill','-s',help='Add skills') @click.option('--job','-j',help='For adding jobs: -j ') -def add(org,skill,job): +@click.option('--qual','-q',help="Add qualifications") +def add(org,skill,job,qual): """ Add data to be considered\n @@ -50,6 +51,7 @@ def add(org,skill,job): org Type: comma separated string\n skill Type: comma separated string\n job Type: comma separated string (comma separated - job:category)\n + qual Type: comma separated string\n Usage:\n For adding organization:\n cvscan add --org \n @@ -57,6 +59,9 @@ def add(org,skill,job): cvscan add --skill \n For adding job:\n cvscan add --job \n + For adding qualification:\n + cvscan add --qual \n + punctuations before the first and after the last alphabet are excluded\n The above can be combined together also. 
Eg:\n cvscan add -o -s is also valid @@ -74,13 +79,16 @@ def add(org,skill,job): except Exception: print "Something wnet wrong: " + Exception do.add_jobs(jobs) + if qual: + do.add_qualifications(qual.split(',')) @main.command() @click.option('--org','-o',help='Explicitly remove organizations') @click.option('--skill','-s',help='Remove skills') @click.option('--job','-j',help='For removing jobs -j ') -def remove(org,skill,job): +@click.option('--qual','-q',help="Remove qualifications") +def remove(org,skill,job,qual): """ Remove data from consideration\n @@ -88,15 +96,19 @@ def remove(org,skill,job): org Type: comma separated string\n skill Type: comma separated string\n job Type: comma separated string\n + qual Type: comma separated string\n Usage:\n - For adding organization:\n + For removing organization:\n cvscan remove --org \n - For adding skill:\n + For removing skill:\n cvscan remove --skill \n - For adding job:\n + For removing job:\n cvscan remove --job \n + For removing qualification:\n + cvscan remove -q \n + punctuations before the first and after the last alphabet are excluded\n The above can be combined together also. Eg:\n - cvscan remove -o -s -j + cvscan remove -o -s -j is also valid """ @@ -105,4 +117,6 @@ def remove(org,skill,job): if skill: do.remove_skills(skill.split(',')) if job: - do.remove_jobs(job.split(',')) \ No newline at end of file + do.remove_jobs(job.split(',')) + if qual: + do.remove_qualifications(qual.split(',')) \ No newline at end of file diff --git a/cvscan/data/qualifications/degree b/cvscan/data/qualifications/degree index 8ee14da..798691c 100644 --- a/cvscan/data/qualifications/degree +++ b/cvscan/data/qualifications/degree @@ -1,4 +1,8 @@ (lp0 S'B.Tech' p1 +aS'B.E' +p2 +aS'B.Arch' +p3 a. 
\ No newline at end of file diff --git a/cvscan/data_operations.py b/cvscan/data_operations.py index 3d91d1e..08934a0 100644 --- a/cvscan/data_operations.py +++ b/cvscan/data_operations.py @@ -169,3 +169,51 @@ def remove_jobs(jobs_to_remove): with open(DATAPATH +'job_positions/positions','wb') as fp: pickle.dump(jobs,fp) logging.debug("updated positions file") + + +""" + +An Utility function to add qualification to the degree file. +Params: qualifications Type: List of String +Qualifications are case-sensitive. +Care should be taken with the punctuations. +Exclude punctuations before the first alphabet and after the last alphabet. + +""" +def add_qualifications(quals): + with open(DATAPATH + 'qualifications/degree','rb') as fp: + qualifications = pickle.load(fp) + logging.debug("degree file loaded") + + for qual in quals: + if qual not in qualifications: + qualifications.append(qual) + logging.debug(qual + " added to qualifications") + + with open(DATAPATH + 'qualifications/degree','wb') as fp: + pickle.dump(qualifications, fp) + logging.debug("degree file written") + + +""" + +An Utility function to remove qualification from the degree file. +Params: qualifications Type: List of String +Qualifications are case-sensitive. +Care should be taken with the punctuations. +Exclude punctuations before the first alphabet and after the last alphabet. 
+ +""" +def remove_qualifications(quals): + with open(DATAPATH + 'qualifications/degree','rb') as fp: + qualifications = pickle.load(fp) + logging.debug("degree file loaded") + + for qual in quals: + if qual in qualifications: + qualifications.remove(qual) + logging.debug(qual + " removed from qualifications") + + with open(DATAPATH + 'qualifications/degree','wb') as fp: + pickle.dump(qualifications, fp) + logging.debug("degree file written") \ No newline at end of file From fb1b753681c40e9da2e10fb4e97b43fd62a7032c Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 12:58:52 +0530 Subject: [PATCH 3/9] added degree-info feature --- cvscan/__init__.py | 6 ++++-- cvscan/data/qualifications/degree | 6 ++++++ cvscan/details_parser.py | 7 ++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/cvscan/__init__.py b/cvscan/__init__.py index e3bc88c..08e5004 100644 --- a/cvscan/__init__.py +++ b/cvscan/__init__.py @@ -46,7 +46,8 @@ def parse(self): self.experience = dp.calculate_experience(self.raw_text) self.cleaned_resume = lp.clean_resume(self.raw_text) self.skills = lp.fetch_skills(self.cleaned_resume) - self.qualifications = dp.fetch_qualifications(self.raw_text) + (self.qualifications,self.degree_info) = dp.fetch_qualifications( + self.raw_text) self.job_positions, self.category = dp.fetch_jobs(self.cleaned_resume) self.current_employers,self.employers = lp.fetch_employers( self.raw_text,self.job_positions) @@ -65,5 +66,6 @@ def show(self): "job category" : self.category, "employers" : self.employers, "current_employers" : self.current_employers, - "qualifications" : self.qualifications + "qualifications" : self.qualifications, + "qualifications_info" : self.degree_info } \ No newline at end of file diff --git a/cvscan/data/qualifications/degree b/cvscan/data/qualifications/degree index 798691c..9dca188 100644 --- a/cvscan/data/qualifications/degree +++ b/cvscan/data/qualifications/degree @@ -5,4 +5,10 @@ aS'B.E' p2 aS'B.Arch' p3 +aS'B. 
Tech' +p4 +aS'M.Tech' +p5 +aS'M. Tech' +p6 a. \ No newline at end of file diff --git a/cvscan/details_parser.py b/cvscan/details_parser.py index 5803600..23d73ae 100644 --- a/cvscan/details_parser.py +++ b/cvscan/details_parser.py @@ -268,6 +268,7 @@ def fetch_qualifications(resume_text): qualifications = pickle.load(fp) degree = [] + info = [] for qualification in qualifications: qual_regex = r'[^a-zA-Z]'+qualification+r'[^a-zA-Z]' regular_expression = re.compile(qual_regex,re.IGNORECASE) @@ -275,5 +276,9 @@ def fetch_qualifications(resume_text): while regex_result: degree.append(qualification) resume_text = resume_text[regex_result.end():] + lines = [line.rstrip().lstrip() + for line in resume_text.split('\n') if line.rstrip().lstrip()] + if lines: + info.append(lines[0]) regex_result = re.search(regular_expression,resume_text) - return degree \ No newline at end of file + return degree,info \ No newline at end of file From f5b1c96d8bf1811b8f1b00d4682d9548e183ee3a Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 13:27:10 +0530 Subject: [PATCH 4/9] added extra information feature --- cvscan/__init__.py | 6 +++-- cvscan/data/extra/extra | 0 cvscan/details_parser.py | 50 ++++++++++++++++++++++++++++++++++++--- cvscan/language_parser.py | 24 ------------------- 4 files changed, 51 insertions(+), 29 deletions(-) create mode 100644 cvscan/data/extra/extra diff --git a/cvscan/__init__.py b/cvscan/__init__.py index 08e5004..27ff7ce 100644 --- a/cvscan/__init__.py +++ b/cvscan/__init__.py @@ -45,12 +45,13 @@ def parse(self): self.address = dp.fetch_address(self.raw_text) self.experience = dp.calculate_experience(self.raw_text) self.cleaned_resume = lp.clean_resume(self.raw_text) - self.skills = lp.fetch_skills(self.cleaned_resume) + self.skills = dp.fetch_skills(self.cleaned_resume) (self.qualifications,self.degree_info) = dp.fetch_qualifications( self.raw_text) self.job_positions, self.category = dp.fetch_jobs(self.cleaned_resume) 
self.current_employers,self.employers = lp.fetch_employers( self.raw_text,self.job_positions) + self.extra_info = dp.fetch_extra(self.raw_text) # TODO: Add more fetch here def show(self): @@ -67,5 +68,6 @@ def show(self): "employers" : self.employers, "current_employers" : self.current_employers, "qualifications" : self.qualifications, - "qualifications_info" : self.degree_info + "qualifications_info" : self.degree_info, + "extra_info" : self.extra_info } \ No newline at end of file diff --git a/cvscan/data/extra/extra b/cvscan/data/extra/extra new file mode 100644 index 0000000..e69de29 diff --git a/cvscan/details_parser.py b/cvscan/details_parser.py index 23d73ae..84ba399 100644 --- a/cvscan/details_parser.py +++ b/cvscan/details_parser.py @@ -257,9 +257,30 @@ def fetch_jobs(cleaned_resume): """ -Utility function that fetches degree from the resume. +Utility function that fetches the skills from resume +Params: cleaned_resume Type: string +returns: skill_set Type: List + +""" +def fetch_skills(cleaned_resume): + with open(dirpath.PKGPATH + '/data/skills/skills','rb') as fp: + skills = pickle.load(fp) + + skill_set = [] + for skill in skills: + skill = ' '+skill+' ' + if skill.lower() in cleaned_resume: + skill_set.append(skill) + return skill_set + + +""" + +Utility function that fetches degree and degree-info from the resume. Params: resume_text Type: string -returns: degree Type: List of strings +returns: +degree Type: List of strings +info Type: List of strings """ def fetch_qualifications(resume_text): @@ -281,4 +302,27 @@ def fetch_qualifications(resume_text): if lines: info.append(lines[0]) regex_result = re.search(regular_expression,resume_text) - return degree,info \ No newline at end of file + return degree,info + + +""" + +Utility function that fetches extra information from the resume. 
+Params: resume_text Type: string +returns: extra_information Type: List of strings + +""" +def fetch_extra(resume_text): + with open(dirpath.PKGPATH + '/data/extra/extra', 'rb') as fp: + extra = pickle.load(fp) + + extra_information = [] + for info in extra: + extra_regex = r'[^a-zA-Z]'+info+r'[^a-zA-Z]' + regular_expression = re.compile(extra_regex,re.IGNORECASE) + regex_result = re.search(regular_expression,resume_text) + while regex_result: + extra_information.append(info) + resume_text = resume_text[regex_result.end():] + regex_result = re.search(regular_expression,resume_text) + return extra_information \ No newline at end of file diff --git a/cvscan/language_parser.py b/cvscan/language_parser.py index 97308e8..ad3aef3 100644 --- a/cvscan/language_parser.py +++ b/cvscan/language_parser.py @@ -49,30 +49,6 @@ def clean_resume(resume_text): return cleaned_resume -""" -TODO: move this function to the details parser as stem isn't used - -Utility function that fetches the skills from resume -Params: cleaned_resume Type: string -returns: skill_set Type: List - -""" -def fetch_skills(cleaned_resume): - with open(dirpath.PKGPATH + '/data/skills/skills','rb') as fp: - skills = pickle.load(fp) - - skill_set = [] - for skill in skills: - # stem_skill = skill.split() - # for word in skill: - # stem_skill.append(stemmer.stem(word)) - # stem_skill = ' '.join(stem_skill) - skill = ' '+skill+' ' - if skill.lower() in cleaned_resume: - skill_set.append(skill) - return skill_set - - """ Util function for fetch_employers module to get all the From 1ce58c8f6ea8f16b3ef1a688b43d4e1bcfe07ed1 Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 14:33:36 +0530 Subject: [PATCH 5/9] extra information feature added --- cvscan/cli/cli.py | 17 +++++++++++---- cvscan/data/extra/extra | 2 ++ cvscan/data_operations.py | 46 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/cvscan/cli/cli.py b/cvscan/cli/cli.py index 
6c67ef1..23c9eee 100644 --- a/cvscan/cli/cli.py +++ b/cvscan/cli/cli.py @@ -43,7 +43,8 @@ def parse(name): @click.option('--skill','-s',help='Add skills') @click.option('--job','-j',help='For adding jobs: -j ') @click.option('--qual','-q',help="Add qualifications") -def add(org,skill,job,qual): +@click.option('--extra','-e',help = "Add Extra information") +def add(org,skill,job,qual,extra): """ Add data to be considered\n @@ -62,6 +63,8 @@ def add(org,skill,job,qual): For adding qualification:\n cvscan add --qual \n punctuations before the first and after the last alphabet are excluded\n + For adding extra information:\n + cvscan add --extra \n The above can be combined together also. Eg:\n cvscan add -o -s is also valid @@ -81,14 +84,16 @@ def add(org,skill,job,qual): do.add_jobs(jobs) if qual: do.add_qualifications(qual.split(',')) - + if extra: + do.add_extra(extra.split(',')) @main.command() @click.option('--org','-o',help='Explicitly remove organizations') @click.option('--skill','-s',help='Remove skills') @click.option('--job','-j',help='For removing jobs -j ') @click.option('--qual','-q',help="Remove qualifications") -def remove(org,skill,job,qual): +@click.option('--extra','-e',help = "Remove Extra information") +def remove(org,skill,job,qual,extra): """ Remove data from consideration\n @@ -107,6 +112,8 @@ def remove(org,skill,job,qual): For removing qualification:\n cvscan remove -q \n punctuations before the first and after the last alphabet are excluded\n + For removing extra information:\n + cvscan remove -e \n The above can be combined together also. 
Eg:\n cvscan remove -o -s -j is also valid @@ -119,4 +126,6 @@ def remove(org,skill,job,qual): if job: do.remove_jobs(job.split(',')) if qual: - do.remove_qualifications(qual.split(',')) \ No newline at end of file + do.remove_qualifications(qual.split(',')) + if extra: + do.remove_extra(extra.split(',')) \ No newline at end of file diff --git a/cvscan/data/extra/extra b/cvscan/data/extra/extra index e69de29..eaad8fc 100644 --- a/cvscan/data/extra/extra +++ b/cvscan/data/extra/extra @@ -0,0 +1,2 @@ +(lp0 +. \ No newline at end of file diff --git a/cvscan/data_operations.py b/cvscan/data_operations.py index 08934a0..0aafe9d 100644 --- a/cvscan/data_operations.py +++ b/cvscan/data_operations.py @@ -216,4 +216,48 @@ def remove_qualifications(quals): with open(DATAPATH + 'qualifications/degree','wb') as fp: pickle.dump(qualifications, fp) - logging.debug("degree file written") \ No newline at end of file + logging.debug("degree file written") + + +""" + +An Utility function to add extra information to the extra file. +Params: extra_info Type: List of String +extra_info are case-sensitive. + +""" +def add_extra(extra_info): + with open(DATAPATH + 'extra/extra','rb') as fp: + extra = pickle.load(fp) + logging.debug("extra file loaded") + + for e in extra_info: + if e not in extra: + extra.append(e) + logging.debug(e + " added to extra information") + + with open(DATAPATH + 'extra/extra','wb') as fp: + pickle.dump(extra, fp) + logging.debug("extra file written") + + +""" + +An Utility function to remove extra information from the extra file. +Params: extra_info Type: List of String +Extra informations are case-sensitive. 
+ +""" +def remove_extra(extra_info): + with open(DATAPATH + 'extra/extra','rb') as fp: + extra = pickle.load(fp) + logging.debug("extra file loaded") + + for e in extra_info: + if e in extra: + extra.remove(e) + logging.debug(e + " removed from extra information") + + with open(DATAPATH + 'extra/extra','wb') as fp: + pickle.dump(extra, fp) + logging.debug("extra file written") From b458776539abc18c74d7c13658968103eac0d20e Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 16:47:20 +0530 Subject: [PATCH 6/9] TODOs added --- README.md | 53 +++++++++++++++++- cvscan/__init__.py | 4 +- cvscan/details_parser.py | 68 +++++++++++------------ cvscan/language_parser.py | 113 ++++++++++++++++++++++++++++---------- 4 files changed, 170 insertions(+), 68 deletions(-) diff --git a/README.md b/README.md index cd69c84..c7646c6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# cvscan + +### Attributes +| Attributes | Functions | +|---------------------|-----------| +|path | Stores the path of the resume | +|raw_text | Stores the resume as raw text | +|URLs | Stores all the URLs from the resume | +|name | Applicant's name | +|emails | Applicant's email | +|Phone number | Applicant's contact number | +|address | Applicant's address | +|experience | Applicant's experience in years | +|cleaned_resume | raw_text after removing english stopwords | +|skills | Applicant's skillset | +|qualifications | Applicant's qualifications | +|degree_info | info about qualification | +| + +## configurations.py +Contains the regular expressions used throughout the project +## converter.py +Contains methods to convert resume from input format to raw text +#### pdf_to_text +Uses pdfminer library to fetch raw text from the resume. Special characters and bullets in the resume are replaced with a newline character. +This formatted text from the resume is returned. 
diff --git a/cvscan/__init__.py b/cvscan/__init__.py index 27ff7ce..ca68926 100644 --- a/cvscan/__init__.py +++ b/cvscan/__init__.py @@ -35,7 +35,7 @@ def extract(self): if self.raw_text is not '': self.parse() else: - raise ValueError("Error parsing resume.") + raise ValueError("Error extracting resume text.") def parse(self): self.URLs = annotations_parser.fetch_pdf_urls(self.path) @@ -70,4 +70,4 @@ def show(self): "qualifications" : self.qualifications, "qualifications_info" : self.degree_info, "extra_info" : self.extra_info - } \ No newline at end of file + } diff --git a/cvscan/details_parser.py b/cvscan/details_parser.py index 84ba399..555d131 100644 --- a/cvscan/details_parser.py +++ b/cvscan/details_parser.py @@ -29,7 +29,7 @@ """ def fetch_email(resume_text): try: - regular_expression = re.compile(regex.email,re.IGNORECASE) + regular_expression = re.compile(regex.email, re.IGNORECASE) emails = [] result = re.search(regular_expression, resume_text) while result: @@ -51,7 +51,7 @@ def fetch_email(resume_text): """ def fetch_phone(resume_text): try: - regular_expression = re.compile(regex.get_phone(3,3,10),re.IGNORECASE) + regular_expression = re.compile(regex.get_phone(3, 3, 10), re.IGNORECASE) result = re.search(regular_expression, resume_text) phone = '' if result: @@ -60,9 +60,9 @@ def fetch_phone(resume_text): if part: phone += part if phone is '': - for i in range(1,10): - for j in range(1,10-i): - regular_expression =re.compile(regex.get_phone(i,j,10),re.IGNORECASE) + for i in range(1, 10): + for j in range(1, 10-i): + regular_expression =re.compile(regex.get_phone(i, j, 10), re.IGNORECASE) result = re.search(regular_expression, resume_text) if result: result = result.groups() @@ -82,7 +82,7 @@ def fetch_phone(resume_text): Utility function that fetches address in the resume. 
Params: resume_text type: string -returns: address type:dictionary keys:district,state,pincode +returns: address type:dictionary keys:district, state, pincode """ def fetch_address(resume_text): @@ -99,11 +99,11 @@ def fetch_address(resume_text): with open(pincode_input_path, 'rb') as fp: pincodes = pickle.load(fp) - with open(address_input_path,'rb') as fp: + with open(address_input_path, 'rb') as fp: address = pickle.load(fp) regular_expression = re.compile(regex.pincode) - regex_result = re.search(regular_expression,resume_text) + regex_result = re.search(regular_expression, resume_text) while regex_result: useful_resume_text = resume_text[:regex_result.start()].lower() pincode_tuple = regex_result.group() @@ -119,17 +119,17 @@ def fetch_address(resume_text): result_address.clear() resume_text = resume_text[regex_result.end():] - regex_result = re.search(regular_expression,resume_text) + regex_result = re.search(regular_expression, resume_text) resume_text = initial_resume_text.lower() - with open(states_input,'rb') as fp: + with open(states_input, 'rb') as fp: states = pickle.load(fp) - with open(district_state_input,'rb') as fp: + with open(district_state_input, 'rb') as fp: district_states = pickle.load(fp) # Check if the input is a separate word in resume_text - def if_separate_word(pos,word): + def if_separate_word(pos, word): if (pos != 0) and resume_text[pos-1].isalpha(): return False final_pos = pos+len(word) @@ -143,12 +143,12 @@ def if_separate_word(pos,word): district_pos = len(resume_text) for state in states: pos = resume_text.find(state) - if (pos != -1) and(pos < state_pos) and if_separate_word(pos,state): + if (pos != -1) and(pos < state_pos) and if_separate_word(pos, state): state_pos = pos result_state = state for district in district_states.keys(): pos = resume_text.find(district) - if (pos != -1) and (pos < district_pos) and if_separate_word(pos,district): + if (pos != -1) and (pos < district_pos) and if_separate_word(pos, district): 
district_pos = pos result_district = district if (result_state is '') and (result_district is not ''): @@ -170,7 +170,7 @@ def if_separate_word(pos,word): def calculate_experience(resume_text): # def get_month_index(month): - month_dict = {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12} + month_dict = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6, 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12} return month_dict[month.lower()] try: @@ -179,16 +179,16 @@ def get_month_index(month): start_year = -1 end_month = -1 end_year = -1 - regular_expression = re.compile(regex.date_range,re.IGNORECASE) + regular_expression = re.compile(regex.date_range, re.IGNORECASE) regex_result = re.search(regular_expression, resume_text) while regex_result: date_range = regex_result.group() year_regex = re.compile(regex.year) - year_result = re.search(year_regex,date_range) + year_result = re.search(year_regex, date_range) if (start_year == -1) or (int(year_result.group()) <= start_year): start_year = int(year_result.group()) - month_regex = re.compile(regex.months_short,re.IGNORECASE) - month_result = re.search(month_regex,date_range) + month_regex = re.compile(regex.months_short, re.IGNORECASE) + month_result = re.search(month_regex, date_range) if month_result: current_month = get_month_index(month_result.group()) if (start_month == -1) or (current_month < start_month): @@ -197,11 +197,11 @@ def get_month_index(month): end_month = date.today().month # current month end_year = date.today().year # current year else: - year_result = re.search(year_regex,date_range[year_result.end():]) + year_result = re.search(year_regex, date_range[year_result.end():]) if (end_year == -1) or (int(year_result.group()) >= end_year): end_year = int(year_result.group()) - month_regex = re.compile(regex.months_short,re.IGNORECASE) - month_result = re.search(month_regex,date_range) + month_regex = re.compile(regex.months_short, re.IGNORECASE) + 
month_result = re.search(month_regex, date_range) if month_result: current_month = get_month_index(month_result.group()) if (end_month == -1) or (current_month > end_month): @@ -231,12 +231,12 @@ def fetch_jobs(cleaned_resume): positions = [] for job in jobs.keys(): job_regex = r'[^a-zA-Z]'+job+r'[^a-zA-Z]' - regular_expression = re.compile(job_regex,re.IGNORECASE) - regex_result = re.search(regular_expression,cleaned_resume) + regular_expression = re.compile(job_regex, re.IGNORECASE) + regex_result = re.search(regular_expression, cleaned_resume) if regex_result: positions.append(regex_result.start()) job_positions.append(job.capitalize()) - job_positions = [job for (pos,job) in sorted(zip(positions,job_positions))] + job_positions = [job for (pos, job) in sorted(zip(positions, job_positions))] # For finding the most frequent job category hash_jobs = {} @@ -252,7 +252,7 @@ def fetch_jobs(cleaned_resume): hash_jobs['Student'] = 0 hash_jobs['Other'] = -1 - return (job_positions,max(hash_jobs,key=hash_jobs.get).capitalize()) + return (job_positions, max(hash_jobs, key=hash_jobs.get).capitalize()) """ @@ -263,7 +263,7 @@ def fetch_jobs(cleaned_resume): """ def fetch_skills(cleaned_resume): - with open(dirpath.PKGPATH + '/data/skills/skills','rb') as fp: + with open(dirpath.PKGPATH + '/data/skills/skills', 'rb') as fp: skills = pickle.load(fp) skill_set = [] @@ -292,8 +292,8 @@ def fetch_qualifications(resume_text): info = [] for qualification in qualifications: qual_regex = r'[^a-zA-Z]'+qualification+r'[^a-zA-Z]' - regular_expression = re.compile(qual_regex,re.IGNORECASE) - regex_result = re.search(regular_expression,resume_text) + regular_expression = re.compile(qual_regex, re.IGNORECASE) + regex_result = re.search(regular_expression, resume_text) while regex_result: degree.append(qualification) resume_text = resume_text[regex_result.end():] @@ -301,8 +301,8 @@ def fetch_qualifications(resume_text): for line in resume_text.split('\n') if line.rstrip().lstrip()] if 
lines: info.append(lines[0]) - regex_result = re.search(regular_expression,resume_text) - return degree,info + regex_result = re.search(regular_expression, resume_text) + return degree, info """ @@ -319,10 +319,10 @@ def fetch_extra(resume_text): extra_information = [] for info in extra: extra_regex = r'[^a-zA-Z]'+info+r'[^a-zA-Z]' - regular_expression = re.compile(extra_regex,re.IGNORECASE) - regex_result = re.search(regular_expression,resume_text) + regular_expression = re.compile(extra_regex, re.IGNORECASE) + regex_result = re.search(regular_expression, resume_text) while regex_result: extra_information.append(info) resume_text = resume_text[regex_result.end():] - regex_result = re.search(regular_expression,resume_text) + regex_result = re.search(regular_expression, resume_text) return extra_information \ No newline at end of file diff --git a/cvscan/language_parser.py b/cvscan/language_parser.py index ad3aef3..c848c40 100644 --- a/cvscan/language_parser.py +++ b/cvscan/language_parser.py @@ -34,9 +34,9 @@ def clean_resume(resume_text): cleaned_resume = [] # replacing newlines and punctuations with space - resume_text =resume_text.replace('\t',' ').replace('\n',' ') + resume_text =resume_text.replace('\t', ' ').replace('\n', ' ') for punctuation in string.punctuation: - resume_text = resume_text.replace(punctuation,' ') + resume_text = resume_text.replace(punctuation, ' ') resume_text = resume_text.split() # removing stop words and Stemming the remaining words in the resume @@ -60,26 +60,60 @@ def clean_resume(resume_text): def fetch_all_organizations(resume_text): organizations = set() tokenized_sentences = nltk.sent_tokenize(resume_text) + + # Custom grammar with NLTK + # NP - Noun Phrase + # NN - Noun + # NNP - Proper Noun + # V - Verb + # JJ - Adjective + + # In a sentence that contains NN NNNP V NN NN JJ NN. 
+ # The noun-phrases fetched are: + # NP: NN NNP + # NP: NN NN + # NP: NN + + # Ex, "Application Developer at Delta Force" + # => ["Application Developer", "Delta Force"] + grammar = r"""NP: {+}""" parser = nltk.RegexpParser(grammar) for sentence in tokenized_sentences: + + # tags all parts of speech in the tokenized sentences tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence)) + # then chunks with customize grammar + # np_chunks are instances of class nltk.tree.Tree np_chunks = parser.parse(tagged_words) + + with open(dirpath.PKGPATH + + '/data/organizations/avoid_organizations') as fp: + avoid_organizations = pickle.load(fp) + + noun_phrases = [] for np_chunk in np_chunks: - if isinstance(np_chunk,nltk.tree.Tree) and np_chunk.label() == 'NP': - noun_phrase = ' '.join([org for (org,tag) in np_chunk.leaves()]) - noun_phrases.append(noun_phrase) - # print noun_phrases + if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP': + # if np_chunk is of grammer 'NP' then create a space seperated string of all leaves under the 'NP' tree + noun_phrase = "" + for (org, tag) in np_chunk.leaves(): + noun_phrase += org + ' ' + noun_phrases.append(noun_phrase.rsplit()) + + # Using name entity chunker to get all the organizations chunks = nltk.ne_chunk(tagged_words) for chunk in chunks: - if hasattr(chunk,'label') and chunk.label() == 'ORGANIZATION': - (organization,tag) = chunk[0] + if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION': + (organization, tag) = chunk[0] + + # if organization is in the noun_phrase, it means that there is a high chance of noun_phrase containing the employer name + # eg, Delta Force is added to organizations even if only Delta is recognized as an organization but Delta Force is a noun-phrase for noun_phrase in noun_phrases: - if organization in noun_phrase: + if organization in noun_phrase and organization not in avoid_organizations: organizations.add(noun_phrase.capitalize()) return organizations @@ -101,15 
+135,18 @@ def fetch_employers_util(resume_text, job_positions, organizations, priority): current_employers = [] employers = [] for job in job_positions: + # TODO: remove priority + # TODO: move regex to config job_regex = r'[^a-zA-Z]'+job+r'[^a-zA-Z]' regular_expression = re.compile(job_regex, re.IGNORECASE) temp_resume = resume_text - regex_result = re.search(regular_expression,temp_resume) + regex_result = re.search(regular_expression, temp_resume) while regex_result: # start to end point to a line before and after the job positions line # along with the job line start = regex_result.start() end = regex_result.end() + # TODO put 3 in config lines_front = lines_back = 3 while lines_front != 0 and start != 0: if temp_resume[start] == '.': @@ -119,8 +156,10 @@ def fetch_employers_util(resume_text, job_positions, organizations, priority): if temp_resume[end] == '.': lines_back -= 1 end += 1 + + # Read from temp_resume with start and end as positions line = temp_resume[start:end].lower() - # print line + for org in organizations: if org.lower() in line and org.lower() not in job_positions: if 'present' in line: @@ -129,17 +168,19 @@ def fetch_employers_util(resume_text, job_positions, organizations, priority): employers.remove(org.capitalize()) if org.capitalize() not in current_employers: if priority: - current_employers.insert(0,org.capitalize()) + current_employers.insert(0, org.capitalize()) else: current_employers.append(org.capitalize()) elif org.capitalize() not in employers: if priority: - employers.insert(0,org.capitalize()) + employers.insert(0, org.capitalize()) else: employers.append(org.capitalize()) + temp_resume = temp_resume[end:] - regex_result = re.search(regular_expression,temp_resume) - return (current_employers,employers) + regex_result = re.search(regular_expression, temp_resume) + + return (current_employers, employers) """ @@ -151,34 +192,46 @@ def fetch_employers_util(resume_text, job_positions, organizations, priority): """ def 
fetch_employers(resume_text, job_positions): + + # Cleaning up the text. + # 1. Initially convert all punctuations to '\n' + # 2. Split the resume using '\n' and add non-empty lines to temp_resume + # 3. join the temp_resume using dot-space + for punctuation in string.punctuation: - resume_text = resume_text.replace(punctuation,'\n') - resume_text = '. '.join([x for x in resume_text.split('\n') - if len(x.rstrip().lstrip())!=0]) - with open(dirpath.PKGPATH + - '/data/organizations/avoid_organizations') as fp: - avoid_organizations = pickle.load(fp) + resume_text = resume_text.replace(punctuation, '\n') + + temp_resume = [] + for x in resume_text.split('\n'): + # append only if there is text + if x.rstrip(): + temp_resume.append(x) + + # joined with dot-space + resume_text = '. '.join(temp_resume) current_employers = [] employers = [] - organizations = [org for org in fetch_all_organizations(resume_text) - if org not in avoid_organizations] - cur_emps,emps = fetch_employers_util(resume_text, job_positions, - organizations,False) + organizations = fetch_all_organizations(resume_text) + + cur_emps, emps = fetch_employers_util(resume_text, job_positions, + organizations, False) current_employers.extend(cur_emps) employers.extend(emps) with open(dirpath.PKGPATH + '/data/organizations/explicit_organizations') as fp: organizations = pickle.load(fp) - cur_emps,emps = fetch_employers_util(resume_text, job_positions, - organizations,True) + + cur_emps, emps = fetch_employers_util(resume_text, job_positions, + organizations, True) + current_employers.extend([emp for emp in cur_emps if emp not in current_employers]) employers.extend([emp for emp in emps if emp not in employers]) - return current_employers,employers + return current_employers, employers """ @@ -196,9 +249,9 @@ def fetch_name(resume_text): tokenized_sentences = nltk.sent_tokenize(resume_text) for sentence in tokenized_sentences: for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), 
tagset='universal')): - if hasattr(chunk,'label'):# and chunk.label() == 'PERSON': + if hasattr(chunk, 'label'):# and chunk.label() == 'PERSON': chunk = chunk[0] - (name,tag) = chunk + (name, tag) = chunk if tag == 'NOUN': return name From 43f42e7c6a06b775c17c1629dbf603bd3ebac394 Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 17:04:27 +0530 Subject: [PATCH 7/9] updated README.md --- README.md | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index c7646c6..edcf454 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ - ### Attributes -| Attributes | Functions | +| Attributes | Function | |---------------------|-----------| |path | Stores the path of the resume | |raw_text | Stores the resume as raw text | @@ -98,12 +103,16 @@ cvscan.show() |Phone number | Applicant's contact number | |address | Applicant's address | |experience | Applicant's experience in years | -|cleaned_resume | raw_text after removing english stopwords | +|cleaned_resume | Raw text after removing english stopwords | |skills | Applicant's skillset | |qualifications | Applicant's qualifications | -|degree_info | info about qualification | -| - +|degree_info | Info about qualification | +|job_positions | Applicant's jobs | +|category | Applicant's Job category | +|current_employers | Organization applicant is working in | +|employers | All organizations applicant has worked in | +|extra_info | Extra information about the applicant| + From ca1357f8b684934189040e3ab160242cd8896118 Mon Sep 17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 17:36:41 +0530 Subject: [PATCH 8/9] added utilities file --- cvscan/language_parser.py | 37 +++++++++++-------------------------- cvscan/utilities.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 26 deletions(-) create mode 100644 cvscan/utilities.py diff --git a/cvscan/language_parser.py b/cvscan/language_parser.py index c848c40..aeaf285 100644 
--- a/cvscan/language_parser.py +++ b/cvscan/language_parser.py @@ -12,7 +12,7 @@ from nltk.corpus import stopwords from nltk.stem.snowball import SnowballStemmer -import dirpath +import utilities logging.basicConfig(level=logging.DEBUG) @@ -89,10 +89,7 @@ def fetch_all_organizations(resume_text): # np_chunks are instances of class nltk.tree.Tree np_chunks = parser.parse(tagged_words) - with open(dirpath.PKGPATH + - '/data/organizations/avoid_organizations') as fp: - avoid_organizations = pickle.load(fp) - + avoid_organizations = utilities.get_avoid_organizations() noun_phrases = [] for np_chunk in np_chunks: @@ -131,23 +128,22 @@ def fetch_all_organizations(resume_text): all_employers Type: List of strings """ -def fetch_employers_util(resume_text, job_positions, organizations, priority): +def fetch_employers_util(resume_text, job_positions, organizations): current_employers = [] employers = [] for job in job_positions: - # TODO: remove priority - # TODO: move regex to config job_regex = r'[^a-zA-Z]'+job+r'[^a-zA-Z]' regular_expression = re.compile(job_regex, re.IGNORECASE) temp_resume = resume_text regex_result = re.search(regular_expression, temp_resume) while regex_result: + # start to end point to a line before and after the job positions line # along with the job line start = regex_result.start() end = regex_result.end() - # TODO put 3 in config - lines_front = lines_back = 3 + lines_front = utilities.LINES_FRONT + lines_back = utilities.LINES_BACK while lines_front != 0 and start != 0: if temp_resume[start] == '.': lines_front -= 1 @@ -163,19 +159,12 @@ def fetch_employers_util(resume_text, job_positions, organizations, priority): for org in organizations: if org.lower() in line and org.lower() not in job_positions: if 'present' in line: - # print org if org.capitalize() in employers: employers.remove(org.capitalize()) if org.capitalize() not in current_employers: - if priority: - current_employers.insert(0, org.capitalize()) - else: - 
current_employers.append(org.capitalize()) + current_employers.append(org.capitalize()) elif org.capitalize() not in employers: - if priority: - employers.insert(0, org.capitalize()) - else: - employers.append(org.capitalize()) + employers.append(org.capitalize()) temp_resume = temp_resume[end:] regex_result = re.search(regular_expression, temp_resume) @@ -212,19 +201,15 @@ def fetch_employers(resume_text, job_positions): current_employers = [] employers = [] - organizations = fetch_all_organizations(resume_text) cur_emps, emps = fetch_employers_util(resume_text, job_positions, - organizations, False) + utilities.get_organizations()) + current_employers.extend(cur_emps) employers.extend(emps) - with open(dirpath.PKGPATH + - '/data/organizations/explicit_organizations') as fp: - organizations = pickle.load(fp) - cur_emps, emps = fetch_employers_util(resume_text, job_positions, - organizations, True) + fetch_all_organizations(resume_text)) current_employers.extend([emp for emp in cur_emps if emp not in current_employers]) diff --git a/cvscan/utilities.py b/cvscan/utilities.py new file mode 100644 index 0000000..2ef0a84 --- /dev/null +++ b/cvscan/utilities.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +""" + +Contains all the constants and utility functions used through out the project + +""" + +import pickle + +import dirpath + +__author__ = 'lakshmanaram' +__license__ = 'http://opensource.org/licenses/MIT' +__email__ = 'lakshmanaram.n@gmail.com' +__maintainer__ = 'lakshmanaram' + +# Constants + +LINES_FRONT = 3 +LINES_BACK = 3 + +# Methods + +def get_avoid_organizations(): + with open(dirpath.PKGPATH + + '/data/organizations/avoid_organizations') as fp: + avoid_organizations = pickle.load(fp) + return avoid_organizations + +def get_organizations(): + with open(dirpath.PKGPATH + + '/data/organizations/explicit_organizations') as fp: + organizations = pickle.load(fp) + return organizations \ No newline at end of file From d8c0f4719e3abf1ef3a5c36276721fab251ca511 Mon Sep 
17 00:00:00 2001 From: lakshmanaram Date: Fri, 30 Dec 2016 17:39:55 +0530 Subject: [PATCH 9/9] updated typo --- cvscan/language_parser.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cvscan/language_parser.py b/cvscan/language_parser.py index aeaf285..2edf892 100644 --- a/cvscan/language_parser.py +++ b/cvscan/language_parser.py @@ -80,6 +80,8 @@ def fetch_all_organizations(resume_text): grammar = r"""NP: {<NN|NNP>+}""" parser = nltk.RegexpParser(grammar) + avoid_organizations = utilities.get_avoid_organizations() + for sentence in tokenized_sentences: # tags all parts of speech in the tokenized sentences @@ -88,10 +90,8 @@ def fetch_all_organizations(resume_text): # then chunks with customize grammar # np_chunks are instances of class nltk.tree.Tree np_chunks = parser.parse(tagged_words) - - avoid_organizations = utilities.get_avoid_organizations() - noun_phrases = [] + for np_chunk in np_chunks: if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP': # if np_chunk is of grammer 'NP' then create a space seperated string of all leaves under the 'NP' tree @@ -99,7 +99,7 @@ def fetch_all_organizations(resume_text): for (org, tag) in np_chunk.leaves(): noun_phrase += org + ' ' - noun_phrases.append(noun_phrase.rsplit()) + noun_phrases.append(noun_phrase.rstrip()) # Using name entity chunker to get all the organizations chunks = nltk.ne_chunk(tagged_words) @@ -137,7 +137,7 @@ def fetch_employers_util(resume_text, job_positions, organizations): temp_resume = resume_text regex_result = re.search(regular_expression, temp_resume) while regex_result: - + # start to end point to a line before and after the job positions line # along with the job line start = regex_result.start()