-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessor.py
43 lines (36 loc) · 1.3 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import tika
tika.initVM()
from tika import parser
import nltk
import re
# Make sure the NLTK English stopword corpus is available before it is
# imported below; fetch it only when the local lookup fails.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # The resource was not found locally, so download it.
    nltk.download('stopwords')
from nltk.corpus import stopwords
def clean(text):
    """Reduce raw text to a space-separated string of unique keywords.

    Every run of characters other than ASCII letters is treated as a
    separator. The remaining words are lower-cased, English stopwords are
    removed, and duplicates are dropped.

    Args:
        text: Raw text to normalize, or None when no text is available.

    Returns:
        The unique remaining words joined by single spaces, in order of
        first appearance. Returns '' (after printing a warning) when
        ``text`` is None.
    """
    if text is None:
        print("Warning: file is not PDF or it's corrupted!")
        return ''

    # Build the stopword set once per call instead of once per word; the
    # membership test below then runs against a ready-made set.
    english_stopwords = set(stopwords.words('english'))

    words = re.sub(r'[^a-zA-Z]+', ' ', text).lower().split()
    kept = [word for word in words if word not in english_stopwords]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible across runs (a plain set has no stable order).
    return " ".join(dict.fromkeys(kept))
def read_pdf(file):
    """Parse a document buffer and return its cleaned keywords.

    Args:
        file: Document content handed unchanged to ``parser.from_buffer``.

    Returns:
        A list of the words produced by ``clean`` for the parsed text. The
        list is empty when the parse result carries no text content.
    """
    parsed = parser.from_buffer(file)
    # Use .get so a parse result without a "content" entry is handled the
    # same way as one whose content is None: clean() warns and yields ''.
    text = parsed.get("content")
    return clean(text).split()
def get_score(skills, resume_text):
    """Return the percentage of required skill words found in a resume.

    Args:
        skills: String listing the required skills. Only purely alphabetic
            words (ASCII letters between word boundaries) are considered.
        resume_text: Container tested with ``in`` for each lower-cased
            skill word, e.g. the word list returned by ``read_pdf``.

    Returns:
        The integer percentage (0-100, rounded down) of skill words that
        are present in ``resume_text``, or 0.0 when ``skills`` contains no
        alphabetic words at all.
    """
    # Regular expression to detect words
    words = re.findall(r'\b[a-zA-Z]+\b', skills)

    # Guard on the extracted words, not on the raw string: input such as
    # "123, ++" is non-empty yet yields no words to divide by.
    if not words:
        return 0.0

    matched_skill_count = sum(
        1 for skill in words if skill.lower() in resume_text
    )

    # Integer arithmetic avoids float truncation errors such as
    # int(29 / 50 * 100) evaluating to 57 instead of 58.
    return matched_skill_count * 100 // len(words)
#print(f"Matching score: {matching_score:.2f}%")
# test = 'C, Python, Java, Git, Github, Django, Machine learning, Shell scripting, Linux, Networking, Operating System, HTML, CSS'
# print(get_score(test,read_pdf('valid_resume.pdf')))