Spaces:
Sleeping
Sleeping
Jeet Paul
committed on
Commit
·
2bb2e2d
1
Parent(s):
e3fd35d
Upload 2 files
Browse files- nrp.py +173 -0
- requirements.txt +6 -3
nrp.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import nltk
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from nltk.tokenize import word_tokenize
|
5 |
+
from nltk.stem import PorterStemmer
|
6 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
+
from PyPDF2 import PdfReader
|
9 |
+
import os
|
10 |
+
from io import BytesIO
|
11 |
+
import pickle
|
12 |
+
import pdfminer
|
13 |
+
from pdfminer.high_level import extract_text
|
14 |
+
import re
|
15 |
+
import PyPDF2
|
16 |
+
import docx
|
17 |
+
import textract
|
18 |
+
|
19 |
+
nltk.download('punkt')
|
20 |
+
nltk.download('stopwords')
|
21 |
+
|
22 |
+
def preprocess_text(text):
    """Normalize raw text for TF-IDF comparison.

    Lowercases and tokenizes the input, drops English stopwords, stems the
    remaining tokens with a Porter stemmer, and returns them re-joined into a
    single space-separated string.
    """
    stemmer = PorterStemmer()
    english_stopwords = set(stopwords.words('english'))

    processed_tokens = (
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        if token not in english_stopwords
    )
    return ' '.join(processed_tokens)
|
32 |
+
|
33 |
+
import textract
|
34 |
+
import tempfile
|
35 |
+
|
36 |
+
import fitz # PyMuPDF
|
37 |
+
|
38 |
+
def extract_text_from_pdf(pdf_content):
    """Extract plain text from PDF file bytes using PyMuPDF (fitz).

    ``pdf_content`` is the raw byte content of a PDF; the text of every page
    is concatenated and returned as one string.
    """
    document = fitz.open(stream=pdf_content, filetype="pdf")
    page_texts = [document[index].get_text() for index in range(document.page_count)]
    document.close()
    return "".join(page_texts)
|
46 |
+
|
47 |
+
from docx import Document
|
48 |
+
|
49 |
+
def extract_text_from_docx(docx_content):
    """Extract plain text from DOCX file bytes.

    Joins the text of every paragraph in the document with a single space.
    """
    document = Document(BytesIO(docx_content))
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return " ".join(paragraph_texts)
|
53 |
+
|
54 |
+
|
55 |
+
def extract_text_from_txt(txt_content):
    """Extract text from the raw bytes of a plain-text file.

    Bug fixed: the previous implementation called
    ``textract.process(input_filename=None, input_bytes=txt_content)``, but
    ``textract.process`` accepts only a file path and has no such keyword
    arguments, so the call always raised.  Decoding the bytes directly also
    matches the ``.txt`` handling in ``rank_and_shortlist``.
    """
    # errors='ignore' keeps best-effort behavior on non-UTF-8 input.
    return txt_content.decode('utf-8', errors='ignore')
|
58 |
+
|
59 |
+
|
60 |
+
def extract_text_from_resume(file_path):
    """Extract text from a resume file on disk, dispatching on its extension.

    Supported extensions: pdf, docx, txt.  Raises ValueError for anything else.

    Bug fixed: the extractor helpers expect the *bytes* of the file
    (``fitz.open(stream=...)``, ``Document(BytesIO(...))``), but this function
    previously passed the path string itself, so every branch failed.  The
    file is now read in binary mode before dispatch.
    """
    file_extension = file_path.split('.')[-1].lower()

    if file_extension not in ('pdf', 'docx', 'txt'):
        raise ValueError(f"Unsupported file format: {file_extension}")

    with open(file_path, 'rb') as resume_file:
        content = resume_file.read()

    if file_extension == 'pdf':
        return extract_text_from_pdf(content)
    if file_extension == 'docx':
        return extract_text_from_docx(content)
    return extract_text_from_txt(content)
|
71 |
+
|
72 |
+
def clean_pdf_text(text):
    """Scrub extracted resume text before similarity scoring.

    Removes URLs, RT/cc markers, hashtags, @-mentions, punctuation, and
    non-ASCII characters, then collapses whitespace runs to single spaces.

    Fix: all regex patterns are now raw strings.  The originals used plain
    strings containing ``\\S``/``\\s``, which are invalid escape sequences
    (SyntaxWarning since Python 3.12, slated to become errors).  Matching
    behavior is unchanged.
    """
    text = re.sub(r'http\S+\s*', ' ', text)   # URLs (and trailing whitespace)
    text = re.sub(r'RT|cc', ' ', text)        # retweet / cc markers
    text = re.sub(r'#\S+', '', text)          # hashtags
    text = re.sub(r'@\S+', ' ', text)         # mentions
    # ASCII punctuation -> spaces.
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # non-ASCII characters
    text = re.sub(r'\s+', ' ', text)           # collapse whitespace
    return text
|
81 |
+
|
82 |
+
def extract_candidate_name(text):
    """Return the first 'Firstname Lastname' match in *text*.

    An optional courtesy title (Mr./Ms./Mrs.) is included in the match when
    present.  Returns the placeholder string "Candidate Name Not Found" when
    no match exists.
    """
    name_pattern = re.compile(r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)')
    found = name_pattern.search(text)
    return found.group(0) if found else "Candidate Name Not Found"
|
88 |
+
|
89 |
+
def calculate_similarity(job_description, cvs, cv_file_names):
    """Rank CV texts against a job description via TF-IDF cosine similarity.

    Returns a list of (file_name, score) tuples sorted best-first.
    """
    # Row 0 of the corpus is the job description; rows 1..n are the CVs.
    corpus = [preprocess_text(job_description)]
    corpus.extend(preprocess_text(cv) for cv in cvs)

    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)
    scores = cosine_similarity(tfidf_matrix)[0][1:]

    return sorted(zip(cv_file_names, scores), key=lambda pair: pair[1], reverse=True)
|
105 |
+
|
106 |
+
def rank_and_shortlist(job_description, cv_files, threshold=0.15):
    """Read uploaded resumes, score them against the job description, and
    return both the full ranking and a shortlist.

    Files with unsupported extensions or that fail to parse are skipped with
    a Streamlit warning.  Returns (ranked, shortlisted): lists of
    (file_name, score) pairs, where shortlisted keeps scores >= threshold.
    """
    # Dispatch table: extension -> text extractor over the uploaded file.
    extractors = {
        '.pdf': lambda f: extract_text_from_pdf(f.read()),
        '.docx': lambda f: extract_text_from_docx(f.read()),
        '.txt': lambda f: f.read().decode('utf-8', errors='ignore'),
    }

    cv_texts = []
    cv_file_names = []

    for uploaded in cv_files:
        file_extension = os.path.splitext(uploaded.name)[1].lower()
        try:
            extractor = extractors.get(file_extension)
            if extractor is None:
                st.warning(f"Unsupported file format: {file_extension}. Skipping file: {uploaded.name}")
                continue
            cv_texts.append(clean_pdf_text(extractor(uploaded)))
            cv_file_names.append(uploaded.name)
        except Exception as e:
            st.warning(f"Error processing file '{uploaded.name}': {str(e)}")
            continue

    if not cv_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        return [], []

    ranked_cvs = calculate_similarity(job_description, cv_texts, cv_file_names)
    shortlisted_cvs = [(name, score) for name, score in ranked_cvs if score >= threshold]

    return ranked_cvs, shortlisted_cvs
|
141 |
+
|
142 |
+
|
143 |
+
def main():
    """Streamlit entry point.

    Collects a job description (text area) and resume uploads (file uploader),
    then on Submit displays the full ranking and the shortlist produced by
    rank_and_shortlist.
    """
    st.title("Resume Ranking App")

    st.write("Upload the Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')

    st.write("Upload the Resumes :")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')

    if st.button("Submit"):
        if job_description and cv_files:
            # Rank and shortlist candidates
            ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)

            # Display ranking with larger text
            st.markdown("### Ranking of Resumes:")
            for rank, score in ranked_cvs:
                st.markdown(f"**File Name:** {rank}, **Similarity Score:** {score:.2f}")

            # Display shortlisted candidates with larger text
            st.markdown("### Shortlisted Candidates:")
            if not shortlisted_cvs:  # Check if the shortlisted_cvs list is empty
                st.markdown("None")
            else:
                for rank, score in shortlisted_cvs:
                    st.markdown(f"**File Name:** {rank}, **Similarity Score:** {score:.2f}")
        else:
            st.write("Please upload both the job description and resumes to proceed.")

if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
-
|
|
|
2 |
nltk
|
3 |
-
|
4 |
PyPDF2
|
5 |
-
|
|
|
|
|
|
1 |
+
python-docx
|
2 |
+
PyMuPDF
|
3 |
nltk
|
4 |
+
pdfminer.six
|
5 |
PyPDF2
|
6 |
+
scikit-learn
|
7 |
+
streamlit
|
8 |
+
textract
|