Jeet Paul committed on
Commit
2bb2e2d
·
1 Parent(s): e3fd35d

Upload 2 files

Browse files
Files changed (2) hide show
  1. nrp.py +173 -0
  2. requirements.txt +6 -3
nrp.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+ from nltk.tokenize import word_tokenize
5
+ from nltk.stem import PorterStemmer
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from PyPDF2 import PdfReader
9
+ import os
10
+ from io import BytesIO
11
+ import pickle
12
+ import pdfminer
13
+ from pdfminer.high_level import extract_text
14
+ import re
15
+ import PyPDF2
16
+ import docx
17
+ import textract
18
+
19
# Fetch the NLTK data that preprocess_text relies on. 'punkt_tab' is requested
# alongside 'punkt' because recent NLTK releases load the word tokenizer's
# models from 'punkt_tab' and raise LookupError when only 'punkt' is present.
# quiet=True keeps the log clean, since Streamlit re-executes this script on
# every user interaction.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
21
+
22
def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised, English stop words are dropped,
    and each remaining token is replaced by its Porter stem.
    """
    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    return ' '.join(
        porter.stem(token)
        for token in tokens
        if token not in english_stop_words
    )
32
+
33
+ import textract
34
+ import tempfile
35
+
36
+ import fitz # PyMuPDF
37
+
38
def extract_text_from_pdf(pdf_content):
    """Return the text of every page of an in-memory PDF, in page order.

    Args:
        pdf_content: The PDF data handed to PyMuPDF as a stream (the caller
            in this file passes the bytes read from an uploaded file).

    Returns:
        A single string with the text of all pages concatenated.
    """
    # The context manager closes the document even when a page fails to
    # parse; the previous explicit close() was skipped on any exception.
    with fitz.open(stream=pdf_content, filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)
46
+
47
+ from docx import Document
48
+
49
def extract_text_from_docx(docx_content):
    """Return the paragraph text of an in-memory .docx file as one string.

    The content is wrapped in a BytesIO buffer so that Document can read it
    like a file; the paragraph texts are joined with single spaces.
    """
    buffer = BytesIO(docx_content)
    paragraphs = Document(buffer).paragraphs
    return " ".join([paragraph.text for paragraph in paragraphs])
53
+
54
+
55
def extract_text_from_txt(txt_content):
    """Decode the contents of a plain-text resume into a string.

    The previous implementation called textract.process() with keyword
    arguments it does not define (and without its required filename), so it
    could not succeed. Decoding directly mirrors how rank_and_shortlist
    handles '.txt' uploads.

    Args:
        txt_content: The file contents, as bytes (a str is returned as-is).

    Returns:
        The text decoded as UTF-8; undecodable bytes are dropped.
    """
    if isinstance(txt_content, str):
        return txt_content
    return txt_content.decode('utf-8', errors='ignore')
58
+
59
+
60
def extract_text_from_resume(file_path):
    """Extract the text of a resume stored on disk.

    The PDF and DOCX helpers in this file operate on file *contents*, not on
    paths, so the file is read here and its bytes are handed to them
    (previously the path string itself was passed, which those helpers
    cannot parse).

    Args:
        file_path: Path to a '.pdf', '.docx' or '.txt' file (the extension is
            matched case-insensitively).

    Returns:
        The extracted text.

    Raises:
        ValueError: If the file extension is not one of the supported formats.
        OSError: If the file cannot be opened or read.
    """
    file_extension = os.path.splitext(file_path)[1].lower().lstrip('.')

    # Reject unsupported formats before touching the file system.
    if file_extension not in ('pdf', 'docx', 'txt'):
        raise ValueError(f"Unsupported file format: {file_extension}")

    with open(file_path, 'rb') as resume_file:
        content = resume_file.read()

    if file_extension == 'pdf':
        return extract_text_from_pdf(content)
    if file_extension == 'docx':
        return extract_text_from_docx(content)
    # Plain text: decode the same way rank_and_shortlist does for uploads.
    return content.decode('utf-8', errors='ignore')
71
+
72
def clean_pdf_text(text):
    """Strip URLs, mentions, hashtags, punctuation and non-ASCII from text.

    The substitutions run in order: URLs, the standalone tokens 'RT' and
    'cc', '#'-prefixed tags, '@'-prefixed handles, ASCII punctuation,
    non-ASCII characters, and finally runs of whitespace (collapsed to one
    space).

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not trimmed.
    """
    text = re.sub(r'http\S+\s*', ' ', text)
    # Word boundaries matter: without them 'cc' was also removed from the
    # middle of ordinary words such as "accounting" or "success".
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
81
+
82
+ def extract_candidate_name(text):
83
+ pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
84
+ match = re.search(pattern, text)
85
+ if match:
86
+ return match.group(0)
87
+ return "Candidate Name Not Found"
88
+
89
def calculate_similarity(job_description, cvs, cv_file_names):
    """Score each CV against the job description using TF-IDF cosine similarity.

    Args:
        job_description: The job description text.
        cvs: The CV texts, one string per CV.
        cv_file_names: The names paired positionally with cvs.

    Returns:
        A list of (file_name, score) pairs sorted by score, highest first.
        An empty list is returned when there are no CVs.
    """
    if not cvs:
        return []

    processed_job_desc = preprocess_text(job_description)
    processed_cvs = [preprocess_text(cv) for cv in cvs]

    # The job description is row 0 of the matrix; the CVs follow in order.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([processed_job_desc] + processed_cvs)

    # Compare only the job-description row with the CV rows rather than
    # building the full all-pairs matrix and discarding everything but row 0.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
105
+
106
def rank_and_shortlist(job_description, cv_files, threshold=0.15):
    """Extract text from uploaded CVs, rank them, and shortlist the best.

    Each file is dispatched on its lower-cased extension ('.pdf', '.docx',
    '.txt'). Unsupported or failing files are reported with st.warning and
    skipped. When no file yields text, an error is shown and two empty lists
    are returned.

    Returns:
        (ranked_cvs, shortlisted_cvs): both are lists of (file_name, score)
        pairs in ranking order; the second keeps only scores >= threshold.
    """
    readers = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.txt': lambda raw: raw.decode('utf-8', errors='ignore'),
    }

    collected_texts = []
    collected_names = []

    for cv_file in cv_files:
        file_extension = os.path.splitext(cv_file.name)[1].lower()

        try:
            reader = readers.get(file_extension)
            if reader is None:
                st.warning(f"Unsupported file format: {file_extension}. Skipping file: {cv_file.name}")
                continue

            raw_text = reader(cv_file.read())
            collected_texts.append(clean_pdf_text(raw_text))
            collected_names.append(cv_file.name)
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            st.warning(f"Error processing file '{cv_file.name}': {str(e)}")
            continue

    if not collected_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        return [], []

    ranked_cvs = list(calculate_similarity(job_description, collected_texts, collected_names))
    shortlisted_cvs = [entry for entry in ranked_cvs if entry[1] >= threshold]

    return ranked_cvs, shortlisted_cvs
141
+
142
+
143
def main():
    """Render the Streamlit page: collect inputs, then show the rankings.

    Nothing is computed until the "Submit" button is pressed. If either the
    job description or the uploads are missing, a prompt is shown instead of
    results.
    """
    st.title("Resume Ranking App")

    st.write("Upload the Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')

    st.write("Upload the Resumes :")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')

    if not st.button("Submit"):
        return

    if not (job_description and cv_files):
        st.write("Please upload both the job description and resumes to proceed.")
        return

    ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)

    def show_scores(entries):
        # Each entry is a (file name, similarity score) pair.
        for rank, score in entries:
            st.markdown(f"**File Name:** {rank}, **Similarity Score:** {score:.2f}")

    st.markdown("### Ranking of Resumes:")
    show_scores(ranked_cvs)

    st.markdown("### Shortlisted Candidates:")
    if shortlisted_cvs:
        show_scores(shortlisted_cvs)
    else:
        st.markdown("None")
171
+
172
+ if __name__ == "__main__":
173
+ main()
requirements.txt CHANGED
@@ -1,5 +1,8 @@
1
- streamlit
 
2
  nltk
3
- scikit-learn
4
  PyPDF2
5
- pdfminer.six
 
 
 
1
+ python-docx
2
+ PyMuPDF
3
  nltk
4
+ pdfminer.six
5
  PyPDF2
6
+ scikit-learn
7
+ streamlit
8
+ textract