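"""Streamlit resume analyzer.

Extracts contact details, companies, colleges, and years of experience from
PDF/DOCX resumes using spaCy and a multilingual NER model, and generates a
short summary with the Google Gemini API.
"""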
import google.generativeai as genai
import fitz  # PyMuPDF for PDF text extraction
import streamlit as st
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from docx import Document
import re
import os
from typing import List, Dict, Optional
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load SpaCy model for dependency parsing and NER
nlp_spacy = spacy.load('en_core_web_sm')

# Load the NER model
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
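# aggregation_strategy="simple" merges subword tokens into whole entity
# spans, so each result carries an 'entity_group' key (used in extract_orgs).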
def authenticate_gemini() -> Optional[genai.GenerativeModel]:
    # Read the key from the environment instead of hardcoding it; the
    # variable name "GOOGLE_API_KEY" is an assumption -- match it to the
    # secret name configured in the Hugging Face Spaces settings.
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
        return None
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-pro")
        st.success("Gemini API successfully configured.")
        return model
    except Exception as e:
        logger.error(f"Error configuring Gemini API: {e}")
        st.error("Error configuring Gemini API. Please check your API key and try again.")
        return None
def refine_org_entities(entities: List[str]) -> List[str]:
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.', 'Company', 'Group']
    for entity in entities:
        # Remove common prefixes that might interfere with company names
        entity = re.sub(r'^(The|A|An)\s+', '', entity).strip()
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        elif re.fullmatch(r'(?:[A-Z][a-z]+\s?)+', entity):
            # fullmatch so the entire entity must be capitalized words;
            # re.match would accept anything whose first word is capitalized.
            refined_entities.add(entity)
    return list(refined_entities)
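# Illustrative: refine_org_entities(["The Acme Corp", "now"]) -> ["Acme Corp"]
# ("The " is stripped, the Corp suffix qualifies it; "now" matches neither rule).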
def extract_orgs(text: str) -> List[str]:
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(orgs)
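# Illustrative only -- actual output depends on the NER model, e.g.
# extract_orgs("Software engineer at Google LLC") might yield ["Google LLC"].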
def extract_text_from_pdf(pdf_file) -> str:
    try:
        # The context manager ensures the document handle is closed.
        with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return ""
def extract_text_from_doc(doc_file) -> str:
    try:
        doc = Document(doc_file)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return ""
def generate_summary(text: str, model: genai.GenerativeModel) -> str:
    prompt = f"Summarize the following resume in 100 words, highlighting key skills and experiences:\n\n{text}"
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        return "Error generating summary. Please try again."
def extract_experience(text: str) -> str:
    # One combined pattern: "X years" optionally followed by "and Y months".
    # Matching bare years and the combined form with separate patterns would
    # double-count spans like "2 years and 6 months".
    experience_pattern = r'(\d+)\s*(?:years?|yrs?)(?:\s*(?:and\s*)?(\d+)\s*(?:months?|mos?))?'
    total_years = 0
    for years, _months in re.findall(experience_pattern, text, re.IGNORECASE):
        total_years += int(years)  # months are intentionally ignored
    if total_years > 0:
        return f"{total_years} years"
    return "Experience not found"
def extract_phone(text: str) -> str:
    # (?<!\d) keeps the match from starting inside a longer digit run; a
    # leading \b would fail before an opening parenthesis like "(555) ...".
    phone_pattern = r'(?<!\d)(?:\+?1[-.\s]?)?(?:\(\d{3}\)\s?|\d{3}[-.\s]?)\d{3}[-.\s]?\d{4}\b'
    match = re.search(phone_pattern, text)
    return match.group() if match else "Not found"
def extract_email(text: str) -> str:
    # [A-Za-z]{2,} for the TLD; the original [A-Z|a-z] also matched a literal '|'.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
def extract_colleges(doc) -> List[str]:
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)
def extract_linkedin(text: str) -> str:
    # [A-Za-z0-9_-] instead of [A-z0-9_-]: the A-z range accidentally
    # includes the punctuation characters between 'Z' and 'a'.
    linkedin_patterns = [
        r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?',
        r'linkedin\.com\/in\/[A-Za-z0-9_-]+',
        r'@[A-Za-z0-9_-]+\s+\(LinkedIn\)'
    ]
    for pattern in linkedin_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group()
    return "Not found"
def analyze_resume(text: str, model: genai.GenerativeModel) -> Dict:
    doc = nlp_spacy(text)
    return {
        "companies": extract_orgs(text),
        "summary": generate_summary(text, model),
        "experience": extract_experience(text),
        "phone": extract_phone(text),
        "email": extract_email(text),
        "colleges": extract_colleges(doc),
        "linkedin": extract_linkedin(text)
    }
def main():
    st.title("Enhanced Resume Analyzer")
    st.write("Upload a resume to extract information, generate a summary, and analyze details.")
    model = authenticate_gemini()
    if model is None:
        return
    # python-docx cannot read legacy .doc files, so only PDF and DOCX are accepted.
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])
    if uploaded_file is not None:
        try:
            file_ext = uploaded_file.name.split('.')[-1].lower()
            if file_ext == 'pdf':
                resume_text = extract_text_from_pdf(uploaded_file)
            elif file_ext == 'docx':
                resume_text = extract_text_from_doc(uploaded_file)
            else:
                st.error("Unsupported file format.")
                return
            if not resume_text.strip():
                st.error("The resume appears to be empty or couldn't be read.")
                return
            with st.spinner("Analyzing resume..."):
                results = analyze_resume(resume_text, model)
            st.subheader("Extracted Information")
            st.write(f"Experience: {results['experience']}")
            st.write("Companies Worked For:")
            st.write(", ".join(results['companies']))
            st.write(f"Phone Number: {results['phone']}")
            st.write(f"Email ID: {results['email']}")
            st.write("Colleges Attended:")
            st.write(", ".join(results['colleges']))
            st.write(f"LinkedIn: {results['linkedin']}")
            st.subheader("Generated Summary")
            st.write(results['summary'])
        except Exception as e:
            logger.error(f"Error during resume analysis: {e}")
            st.error("An error occurred during resume analysis. Please try again or contact support if the issue persists.")

if __name__ == "__main__":
    main()
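# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py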