bangaboy committed
Commit 9d80ed8 · verified · 1 Parent(s): dca5336

Update app.py

Files changed (1)
app.py +50 -19
app.py CHANGED
@@ -1,22 +1,15 @@
 import streamlit as st
-from pyngrok import ngrok
 import google.generativeai as genai
 import fitz # PyMuPDF for PDF text extraction
 import spacy
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import AutoModelForSeq2SeqLM
 from docx import Document
 import re
-from nltk.corpus import words
 import dateparser
 from datetime import datetime
 import os
 
-# Replace with your ngrok auth token
-ngrok.set_auth_token("2keP9BS91BCtRFtnf5Ss4tOpzq4_2c6463MYzXPqFM3a95gUM")
-url = ngrok.connect(8501)
-print(f"Public URL: {url}")
-
 # Load SpaCy model
 nlp_spacy = spacy.load('en_core_web_sm')
 
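Note on the removed lines above: the ngrok auth token was committed in plain text and remains in the repository history even after this change, so revoking it on the ngrok dashboard is the safe course. The same hunk also drops the duplicate AutoTokenizer import and the unused nltk.corpus import.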
@@ -29,6 +22,12 @@ nlp_ner = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, aggregation_
 gliner_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-SG/gliner-large")
 gliner_model = AutoModelForSeq2SeqLM.from_pretrained("DAMO-NLP-SG/gliner-large")
 
+def extract_info_with_gliner(text, info_type):
+    input_text = f"Extract {info_type} from: {text}"
+    input_ids = gliner_tokenizer(input_text, return_tensors="pt").input_ids
+    outputs = gliner_model.generate(input_ids, max_length=100)
+    return gliner_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
 class EnhancedNERPipeline:
     def __init__(self, nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer):
         self.nlp_spacy = nlp_spacy
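With extract_info_with_gliner now defined ahead of its first caller, it can be exercised in isolation. A hypothetical smoke test, not part of the commit; it assumes the DAMO-NLP-SG/gliner-large checkpoint emits a comma-separated string, which the .split(', ') calls in the pipeline below also rely on:

# Hypothetical smoke test for the relocated helper.
sample = "Jane Doe worked at Acme Corp and TechNova from 2018 to 2023."
print(extract_info_with_gliner(sample, "company names"))
# e.g. "Acme Corp, TechNova" if the checkpoint behaves as assumed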
@@ -37,24 +36,29 @@ class EnhancedNERPipeline:
         self.gliner_tokenizer = gliner_tokenizer
 
     def __call__(self, text):
+        # SpaCy processing
         doc = self.nlp_spacy(text)
+
+        # Babelscape NER processing
         ner_results = self.nlp_ner(text)
+
+        # GLinER processing
         gliner_companies = extract_info_with_gliner(text, "company names")
         gliner_experience = extract_info_with_gliner(text, "years of experience")
         gliner_education = extract_info_with_gliner(text, "educational institutions")
-
+
+        # Combine results
         combined_entities = doc.ents + tuple(ner_results)
+
+        # Add GLinER results as custom attributes
         doc._.gliner_companies = gliner_companies.split(', ')
         doc._.gliner_experience = gliner_experience
         doc._.gliner_education = gliner_education.split(', ')
+
+        # Update doc.ents with combined results for other entity types
         doc.ents = [ent for ent in combined_entities if ent.label_ not in ["ORG"]]
-        return doc
 
-def extract_info_with_gliner(text, info_type):
-    input_text = f"Extract {info_type} from: {text}"
-    input_ids = gliner_tokenizer(input_text, return_tensors="pt").input_ids
-    outputs = gliner_model.generate(input_ids, max_length=100)
-    return gliner_tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return doc
 
 # Create the enhanced pipeline
 enhanced_nlp = EnhancedNERPipeline(nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer)
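One thing __call__ relies on but this diff never shows: spaCy rejects writes to doc._.gliner_companies, doc._.gliner_experience, and doc._.gliner_education with an AttributeError unless those extensions are registered first, so a registration step presumably sits in the unchanged lines. A minimal sketch of what it would look like:

from spacy.tokens import Doc

# Register the custom attributes used by EnhancedNERPipeline.__call__;
# the has_extension guard keeps repeated Streamlit reruns idempotent.
for attr in ("gliner_companies", "gliner_experience", "gliner_education"):
    if not Doc.has_extension(attr):
        Doc.set_extension(attr, default=None)

Separately, combined_entities mixes spaCy Span objects with the plain dicts returned by the transformers NER pipeline, so the ent.label_ filter and the doc.ents assignment only succeed once those dicts are converted to Spans first (for example via doc.char_span on each result's start and end offsets).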
@@ -74,10 +78,35 @@ def extract_education(doc):
     spacy_babelscape_education = set([ent.text for ent in doc.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in ["university", "college", "institute", "school"])])
     return list(gliner_education.union(spacy_babelscape_education))
 
+def extract_text_from_pdf(file):
+    pdf = fitz.open(stream=file.read(), filetype="pdf")
+    text = ""
+    for page in pdf:
+        text += page.get_text()
+    return text
+
+def extract_text_from_doc(file):
+    doc = Document(file)
+    return " ".join([paragraph.text for paragraph in doc.paragraphs])
+
+def authenticate_gemini(api_key):
+    try:
+        genai.configure(api_key=api_key)
+        model = genai.GenerativeModel('gemini-pro')
+        return model
+    except Exception as e:
+        st.error(f"Authentication failed: {e}")
+        return None
+
+def generate_summary(text, model):
+    prompt = f"Summarize the following resume:\n\n{text}\n\nProvide a brief overview of the candidate's qualifications, experience, and key skills."
+    response = model.generate_content(prompt)
+    return response.text
+
 def main():
     st.title("Enhanced Resume Analyzer with GLinER Focus")
-
-    api_key = st.text_input("Enter your Google Gemini API key", type="password")
+
+    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
     uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])
 
     if uploaded_file is not None and api_key:
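Swapping the st.text_input prompt for os.environ.get means an unset GOOGLE_GEMINI_API_KEY now fails silently: the guard above simply never runs. A possible hardening, not in the commit, that surfaces the misconfiguration instead:

# Sketch: fail loudly when the key is missing rather than doing nothing.
api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
if not api_key:
    st.error("GOOGLE_GEMINI_API_KEY is not set; export it before launching the app.")
    st.stop()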
@@ -94,12 +123,14 @@ def main():
                 st.error("Unsupported file format.")
                 return
 
+            # Process the resume text with the enhanced pipeline
             doc = enhanced_nlp(resume_text)
 
             companies = extract_companies(doc)
             experience = extract_experience(doc)
             education = extract_education(doc)
-
+
+            # Use GLinER for other extractions
             phone = extract_info_with_gliner(resume_text, "phone number")
             email = extract_info_with_gliner(resume_text, "email address")
             linkedin = extract_info_with_gliner(resume_text, "LinkedIn profile")
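These helper calls put the entire resume into the prompt, and a long resume can exceed the seq2seq model's input window. A guard worth considering inside extract_info_with_gliner (an assumption-level sketch; 512 is a placeholder limit, not something the commit specifies):

# Truncate over-length prompts so generation never sees a too-long input.
input_ids = gliner_tokenizer(input_text, return_tensors="pt",
                             truncation=True, max_length=512).input_ids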
@@ -120,4 +151,4 @@ def main():
             st.error(f"Error during processing: {e}")
 
 if __name__ == "__main__":
-    main()
+    main()
 