Alastair Jepps
committed on
Commit
·
2ac1ad7
1
Parent(s):
d12863a
type hinting
Browse files
- environment.yml +0 -2
- index.py +33 -57
environment.yml
CHANGED
@@ -4,7 +4,6 @@ channels:
|
|
4 |
- defaults
|
5 |
dependencies:
|
6 |
- python-docx
|
7 |
-
- pypdf2
|
8 |
- python=3.11
|
9 |
- gradio=4.29.0
|
10 |
- python-dotenv
|
@@ -14,6 +13,5 @@ dependencies:
|
|
14 |
- langchain
|
15 |
- langsmith
|
16 |
- langchainhub
|
17 |
-
- pdfminer.six
|
18 |
- pytesseract
|
19 |
- pdf2image
|
|
|
4 |
- defaults
|
5 |
dependencies:
|
6 |
- python-docx
|
|
|
7 |
- python=3.11
|
8 |
- gradio=4.29.0
|
9 |
- python-dotenv
|
|
|
13 |
- langchain
|
14 |
- langsmith
|
15 |
- langchainhub
|
|
|
16 |
- pytesseract
|
17 |
- pdf2image
|
index.py
CHANGED
@@ -3,19 +3,17 @@ import os
|
|
3 |
import json
|
4 |
import time
|
5 |
import io
|
|
|
6 |
from dotenv import load_dotenv
|
7 |
from docx import Document
|
8 |
-
import PyPDF2
|
9 |
from langchain_anthropic import ChatAnthropic
|
10 |
from pdfminer.high_level import extract_text
|
11 |
import re
|
12 |
from langchain_core.output_parsers import StrOutputParser
|
13 |
-
from langchain_core.prompts import ChatPromptTemplate
|
14 |
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
15 |
from langchain import hub
|
16 |
import pytesseract
|
17 |
from pdf2image import convert_from_path
|
18 |
-
import io
|
19 |
import logging
|
20 |
|
21 |
# Set up logging
|
@@ -25,17 +23,17 @@ logging.basicConfig(level=logging.ERROR)
|
|
25 |
load_dotenv()
|
26 |
|
27 |
try:
|
28 |
-
model = ChatAnthropic(model="claude-3-5-sonnet-20240620",
|
29 |
-
hub_prompt = hub.pull("talent_assistant")
|
30 |
except Exception as e:
|
31 |
logging.error(f"Error initializing ChatAnthropic or pulling hub prompt: {str(e)}")
|
32 |
model = None
|
33 |
hub_prompt = None
|
34 |
|
35 |
-
def check_password(username, password):
|
36 |
return username == os.getenv("GRADIO_USERNAME") and password == os.getenv("GRADIO_PASSWORD")
|
37 |
|
38 |
-
def extract_human_message_template(chat_prompt):
|
39 |
try:
|
40 |
for message in chat_prompt.messages:
|
41 |
if isinstance(message, HumanMessagePromptTemplate):
|
@@ -44,7 +42,8 @@ def extract_human_message_template(chat_prompt):
|
|
44 |
logging.error(f"Error extracting human message template: {str(e)}")
|
45 |
return None
|
46 |
|
47 |
-
|
|
|
48 |
try:
|
49 |
text = re.sub(r'(?m)^e\s', '• ', text)
|
50 |
text = re.sub(r'(?m)^eo\s', ' ◦ ', text)
|
@@ -53,12 +52,12 @@ def clean_bullet_points(text):
|
|
53 |
logging.error(f"Error cleaning bullet points: {str(e)}")
|
54 |
return text
|
55 |
|
56 |
-
def pdf_to_text_ocr(file_path):
|
57 |
try:
|
58 |
-
images = convert_from_path(file_path)
|
59 |
-
text = ""
|
60 |
for image in images:
|
61 |
-
page_text = pytesseract.image_to_string(image, config='--psm 6')
|
62 |
try:
|
63 |
page_text = page_text.encode('utf-8', errors='ignore').decode('utf-8')
|
64 |
except UnicodeEncodeError:
|
@@ -76,64 +75,41 @@ def pdf_to_text_ocr(file_path):
|
|
76 |
text = ""
|
77 |
return text
|
78 |
|
79 |
-
def process_questions(*args):
|
80 |
return "hubba hubba hubba"
|
81 |
|
82 |
-
def process_match(*args):
|
83 |
try:
|
84 |
global hub_prompt
|
85 |
-
prompt = extract_human_message_template(hub_prompt)
|
86 |
if prompt:
|
87 |
prompt.template = prompt.template.replace('{{CV}}', '{CV}')
|
88 |
prompt.template = prompt.template.replace('{{JOB_DESCRIPTION}}', '{JOB_DESCRIPTION}')
|
89 |
|
90 |
chain = prompt | model | StrOutputParser()
|
91 |
-
response = chain.invoke({"JOB_DESCRIPTION": args[1], "CV": args[0]})
|
92 |
except Exception as e:
|
93 |
logging.error(f"Error in process_match: {str(e)}")
|
94 |
response = "An error occurred while processing the match."
|
95 |
return response
|
96 |
|
97 |
-
def
|
98 |
-
try:
|
99 |
-
text = extract_text(file_path)
|
100 |
-
text = re.sub(r'\n\s*\n', '\n\n', text)
|
101 |
-
text = re.sub(r'([A-Z]+)(\n|.)*?:', r'\n\1:\n', text)
|
102 |
-
text = text.strip()
|
103 |
-
except Exception as e:
|
104 |
-
logging.error(f"Error in pdf_to_text_miner: {str(e)}")
|
105 |
-
text = ""
|
106 |
-
return text
|
107 |
-
|
108 |
-
def pdf_to_text(file_path):
|
109 |
-
try:
|
110 |
-
text = ""
|
111 |
-
with open(file_path, "rb") as file:
|
112 |
-
reader = PyPDF2.PdfFileReader(file)
|
113 |
-
for page in range(reader.getNumPages()):
|
114 |
-
text += reader.getPage(page).extract_text() + "\n"
|
115 |
-
except Exception as e:
|
116 |
-
logging.error(f"Error in pdf_to_text: {str(e)}")
|
117 |
-
text = ""
|
118 |
-
return text
|
119 |
-
|
120 |
-
def wrapper_function(cv, jd):
|
121 |
try:
|
122 |
-
score = process_match(cv, jd)
|
123 |
-
questions = process_questions(cv, jd)
|
124 |
except Exception as e:
|
125 |
logging.error(f"Error in wrapper_function: {str(e)}")
|
126 |
score = "An error occurred while processing the match."
|
127 |
questions = "An error occurred while generating questions."
|
128 |
return score, questions
|
129 |
|
130 |
-
def create_app():
|
131 |
with gr.Blocks() as app:
|
132 |
gr.Markdown("# Kingmakers Talent Prototype")
|
133 |
|
134 |
-
active_tab = gr.State("CV/JD Match")
|
135 |
|
136 |
-
def file_process(file):
|
137 |
try:
|
138 |
if file.endswith('.pdf'):
|
139 |
return pdf_to_text_ocr(file)
|
@@ -143,7 +119,7 @@ def create_app():
|
|
143 |
logging.error(f"Error in file_process: {str(e)}")
|
144 |
return "An error occurred while processing the file."
|
145 |
|
146 |
-
def update_active_tab(tab_name):
|
147 |
return tab_name
|
148 |
|
149 |
with gr.Tabs() as generation_mode_tabs:
|
@@ -152,20 +128,20 @@ def create_app():
|
|
152 |
with gr.Column(scale=1):
|
153 |
with gr.Tabs() as mode_tabs:
|
154 |
with gr.TabItem("CV/JD Match") as text_to_image_tab:
|
155 |
-
jd = gr.Textbox(label="Job Description")
|
156 |
-
jd_file = gr.File(label=".pdf, .doc or .txt" , file_types=[".pdf", ".doc", ".txt"])
|
157 |
-
jd_file.change(fn=file_process, inputs=jd_file,outputs=jd)
|
158 |
|
159 |
-
cv = gr.Textbox(label="CV")
|
160 |
-
cv_file = gr.File(label=".pdf, .doc or .txt" , file_types=[".pdf", ".doc", ".txt"])
|
161 |
-
cv_file.change(fn=file_process,inputs=cv_file,outputs=cv)
|
162 |
|
163 |
-
generate_btn = gr.Button("Generate")
|
164 |
|
165 |
with gr.Column(scale=1):
|
166 |
-
score = gr.Textbox(label="Score")
|
167 |
-
questions = gr.Textbox(label="Questions")
|
168 |
-
save_btn = gr.Button("Send to Greenhouse")
|
169 |
|
170 |
generate_btn.click(
|
171 |
fn=wrapper_function,
|
@@ -177,7 +153,7 @@ def create_app():
|
|
177 |
|
178 |
if __name__ == "__main__":
|
179 |
try:
|
180 |
-
app = create_app()
|
181 |
app.launch(debug=True) # auth=check_password Added share=True to create a public link
|
182 |
except Exception as e:
|
183 |
logging.error(f"Error launching the app: {str(e)}")
|
|
|
3 |
import json
|
4 |
import time
|
5 |
import io
|
6 |
+
from typing import Tuple, Optional, List
|
7 |
from dotenv import load_dotenv
|
8 |
from docx import Document
|
|
|
9 |
from langchain_anthropic import ChatAnthropic
|
10 |
from pdfminer.high_level import extract_text
|
11 |
import re
|
12 |
from langchain_core.output_parsers import StrOutputParser
|
|
|
13 |
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
14 |
from langchain import hub
|
15 |
import pytesseract
|
16 |
from pdf2image import convert_from_path
|
|
|
17 |
import logging
|
18 |
|
19 |
# Set up logging
|
|
|
23 |
load_dotenv()
|
24 |
|
25 |
try:
|
26 |
+
model: Optional[ChatAnthropic] = ChatAnthropic(model="claude-3-5-sonnet-20240620", api_key=os.getenv("ANTHROPIC_API_KEY"))
|
27 |
+
hub_prompt: Optional[ChatPromptTemplate] = hub.pull("talent_assistant")
|
28 |
except Exception as e:
|
29 |
logging.error(f"Error initializing ChatAnthropic or pulling hub prompt: {str(e)}")
|
30 |
model = None
|
31 |
hub_prompt = None
|
32 |
|
33 |
+
def check_password(username: str, password: str) -> bool:
|
34 |
return username == os.getenv("GRADIO_USERNAME") and password == os.getenv("GRADIO_PASSWORD")
|
35 |
|
36 |
+
def extract_human_message_template(chat_prompt: ChatPromptTemplate) -> Optional[HumanMessagePromptTemplate]:
|
37 |
try:
|
38 |
for message in chat_prompt.messages:
|
39 |
if isinstance(message, HumanMessagePromptTemplate):
|
|
|
42 |
logging.error(f"Error extracting human message template: {str(e)}")
|
43 |
return None
|
44 |
|
45 |
+
|
46 |
+
def clean_bullet_points(text: str) -> str:
|
47 |
try:
|
48 |
text = re.sub(r'(?m)^e\s', '• ', text)
|
49 |
text = re.sub(r'(?m)^eo\s', ' ◦ ', text)
|
|
|
52 |
logging.error(f"Error cleaning bullet points: {str(e)}")
|
53 |
return text
|
54 |
|
55 |
+
def pdf_to_text_ocr(file_path: str) -> str:
|
56 |
try:
|
57 |
+
images: List[Image.Image] = convert_from_path(file_path)
|
58 |
+
text: str = ""
|
59 |
for image in images:
|
60 |
+
page_text: str = pytesseract.image_to_string(image, config='--psm 6')
|
61 |
try:
|
62 |
page_text = page_text.encode('utf-8', errors='ignore').decode('utf-8')
|
63 |
except UnicodeEncodeError:
|
|
|
75 |
text = ""
|
76 |
return text
|
77 |
|
78 |
+
def process_questions(*args: str) -> str:
|
79 |
return "hubba hubba hubba"
|
80 |
|
81 |
+
def process_match(*args: str) -> str:
|
82 |
try:
|
83 |
global hub_prompt
|
84 |
+
prompt: Optional[HumanMessagePromptTemplate] = extract_human_message_template(hub_prompt)
|
85 |
if prompt:
|
86 |
prompt.template = prompt.template.replace('{{CV}}', '{CV}')
|
87 |
prompt.template = prompt.template.replace('{{JOB_DESCRIPTION}}', '{JOB_DESCRIPTION}')
|
88 |
|
89 |
chain = prompt | model | StrOutputParser()
|
90 |
+
response: str = chain.invoke({"JOB_DESCRIPTION": args[1], "CV": args[0]})
|
91 |
except Exception as e:
|
92 |
logging.error(f"Error in process_match: {str(e)}")
|
93 |
response = "An error occurred while processing the match."
|
94 |
return response
|
95 |
|
96 |
+
def wrapper_function(cv: str, jd: str) -> Tuple[str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
try:
|
98 |
+
score: str = process_match(cv, jd)
|
99 |
+
questions: str = process_questions(cv, jd)
|
100 |
except Exception as e:
|
101 |
logging.error(f"Error in wrapper_function: {str(e)}")
|
102 |
score = "An error occurred while processing the match."
|
103 |
questions = "An error occurred while generating questions."
|
104 |
return score, questions
|
105 |
|
106 |
+
def create_app() -> gr.Blocks:
|
107 |
with gr.Blocks() as app:
|
108 |
gr.Markdown("# Kingmakers Talent Prototype")
|
109 |
|
110 |
+
active_tab: gr.State = gr.State("CV/JD Match")
|
111 |
|
112 |
+
def file_process(file: str) -> str:
|
113 |
try:
|
114 |
if file.endswith('.pdf'):
|
115 |
return pdf_to_text_ocr(file)
|
|
|
119 |
logging.error(f"Error in file_process: {str(e)}")
|
120 |
return "An error occurred while processing the file."
|
121 |
|
122 |
+
def update_active_tab(tab_name: str) -> str:
|
123 |
return tab_name
|
124 |
|
125 |
with gr.Tabs() as generation_mode_tabs:
|
|
|
128 |
with gr.Column(scale=1):
|
129 |
with gr.Tabs() as mode_tabs:
|
130 |
with gr.TabItem("CV/JD Match") as text_to_image_tab:
|
131 |
+
jd: gr.Textbox = gr.Textbox(label="Job Description")
|
132 |
+
jd_file: gr.File = gr.File(label=".pdf, .doc or .txt" , file_types=[".pdf", ".doc", ".txt"])
|
133 |
+
jd_file.change(fn=file_process, inputs=jd_file, outputs=jd)
|
134 |
|
135 |
+
cv: gr.Textbox = gr.Textbox(label="CV")
|
136 |
+
cv_file: gr.File = gr.File(label=".pdf, .doc or .txt" , file_types=[".pdf", ".doc", ".txt"])
|
137 |
+
cv_file.change(fn=file_process, inputs=cv_file, outputs=cv)
|
138 |
|
139 |
+
generate_btn: gr.Button = gr.Button("Generate")
|
140 |
|
141 |
with gr.Column(scale=1):
|
142 |
+
score: gr.Textbox = gr.Textbox(label="Score")
|
143 |
+
questions: gr.Textbox = gr.Textbox(label="Questions")
|
144 |
+
save_btn: gr.Button = gr.Button("Send to Greenhouse")
|
145 |
|
146 |
generate_btn.click(
|
147 |
fn=wrapper_function,
|
|
|
153 |
|
154 |
if __name__ == "__main__":
|
155 |
try:
|
156 |
+
app: gr.Blocks = create_app()
|
157 |
app.launch(debug=True) # auth=check_password Added share=True to create a public link
|
158 |
except Exception as e:
|
159 |
logging.error(f"Error launching the app: {str(e)}")
|