Alastair Jepps
committed on
Commit
·
0dd7290
0
Parent(s):
Initial
Browse files- .DS_Store +0 -0
- .gitattributes +35 -0
- .gitignore +2 -0
- README.md +6 -0
- environment.yml +19 -0
- index.py +194 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
generations/*
|
2 |
+
.env
|
README.md
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: sd3_gradio
|
3 |
+
app_file: index.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 4.36.1
|
6 |
+
---
|
environment.yml
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: talent_gradio
|
2 |
+
channels:
|
3 |
+
- conda-forge
|
4 |
+
- defaults
|
5 |
+
dependencies:
|
6 |
+
- python-docx
|
7 |
+
- pypdf2
|
8 |
+
- python=3.11
|
9 |
+
- gradio=4.29.0
|
10 |
+
- python-dotenv
|
11 |
+
- pip
|
12 |
+
- pip:
|
13 |
+
- langchain-anthropic
|
14 |
+
- langchain
|
15 |
+
- langsmith
|
16 |
+
- langchainhub
|
17 |
+
- pdfminer.six
|
18 |
+
- pytesseract
|
19 |
+
- pdf2image
|
index.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import time
|
5 |
+
import io
|
6 |
+
from dotenv import load_dotenv
|
7 |
+
from docx import Document
|
8 |
+
import PyPDF2
|
9 |
+
from langchain_anthropic import ChatAnthropic
|
10 |
+
from pdfminer.high_level import extract_text
|
11 |
+
import re
|
12 |
+
from langchain_core.output_parsers import StrOutputParser
|
13 |
+
from langchain_core.prompts import ChatPromptTemplate
|
14 |
+
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
|
15 |
+
from langchain import hub
|
16 |
+
import pytesseract
|
17 |
+
from pdf2image import convert_from_path
|
18 |
+
import io
|
19 |
+
|
20 |
+
|
21 |
+
# Read settings (API key, login credentials) from the .env file into the
# process environment before anything below looks them up.
load_dotenv()

# Chat model shared by every match request.
model = ChatAnthropic(
    model="claude-3-5-sonnet-20240620",
    api_key=os.getenv("ANTHROPIC_API_KEY"),
)

# Prompt pulled once, at import time, and reused by process_match().
hub_prompt = hub.pull("talent_assistant")
|
27 |
+
|
28 |
+
|
29 |
+
def check_password(username, password):
    """Return True only when both credentials match the configured ones.

    The expected values are read from the ``GRADIO_USERNAME`` and
    ``GRADIO_PASSWORD`` environment variables on every call.

    Args:
        username: Username supplied by the client.
        password: Password supplied by the client.

    Returns:
        bool: True if both values match; False otherwise, including when
        either environment variable is unset or an argument is not a string.
    """
    import hmac

    expected_username = os.getenv("GRADIO_USERNAME")
    expected_password = os.getenv("GRADIO_PASSWORD")

    # Fail closed: with a plain ``==`` an unset variable (None) would "match"
    # a None argument and let the caller through.
    if expected_username is None or expected_password is None:
        return False
    if not isinstance(username, str) or not isinstance(password, str):
        return False

    # Compare as bytes so non-ASCII credentials are accepted by
    # compare_digest, and evaluate both comparisons before combining them so
    # the result does not short-circuit on the username.
    username_ok = hmac.compare_digest(
        username.encode("utf-8", "surrogatepass"),
        expected_username.encode("utf-8", "surrogatepass"),
    )
    password_ok = hmac.compare_digest(
        password.encode("utf-8", "surrogatepass"),
        expected_password.encode("utf-8", "surrogatepass"),
    )
    return username_ok and password_ok
|
31 |
+
|
32 |
+
def extract_human_message_template(chat_prompt):
    """Return the prompt of the first human message in a chat prompt.

    Args:
        chat_prompt: Object exposing a ``messages`` iterable.

    Returns:
        The ``prompt`` attribute of the first entry that is a
        ``HumanMessagePromptTemplate``, or None when there is no such entry.
    """
    human_prompts = (
        entry.prompt
        for entry in chat_prompt.messages
        if isinstance(entry, HumanMessagePromptTemplate)
    )
    return next(human_prompts, None)
|
37 |
+
|
38 |
+
def clean_bullet_points(text):
    """Rewrite stray line-leading markers as bullet characters.

    Applied in order, on every line of ``text``:
      * a leading ``e`` followed by whitespace becomes a bullet,
      * a leading ``eo`` followed by whitespace becomes a nested bullet,
      * a leading ``+`` followed by whitespace becomes a bullet.

    Args:
        text: Text to clean (presumably OCR output, where bullet glyphs were
            misread as letters -- confirm against the OCR results).

    Returns:
        str: The text with the markers replaced.
    """
    substitutions = (
        (r'(?m)^e\s', '• '),
        (r'(?m)^eo\s', ' ◦ '),
        (r'(?m)^\+\s', '• '),
    )
    for pattern, bullet in substitutions:
        text = re.sub(pattern, bullet, text)
    return text
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
def pdf_to_text_ocr(file_path):
    """Extract text from a PDF by converting its pages to images and running OCR.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: The OCR text of all pages with hyphenated line breaks joined,
        runs of spaces and blank lines collapsed, every character outside
        printable ASCII and newline removed, surrounding whitespace stripped
        and bullet markers normalised by ``clean_bullet_points``.
    """
    # Convert the PDF to a list of page images
    page_images = convert_from_path(file_path)

    page_texts = []
    for page_image in page_images:
        # OCR runs directly on the image object returned above
        raw_text = pytesseract.image_to_string(page_image, config='--psm 6')

        # Round-trip through UTF-8 to drop anything that cannot be encoded.
        # errors='ignore' never raises, so no fallback encoding is needed.
        page_texts.append(raw_text.encode('utf-8', errors='ignore').decode('utf-8'))

    # Every page, including the last one, is followed by a blank line
    text = "".join(page_text + "\n\n" for page_text in page_texts)

    # Re-join words that were hyphenated across a line break
    text = text.replace('-\n', '')

    # Clean up whitespace while preserving line breaks
    text = re.sub(r' +', ' ', text)  # multiple spaces -> single space
    text = re.sub(r'\n{3,}', '\n\n', text)  # 3 or more newlines -> 2

    # Keep only printable ASCII and newlines (this also removes tabs and all
    # non-ASCII characters such as accented letters)
    text = re.sub(r'[^\x20-\x7E\n]', '', text)

    return clean_bullet_points(text.strip())
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
def process_match(*args):
    """Run the hub prompt over a CV and a job description with the chat model.

    Args:
        *args: Positional values from the UI; ``args[0]`` is the CV text and
            ``args[1]`` is the job description text.

    Returns:
        str: The model output after ``StrOutputParser``.

    Raises:
        IndexError: If fewer than two positional arguments are given.
        ValueError: If the hub prompt contains no human message template.
    """
    # Unpack first so a wrong call fails before any model request is made.
    cv_text, job_description_text = args[0], args[1]

    prompt = extract_human_message_template(hub_prompt)
    if prompt is None:
        # Previously this fell through and tried to pipe None into the model.
        raise ValueError("hub prompt has no human message template to run")

    # Turn the double-braced placeholders into single-braced ones so they match
    # the keys passed to invoke() below. This edits the template object held
    # by the shared hub_prompt, so it is a no-op on every call after the first.
    prompt.template = prompt.template.replace('{{CV}}', '{CV}')
    prompt.template = prompt.template.replace('{{JOB_DESCRIPTION}}', '{JOB_DESCRIPTION}')

    chain = prompt | model | StrOutputParser()
    return chain.invoke({"JOB_DESCRIPTION": job_description_text, "CV": cv_text})
|
99 |
+
|
100 |
+
def pdf_to_text_miner(file_path):
    """Extract the text layer of a PDF and tidy its line breaks.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: The extracted text with runs of blank lines collapsed, all-caps
        headings that end in a colon placed on their own line, and
        leading/trailing whitespace removed.
    """
    # Extract text
    text = extract_text(file_path)

    # Collapse runs of blank or whitespace-only lines into one blank line
    text = re.sub(r'\n\s*\n', '\n\n', text)

    # Put all-caps headings such as "WORK EXPERIENCE:" on a line of their own.
    # The pattern only matches capitals (optionally several space-separated
    # words) at the start of a line and directly followed by the colon; the
    # previous pattern consumed, and so deleted, every character between any
    # capital letter and the next colon.
    text = re.sub(r'(?m)^[ \t]*([A-Z]+(?: [A-Z]+)*)[ \t]*:', r'\n\1:\n', text)

    # Remove any leading/trailing whitespace
    return text.strip()
|
115 |
+
|
116 |
+
def pdf_to_text(file_path):
    """Extract the text of every page of a PDF with PyPDF2.

    Args:
        file_path: Path to the PDF file.

    Returns:
        str: The text of each page followed by a newline, concatenated in
        page order.
    """
    with open(file_path, "rb") as file:
        # PdfReader / .pages replace PdfFileReader / getNumPages / getPage,
        # which current PyPDF2 releases no longer provide.
        reader = PyPDF2.PdfReader(file)
        # Pages are read inside the with block, as before, so the file is
        # still open; a page with no text contributes just the newline.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
|
123 |
+
|
124 |
+
|
125 |
+
def create_app():
    """Build the Gradio UI for matching a CV against a job description.

    Returns:
        The assembled ``gr.Blocks`` application; it is not launched here.
    """
    # NOTE(review): the nesting of the layout below was reconstructed from a
    # copy of this file that had lost its indentation -- confirm it against the
    # intended layout, in particular where the "Generate" button sits.
    with gr.Blocks() as app:
        gr.Markdown("# Kingmakers Talent Prototype")

        active_tab = gr.State("CV/JD Match")

        def file_process(file):
            """Return the text of an uploaded file for display in a textbox.

            PDFs go through OCR; any other file is read as text.
            """
            # No path is available when the upload has been cleared; leave
            # the textbox as it is instead of failing on None.
            if not file:
                return gr.update()

            # Case-insensitive so ".PDF" uploads are also sent to OCR.
            if file.lower().endswith('.pdf'):
                return pdf_to_text_ocr(file)

            # NOTE(review): ".doc" is accepted by the upload widgets but has
            # no dedicated reader, so it is read as plain text here too.
            with open(file, 'r') as handle:
                return handle.read()

        # Not referenced by any event handler in this function.
        def update_active_tab(tab_name):
            return tab_name

        with gr.Tabs() as generation_mode_tabs:
            with gr.TabItem("Generate"):
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Tabs() as mode_tabs:
                            with gr.TabItem("CV/JD Match") as text_to_image_tab:
                                jd = gr.Textbox(label="Job Description")
                                jd_file = gr.File(label=".pdf, .doc or .txt", file_types=[".pdf", ".doc", ".txt"])
                                jd_file.change(fn=file_process, inputs=jd_file, outputs=jd)

                                cv = gr.Textbox(label="CV")
                                cv_file = gr.File(label=".pdf, .doc or .txt", file_types=[".pdf", ".doc", ".txt"])
                                cv_file.change(fn=file_process, inputs=cv_file, outputs=cv)

                        generate_btn = gr.Button("Generate")

                    with gr.Column(scale=1):
                        score = gr.Textbox(label="Score")
                        # No click handler is attached to this button yet.
                        save_btn = gr.Button("Send to Greenhouse")

        # process_match receives the CV first, then the job description.
        generate_btn.click(
            fn=process_match,
            inputs=[cv, jd],
            outputs=[score],
        )

    return app
|
190 |
+
|
191 |
+
|
192 |
+
if __name__ == "__main__":
    # Script entry point: build the UI and launch it with debug=True.
    # Neither an auth callback nor a share link is configured here;
    # check_password is defined above but is not passed to launch().
    app = create_app()
    app.launch(debug=True)
|