Alastair Jepps committed on
Commit
0dd7290
·
0 Parent(s):
Files changed (6) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +35 -0
  3. .gitignore +2 -0
  4. README.md +6 -0
  5. environment.yml +19 -0
  6. index.py +194 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ generations/*
2
+ .env
README.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ---
2
+ title: sd3_gradio
3
+ app_file: index.py
4
+ sdk: gradio
5
+ sdk_version: 4.36.1
6
+ ---
environment.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: talent_gradio
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - python-docx
7
+ - pypdf2
8
+ - python=3.11
9
+ - gradio=4.29.0
10
+ - python-dotenv
11
+ - pip
12
+ - pip:
13
+ - langchain-anthropic
14
+ - langchain
15
+ - langsmith
16
+ - langchainhub
17
+ - pdfminer.six
18
+ - pytesseract
19
+ - pdf2image
index.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import time
5
+ import io
6
+ from dotenv import load_dotenv
7
+ from docx import Document
8
+ import PyPDF2
9
+ from langchain_anthropic import ChatAnthropic
10
+ from pdfminer.high_level import extract_text
11
+ import re
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain_core.prompts import ChatPromptTemplate
14
+ from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
15
+ from langchain import hub
16
+ import pytesseract
17
+ from pdf2image import convert_from_path
18
+ import io
19
+
20
+
21
# Load environment variables from .env file.
# This runs before the os.getenv() lookup below so values defined in .env are visible to it.
load_dotenv()


# Chat model client shared by every request; the API key is read from ANTHROPIC_API_KEY.
model = ChatAnthropic(model="claude-3-5-sonnet-20240620", api_key=os.getenv("ANTHROPIC_API_KEY"))
# Prompt registered as "talent_assistant", pulled from the LangChain hub once, at import time.
hub_prompt = hub.pull("talent_assistant")
27
+
28
+
29
def check_password(username, password):
    """Return True when the supplied login matches the configured credentials.

    The expected values are read from the ``GRADIO_USERNAME`` and
    ``GRADIO_PASSWORD`` environment variables on every call.

    Args:
        username: Login name submitted by the user.
        password: Password submitted by the user.

    Returns:
        bool: True only if both values match the configured credentials.
        Always False when either environment variable is unset or when a
        submitted value is not a string, so a missing configuration can never
        authenticate anyone (previously ``None == None`` counted as a match).
    """
    import hmac  # function-scope import keeps this block self-contained

    expected_username = os.getenv("GRADIO_USERNAME")
    expected_password = os.getenv("GRADIO_PASSWORD")
    if expected_username is None or expected_password is None:
        return False
    if not isinstance(username, str) or not isinstance(password, str):
        return False

    def _as_bytes(value):
        # compare_digest only accepts ASCII-only str, so compare bytes instead;
        # "surrogatepass" keeps the encode step from raising on odd input.
        return value.encode("utf-8", "surrogatepass")

    # Constant-time comparison so response timing does not reveal how much of
    # a guess was correct. Both comparisons are evaluated before combining so
    # neither one is short-circuited away.
    username_ok = hmac.compare_digest(_as_bytes(username), _as_bytes(expected_username))
    password_ok = hmac.compare_digest(_as_bytes(password), _as_bytes(expected_password))
    return username_ok and password_ok
31
+
32
def extract_human_message_template(chat_prompt):
    """Return the prompt of the first human message in ``chat_prompt``.

    Walks ``chat_prompt.messages`` in order and returns the ``prompt``
    attribute of the first entry that is a ``HumanMessagePromptTemplate``.
    Returns ``None`` when no entry qualifies.
    """
    human_prompts = (
        entry.prompt
        for entry in chat_prompt.messages
        if isinstance(entry, HumanMessagePromptTemplate)
    )
    # next() with a default stops at the first match, exactly like an early return.
    return next(human_prompts, None)
37
+
38
def clean_bullet_points(text):
    """Rewrite list markers at the start of lines as bullet glyphs.

    Three line-start markers are rewritten, in this order:

    * ``e`` followed by whitespace   -> top-level bullet
    * ``eo`` followed by whitespace  -> nested bullet
    * ``+`` followed by whitespace   -> top-level bullet

    Args:
        text: Text to clean up.

    Returns:
        str: ``text`` with the markers above replaced.
    """
    # (pattern, replacement) pairs; every pattern is anchored to a line start
    # via the inline multiline flag.
    marker_rewrites = (
        (r'(?m)^e\s', '• '),
        (r'(?m)^eo\s', ' ◦ '),
        (r'(?m)^\+\s', '• '),
    )
    for pattern, bullet in marker_rewrites:
        text = re.sub(pattern, bullet, text)
    return text
46
+
47
+
48
+
49
def pdf_to_text_ocr(file_path):
    """Extract text from a PDF by converting its pages to images and running OCR.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        str: The OCR text of all pages. Line-end hyphenation is joined, runs
        of spaces are collapsed, at most one blank line is kept in a row,
        everything outside printable ASCII and newline is dropped, and list
        markers are then normalised by ``clean_bullet_points``.
    """
    # Convert the PDF to a list of page images.
    images = convert_from_path(file_path)

    page_texts = []
    for image in images:
        # OCR runs directly on the image object; '--psm 6' is handed to
        # Tesseract as its page segmentation mode.
        page_text = pytesseract.image_to_string(image, config='--psm 6')

        # Round-trip through UTF-8 to drop anything that cannot be encoded.
        # With errors='ignore' this never raises, so the former
        # `except UnicodeEncodeError` fallback was unreachable and is gone.
        page_text = page_text.encode('utf-8', errors='ignore').decode('utf-8')

        # Each page is followed by a blank line to separate it from the next.
        page_texts.append(page_text + "\n\n")

    # Join once instead of growing a string page by page.
    text = "".join(page_texts)

    # Re-join words that were hyphenated across a line break.
    text = text.replace('-\n', '')

    # Clean up whitespace while preserving line breaks.
    text = re.sub(r' +', ' ', text)  # multiple spaces -> single space
    text = re.sub(r'\n{3,}', '\n\n', text)  # 3+ newlines -> 2

    # Keep only printable ASCII and newlines.
    text = re.sub(r'[^\x20-\x7E\n]', '', text)

    text = text.strip()
    return clean_bullet_points(text)
82
+
83
+
84
+
85
def process_match(*args):
    """Ask the model to assess a CV against a job description.

    Args:
        *args: Positional values supplied by the Gradio click handler.
            ``args[0]`` is the CV text and ``args[1]`` is the job-description
            text; any further values are ignored.

    Returns:
        str: The model's reply as plain text.

    Raises:
        ValueError: If the hub prompt contains no human message template.
            Previously the guard only skipped the placeholder rewrite and the
            chain was still built from the missing prompt, failing later with
            an unrelated error.
    """
    cv_text, job_description_text = args[0], args[1]

    # hub_prompt is only read here, so no `global` declaration is needed.
    prompt = extract_human_message_template(hub_prompt)
    if prompt is None:
        raise ValueError("The 'talent_assistant' hub prompt has no human message template")

    # Rewrite the double-brace placeholders to the single-brace form used by
    # the keys passed to invoke() below. This edits the shared prompt object
    # in place; repeating it on later calls is a no-op.
    prompt.template = prompt.template.replace('{{CV}}', '{CV}')
    prompt.template = prompt.template.replace('{{JOB_DESCRIPTION}}', '{JOB_DESCRIPTION}')

    chain = prompt | model | StrOutputParser()
    return chain.invoke({"JOB_DESCRIPTION": job_description_text, "CV": cv_text})
99
+
100
def pdf_to_text_miner(file_path):
    """Extract a PDF's text with pdfminer's ``extract_text`` and tidy line breaks.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        str: The extracted text after the two substitutions below, with
        leading and trailing whitespace removed.
    """
    raw_text = extract_text(file_path)

    # Squeeze any whitespace-only gap between two newlines down to one empty line.
    compacted = re.sub(r'\n\s*\n', '\n\n', raw_text)

    # Put section headings on their own line. The pattern matches from a run of
    # capital letters through the next colon and keeps only the capitals.
    # NOTE(review): any text between those capitals and the colon is discarded
    # by this substitution -- confirm that is intended.
    sectioned = re.sub(r'([A-Z]+)(\n|.)*?:', r'\n\1:\n', compacted)

    return sectioned.strip()
115
+
116
def pdf_to_text(file_path):
    """Extract the text of every page of a PDF with PyPDF2.

    Uses ``PdfReader`` and its ``pages`` sequence. The older
    ``PdfFileReader`` / ``getNumPages`` / ``getPage`` names were removed in
    PyPDF2 3.0 and fail there.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        str: The text of each page in order, each followed by a newline.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page yielding no text, so the
        # concatenation cannot fail on a non-string value.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
123
+
124
+
125
def create_app():
    """Build the Gradio Blocks UI for matching a CV against a job description.

    The page offers a textbox plus file upload for the job description and
    for the CV, a "Generate" button that sends both texts to
    ``process_match``, and a "Score" textbox that shows the reply.

    Returns:
        gr.Blocks: The assembled application; the caller launches it.
    """
    with gr.Blocks() as app:
        gr.Markdown("# Kingmakers Talent Prototype")

        active_tab = gr.State("CV/JD Match")

        def file_process(file):
            """Return the text of an uploaded file for its paired textbox.

            Assumes ``file`` is a filesystem path string, which the original
            ``.endswith`` call already relied on -- TODO confirm against the
            installed Gradio version.
            """
            # The change event also fires when an upload is cleared; leave the
            # textbox untouched instead of failing on a missing path.
            if not file:
                return gr.update()

            # Case-insensitive so "CV.PDF" takes the OCR route as well.
            if file.lower().endswith('.pdf'):
                return pdf_to_text_ocr(file)

            # NOTE(review): ".doc" uploads are accepted by the widgets below
            # but are read here as plain text -- there is no Word parsing yet.
            with open(file, 'r', encoding='utf-8') as handle:
                return handle.read()

        def update_active_tab(tab_name):
            """Return ``tab_name`` unchanged (not wired to any event yet)."""
            return tab_name

        with gr.Tabs() as generation_mode_tabs:
            with gr.TabItem("Generate"):
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Tabs() as mode_tabs:
                            with gr.TabItem("CV/JD Match") as text_to_image_tab:
                                jd = gr.Textbox(label="Job Description")
                                jd_file = gr.File(label=".pdf, .doc or .txt", file_types=[".pdf", ".doc", ".txt"])
                                jd_file.change(fn=file_process, inputs=jd_file, outputs=jd)

                                cv = gr.Textbox(label="CV")
                                cv_file = gr.File(label=".pdf, .doc or .txt", file_types=[".pdf", ".doc", ".txt"])
                                cv_file.change(fn=file_process, inputs=cv_file, outputs=cv)

                                generate_btn = gr.Button("Generate")

                    with gr.Column(scale=1):
                        score = gr.Textbox(label="Score")
                        # No click handler is attached to this button yet.
                        save_btn = gr.Button("Send to Greenhouse")

        # process_match receives the values in this order: CV first, then JD.
        generate_btn.click(
            fn=process_match,
            inputs=[
                cv, jd
            ],
            outputs=[score]
        )

    return app
190
+
191
+
192
if __name__ == "__main__":
    app = create_app()
    # Only debug=True is passed. Authentication is currently off (check_password
    # is not supplied as auth=) and share=True is not set, so no public link is requested.
    app.launch(debug=True)