astro21 committed on
Commit
71843ed
·
verified ·
1 Parent(s): 13aa461

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +44 -0
  2. ResumeStructure.py +16 -0
  3. app-g.py +53 -0
  4. app.py +72 -0
  5. packages.txt +31 -0
  6. prompt_template.py +75 -0
  7. requirements.txt +12 -0
  8. utils.py +147 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.10

# Set the working directory in the container
WORKDIR /code

# Copy the dependency manifests first so Docker can cache the install layers
COPY ./requirements.txt /code/requirements.txt
COPY ./packages.txt /code/packages.txt

# Install the system packages listed in packages.txt.
# --no-install-recommends and the apt list cleanup keep the image smaller.
RUN apt-get update \
    && apt-get install -y --no-install-recommends sudo $(cat packages.txt) \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (fastapi/uvicorn merged into the same layer
# instead of two extra RUN layers)
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt \
    && pip install --no-cache-dir fastapi "uvicorn[standard]"

# Writable directories for temporary files and caches
ENV TEMP_FILES /temp
ENV FILES /temp_files
ENV CACHE /.cache

# Create the directories and open permissions — presumably because the app
# runs as a non-root user at runtime (TODO confirm deployment user).
RUN mkdir -p $TEMP_FILES $FILES $CACHE \
    && chmod -R 777 $TEMP_FILES $FILES $CACHE

# Copy the rest of the application code into the container
COPY . .

# Expose the port that the FastAPI application will run on
EXPOSE 7860

# Command to run the FastAPI application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
ResumeStructure.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Union
2
+ from langchain_core.pydantic_v1 import BaseModel, Field
3
+
4
+
5
class ResumeStructure(BaseModel):
    """Pydantic schema describing the JSON layout of a parsed resume.

    Used as the ``pydantic_object`` of a ``JsonOutputParser`` (see
    ``utils.generate_json_structured_resume``), so the ``description`` text of
    every field is surfaced to the LLM as formatting instructions.
    """

    # NOTE: field descriptions double as LLM formatting hints — keep wording stable.
    education: List[Dict[str, str]] = Field(description="List of dictionaries containing 'university' and 'CGPA'")
    work: List[Dict[str, Union[str, List[str]]]] = Field(description="List of dictionaries containing "
                                                                     "'organization', 'location', 'position', "
                                                                     "'duration', 'standardized_job_title', "
                                                                     "and 'predicted_skills'")
    projects: List[Dict[str, Union[str, List[str]]]] = Field(description="List of dictionaries containing "
                                                                         "'project_name', 'start_date', 'end_date', "
                                                                         "'description', and 'predicted_skills'")
    skills: Dict[str, List[str]] = Field(description="Dictionary containing all 'Technical Skills' and 'Non Technical Skills'")
    techstack: List[str] = Field(description="List of all technologies used in projects")
    career_trajectory: str = Field(description="String representing the career progression of the candidate")
app-g.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain_community.chat_models import ChatOpenAI
3
+ from utils import process_file_with_dedoc, extract_text_from_all_levels, generate_formatted_resume, generate_json_structured_resume
4
+
5
ALLOWED_EXTENSIONS = {"jpg", "jpeg", "png", "docx", "pdf", "html", "doc"}


def allowed_file(filename):
    """Return True when *filename* has an extension in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
9
+
10
def parse_resume(file_info, status):
    """Gradio callback: parse one uploaded resume into text and JSON forms.

    Args:
        file_info: a (file_path, file) pair, unpacked below.
            NOTE(review): current gr.File components yield a single tempfile
            value, not a tuple — confirm against the Gradio version in use.
        status: progress reporter exposing an ``update(msg)`` method.

    Returns:
        (formatted_resume_text, structured_json_resume, filename), or an
        error message in the first slot when the extension is disallowed.
    """
    file_path, file = file_info
    filename = file_path.split("/")[-1]  # Extract the file name

    if not allowed_file(filename):
        return "Invalid file type. Allowed file types are: jpg, jpeg, png, docx, pdf, html, doc", None, filename

    # NOTE(review): the status messages below literally end with "(unknown)" —
    # this looks like a lost placeholder (e.g. the filename); confirm intent.
    status.update(f"Processing: (unknown)")

    # Create instances of the chat model
    chat_llm_text = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)
    chat_llm_json = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)

    # Read and process the file
    # NOTE(review): process_file_with_dedoc and extract_text_from_all_levels
    # are declared async in utils.py — calling them without await returns
    # coroutine objects; confirm synchronous variants are intended here.
    text = process_file_with_dedoc(file)  # Ensure this is synchronous or adapted for async in Gradio
    status.update(f"Extracting text from: (unknown)")
    text_f = extract_text_from_all_levels(text)  # Ensure this is synchronous or adapted for async in Gradio

    # Generate parsed resume and parsed JSON resume
    status.update(f"Generating formatted resume for: (unknown)")
    parsed_resume = generate_formatted_resume(text_f, chat_llm_text)
    status.update(f"Generating structured JSON resume for: (unknown)")
    parsed_json_resume = generate_json_structured_resume(text_f, chat_llm_json)

    return parsed_resume, parsed_json_resume, filename
35
+
36
# Define the Gradio interface
# NOTE(review): gr.StatusTracker is not part of the current Gradio public API —
# confirm the pinned Gradio version, or drop the tracker input and the matching
# `status` parameter of parse_resume.
demo = gr.Interface(
    fn=parse_resume,
    inputs=[
        gr.File(label="Upload your resume"),
        gr.StatusTracker()
    ],
    outputs=[
        gr.Textbox(label="Formatted Resume"),
        gr.JSON(label="Structured JSON Resume"),
        gr.Textbox(label="File Name", lines=1)
    ],
    title="Resume Parser",
    description="Upload a resume to parse it into formatted text and structured JSON."
)

if __name__ == "__main__":
    # share=True exposes a temporary public URL via Gradio's tunnel service.
    demo.launch(share=True)
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from typing import List
5
+ from langchain_community.chat_models import ChatOpenAI
6
+ from utils import process_file_with_dedoc, extract_text_from_all_levels, generate_formatted_resume, \
7
+ generate_json_structured_resume
8
+ import shutil
9
+ import os
10
+
11
app = FastAPI()

# CORS is wide open: any origin, method, and header is accepted.
# NOTE(review): allow_credentials=True combined with allow_origins=["*"] is
# rejected by browsers under the CORS spec — confirm whether credentialed
# requests are actually needed, otherwise drop one of the two.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
20
+
21
ALLOWED_EXTENSIONS = {"jpg", "jpeg", "png", "docx", "pdf", "html", "doc"}


def allowed_file(filename):
    """True when *filename* carries an extension from ALLOWED_EXTENSIONS."""
    stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
26
+
27
+
28
# Maximum attempts at coercing the LLM output into valid structured JSON.
_MAX_JSON_ATTEMPTS = 3


@app.post("/parse_resume/")
async def parse_resume(files: List[UploadFile] = File(...)):
    """Parse one or more uploaded resumes.

    Each file is run through Dedoc for text extraction and then through two
    LLM chains: one producing a human-readable formatted resume and one
    producing structured JSON.

    Returns:
        A list of {file_name, parsed_resume, parsed_json_resume} dicts, or a
        400 JSONResponse as soon as a file with a disallowed extension is
        encountered (results gathered before it are discarded — this matches
        the pre-existing API behavior).
    """
    parsed_resumes = []

    for uploaded_file in files:
        # Guard clause: reject the whole request on the first bad extension.
        if not allowed_file(uploaded_file.filename):
            return JSONResponse(status_code=400, content={
                "message": "Invalid file type. Allowed file types are: jpg, jpeg, png, docx, pdf, html, doc"})

        # Fresh model instances per file (temperature 0 for determinism).
        chat_llm_text = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)
        chat_llm_json = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)

        # Extract raw text from the uploaded document via Dedoc.
        text = await process_file_with_dedoc(uploaded_file)
        text_f = await extract_text_from_all_levels(text)

        parsed_resume = generate_formatted_resume(text_f, chat_llm_text)

        # BUG FIX: the original looped `while parsed_json_resume is None`,
        # which never terminates if the chain keeps returning None. Retry a
        # bounded number of times instead.
        parsed_json_resume = None
        for _ in range(_MAX_JSON_ATTEMPTS):
            parsed_json_resume = generate_json_structured_resume(text_f, chat_llm_json)
            if parsed_json_resume is not None:
                break

        parsed_resumes.append({
            "file_name": uploaded_file.filename,
            "parsed_resume": parsed_resume,
            "parsed_json_resume": parsed_json_resume,
        })

    return parsed_resumes
67
+
68
+
69
if __name__ == "__main__":
    # Local development entry point; inside the container, uvicorn is started
    # by the Dockerfile CMD (on port 7860) instead of this block.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
packages.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ binutils-dev
2
+ build-essential
3
+ ca-certificates
4
+ clang
5
+ g++
6
+ g++-multilib
7
+ gcc-multilib
8
+ libcairo2
9
+ libffi-dev
10
+ libgdk-pixbuf2.0-0
11
+ libglib2.0-dev
12
+ libjpeg-dev
13
+ libleptonica-dev
14
+ libpango-1.0-0
15
+ libpango1.0-dev
16
+ libpangocairo-1.0-0
17
+ libpng-dev
18
+ libsm6
19
+ libtesseract-dev
20
+ libtool
21
+ libxext6
22
+ make
23
+ pkg-config
24
+ poppler-utils
25
+ shared-mime-info
26
+ software-properties-common
27
+ swig
28
+ zlib1g-dev
29
+ tesseract-ocr
30
+ tesseract-ocr-rus
31
+ libgl1
prompt_template.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt for the plain-text (human-readable) resume summary chain.
# Fixes over the original: removed a duplicated "Standardized Job Title" line
# and corrected typos ("there" -> "their", "acsending" -> "ascending", a stray
# parenthesis) that degraded the instructions sent to the model.
template = """
For the following text, extract the following information:

Warning: Don't greet or write any introduction. Just start with the answer to the prompts. Do as per the instructions given in the prompt. If you don't know the answer, leave that part (keep blank) and move to the next part.

1. Education: Extract the names of all universities/colleges attended by the candidate with their CGPA.


2. Work: Extract all organization names where he/she has worked along with the position held, and the duration of employment.
Predicted Skills : Also extract skills based on the work experience.
Standardized Job Title: Identify the standardized job title for each work experience.

3. Projects: Extract the details of the projects the candidate has worked on.
Predicted Skills : Also extract skills based on each project.


4.Skills: Identify the technical and non-technical skills associated with each work experience and project.


5.Career Trajectory: Identify the career progression of the candidate based on their work experience.

Output them in the following format:
Warning: if there is no data for any of the fields, leave it blank.

"Education: " and separate multiple entries with new line .

"Work: " Organization Name, Location, Position, Start Date - End Date 'and separate multiple entries with a comma.
"Job Title: " Identify the job title for each work experience. Clean and strip them off suffixes, prefixes and seniority.

" Predicted Skills : " and separate multiple entries with a comma for each work experience.
Note: Separate each work experience with a new line.
Warning: Don't print this text - "Organization Name, Location, Position, Start Date - End Date" as it is in the output .


"Project Name, Start Date - End Date, Project Description " and separate multiple entries with a comma and a new line for each project.
" Predicted Skills : " and separate multiple entries with a comma for each project.
Note: Project Description should be in 30 to 40 words

Note: Separate each project with a new line.
Warning: Don't print "Project Name, Start Date - End Date, Project Description" as it is (text) in the output .

"Skills: " Skills under the skills section.
Classify them as technical and non-technical skills if possible.

"Career Trajectory: " and separate multiple entries with a -> . Career Trajectory should be in ascending order with respect to date of joining.
eg1 : "Data Analyst -> Data Scientist -> Senior Data Scientist"
eg2 : "School Name -> College Name -> University Name -> Job Title -> Job Title"

Resume: {text}

"""
+
54
+ template_format_instructions = """
55
+ 1. Education: Extract the name of all universities/colleges attended by the candidate with their CGPA. Separate multiple entries with a new line.
56
+ 2. Work: Extract all organization names where he/she has worked along with the position held and the duration of employment.
57
+ Predicted Skills: Also extract skills based on the work experience.
58
+ Standardized Job Title: Identify the standardized job title for each work experience.
59
+ Organization Name, Location, Position, Start Date - End Date. Separate multiple entries with a comma.
60
+ "Job Title:" Identify the job title for each work experience. Clean and strip them of suffixes, prefixes, and seniority.
61
+ 3. Projects: Extract the details of the projects the candidate has worked on.
62
+ "Project Name, Start Date - End Date, Project Description". Separate multiple entries with a comma and a new line for each project.
63
+ "Predicted Skills:" Separate multiple entries with a comma for each project.
64
+ Note: Project Description should be 30 to 40 words.
65
+ Note: Separate each project with a new line.
66
+ 4. Skills: Identify the technical and non-technical skills associated with each work experience and project.
67
+ 5. Techstack: Identify the technologies used from skills and predicted skills.
68
+ 6. Career Trajectory: Identify the career progression of the candidate based on their work experience.
69
+ Separate multiple entries with a "->". Career Trajectory should be in ascending order with respect to the date of joining.
70
+ eg1: "Data Analyst -> Data Scientist -> Senior Data Scientist"
71
+ eg2: "School Name -> College Name -> University Name -> Job Title -> Job Title"
72
+ Warning: Ensure consistent extraction of skills by providing clear context and examples in the resume. Use standardized terms for skills.
73
+ Resume: {text}
74
+ \n{format_instructions}\n
75
+ """
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ torch
3
+ dedoc
4
+ streamlit
5
+ tesseract
6
+ pytesseract
7
+ langchain-openai
8
+ unstructured
9
+ unstructured[pdf]
10
+ opencv-python
11
+ faiss-cpu
12
+ langchain-community
utils.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from dedoc import DedocManager
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.prompts import PromptTemplate
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ from ResumeStructure import ResumeStructure
8
+ from fastapi import UploadFile
9
+ from prompt_template import template_format_instructions, template
10
+ from typing import List
11
+
12
+
13
+
14
+ # Create a directory to store temporary files
15
+ TEMP_DIR = "/temp_files"
16
+ # if not os.path.exists(TEMP_DIR):
17
+ # os.makedirs(TEMP_DIR)
18
+
19
+
20
async def process_file_with_dedoc(file: UploadFile):
    """
    Process the file using Dedoc and return the output data.

    Args:
    - file: The UploadedFile object to be processed.

    Returns:
    - Output data (dict from Dedoc's API schema) if the file is processed
      successfully, None if the file format is unsupported.
    """
    manager = DedocManager()

    supported_formats = ['jpg', 'jpeg', 'png', 'docx', 'pdf', 'html', 'doc']

    print(f"Processing file '{file.filename}'...")

    # Validate the extension BEFORE writing to disk — the original saved the
    # upload first and leaked the temp file on the unsupported-format path.
    _, file_extension = os.path.splitext(file.filename)
    file_extension = file_extension[1:].lower()  # drop the leading dot

    if file_extension not in supported_formats:
        print(f"Cannot process file '{file.filename}'. Unsupported file format.")
        return None

    # Save the upload to a temporary location for Dedoc to read.
    # exist_ok guards against the directory missing outside the container.
    os.makedirs(TEMP_DIR, exist_ok=True)
    file_path = os.path.join(TEMP_DIR, file.filename)
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        # Parse with Dedoc and serialize its structured output.
        output = manager.parse(file_path)
        return output.to_api_schema().model_dump()
    finally:
        # Always remove the temp file, even if parsing raises.
        os.remove(file_path)
59
+
60
+
61
async def extract_text_from_all_levels(data):
    """
    Extract text from all levels of subparagraphs in the JSON data.

    Args:
    - data: The JSON data containing subparagraphs.

    Returns:
    - A string containing the text from all levels of subparagraphs
      (empty when the structure has no 'subparagraphs' key).
    """
    structure = data['content']['structure']
    if 'subparagraphs' not in structure:
        return ""
    return await extract_text_from_subparagraphs(structure['subparagraphs'])
77
+
78
+
79
async def extract_text_from_subparagraphs(subparagraphs):
    """
    Recursively extract text from subparagraphs.

    Args:
    - subparagraphs: A list of subparagraph dicts, each with a 'text' key and
      optionally a nested 'subparagraphs' list.

    Returns:
    - A string containing the text from all subparagraphs, one line per node.
    """
    chunks = []
    for node in subparagraphs:
        chunks.append(node['text'] + "\n")
        # Depth-first descent into any nested structure.
        if 'subparagraphs' in node:
            chunks.append(await extract_text_from_subparagraphs(node['subparagraphs']))
    return "".join(chunks)
95
+
96
+
97
def generate_formatted_resume(resume, chat_llm):
    """Run the plain-text formatting prompt over *resume*.

    Returns the LLM response content (a string) produced by piping the
    `template` prompt into *chat_llm*.
    """
    formatting_prompt = PromptTemplate(
        template=template,
        input_variables=["text"],
    )
    pipeline = formatting_prompt | chat_llm
    response = pipeline.invoke({"text": resume})
    return response.content
107
+
108
+
109
def generate_json_structured_resume(resume, chat_llm):
    """Run the JSON-structured prompt over *resume*.

    The LLM output is parsed by a JsonOutputParser bound to the
    ResumeStructure schema, and the parsed result is returned.
    """
    json_parser = JsonOutputParser(pydantic_object=ResumeStructure)

    structured_prompt = PromptTemplate(
        template=template_format_instructions,
        input_variables=["text"],
        partial_variables={"format_instructions": json_parser.get_format_instructions()},
    )
    pipeline = structured_prompt | chat_llm | json_parser
    return pipeline.invoke({"text": resume})
122
+
123
+
124
def delete_files_in_directory(directory):
    """
    Deletes all files in the specified directory.

    Args:
        directory (str): The path to the directory containing files to be deleted.

    Returns:
        None
    """
    # Nothing to do for a missing directory — report and bail out.
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return

    # Remove plain files only; subdirectories are left untouched.
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            print(f"Deleted file: {entry_path}")