astro21 committed on
Commit
71843ed
·
verified ·
1 Parent(s): 13aa461

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile +44 -0
  2. ResumeStructure.py +16 -0
  3. app-g.py +53 -0
  4. app.py +72 -0
  5. packages.txt +31 -0
  6. prompt_template.py +75 -0
  7. requirements.txt +12 -0
  8. utils.py +147 -0
Dockerfile ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.10

# Set the working directory in the container
WORKDIR /code

# Copy the dependency manifests first so Docker can cache the install layers
COPY ./requirements.txt /code/requirements.txt
COPY ./packages.txt /code/packages.txt

# Install the system packages listed in packages.txt.
# --no-install-recommends and the apt list cleanup keep the image smaller.
RUN apt-get update \
    && apt-get install -y --no-install-recommends sudo $(cat packages.txt) \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (fastapi/uvicorn merged into the same layer
# instead of two extra RUN layers)
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt \
    && pip install --no-cache-dir fastapi "uvicorn[standard]"

# Writable directories for temporary files and caches
ENV TEMP_FILES /temp
ENV FILES /temp_files
ENV CACHE /.cache

# Create the directories and open permissions — presumably because the app
# runs as a non-root user at runtime (TODO confirm deployment user).
RUN mkdir -p $TEMP_FILES $FILES $CACHE \
    && chmod -R 777 $TEMP_FILES $FILES $CACHE

# Copy the rest of the application code into the container
COPY . .

# Expose the port that the FastAPI application will run on
EXPOSE 7860

# Command to run the FastAPI application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
ResumeStructure.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Union
2
+ from langchain_core.pydantic_v1 import BaseModel, Field
3
+
4
+
5
class ResumeStructure(BaseModel):
    """Pydantic schema describing the JSON layout of a parsed resume.

    Used as the ``pydantic_object`` of a ``JsonOutputParser`` (see
    ``utils.generate_json_structured_resume``), so the ``description`` text of
    every field is surfaced to the LLM as formatting instructions.
    """

    # NOTE: field descriptions double as LLM formatting hints — keep wording stable.
    education: List[Dict[str, str]] = Field(description="List of dictionaries containing 'university' and 'CGPA'")
    work: List[Dict[str, Union[str, List[str]]]] = Field(description="List of dictionaries containing "
                                                                     "'organization', 'location', 'position', "
                                                                     "'duration', 'standardized_job_title', "
                                                                     "and 'predicted_skills'")
    projects: List[Dict[str, Union[str, List[str]]]] = Field(description="List of dictionaries containing "
                                                                         "'project_name', 'start_date', 'end_date', "
                                                                         "'description', and 'predicted_skills'")
    skills: Dict[str, List[str]] = Field(description="Dictionary containing all 'Technical Skills' and 'Non Technical Skills'")
    techstack: List[str] = Field(description="List of all technologies used in projects")
    career_trajectory: str = Field(description="String representing the career progression of the candidate")
app-g.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from langchain_community.chat_models import ChatOpenAI
3
+ from utils import process_file_with_dedoc, extract_text_from_all_levels, generate_formatted_resume, generate_json_structured_resume
4
+
5
ALLOWED_EXTENSIONS = {"jpg", "jpeg", "png", "docx", "pdf", "html", "doc"}


def allowed_file(filename):
    """Return True when *filename* has an extension in ALLOWED_EXTENSIONS."""
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
9
+
10
def parse_resume(file_info, status):
    """Gradio callback: parse one uploaded resume into text and JSON forms.

    Args:
        file_info: a (file_path, file) pair, unpacked below.
            NOTE(review): current gr.File components yield a single tempfile
            value, not a tuple — confirm against the Gradio version in use.
        status: progress reporter exposing an ``update(msg)`` method.

    Returns:
        (formatted_resume_text, structured_json_resume, filename), or an
        error message in the first slot when the extension is disallowed.
    """
    file_path, file = file_info
    filename = file_path.split("/")[-1]  # Extract the file name

    if not allowed_file(filename):
        return "Invalid file type. Allowed file types are: jpg, jpeg, png, docx, pdf, html, doc", None, filename

    # NOTE(review): the status messages below literally end with "(unknown)" —
    # this looks like a lost placeholder (e.g. the filename); confirm intent.
    status.update(f"Processing: (unknown)")

    # Create instances of the chat model
    chat_llm_text = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)
    chat_llm_json = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)

    # Read and process the file
    # NOTE(review): process_file_with_dedoc and extract_text_from_all_levels
    # are declared async in utils.py — calling them without await returns
    # coroutine objects; confirm synchronous variants are intended here.
    text = process_file_with_dedoc(file)  # Ensure this is synchronous or adapted for async in Gradio
    status.update(f"Extracting text from: (unknown)")
    text_f = extract_text_from_all_levels(text)  # Ensure this is synchronous or adapted for async in Gradio

    # Generate parsed resume and parsed JSON resume
    status.update(f"Generating formatted resume for: (unknown)")
    parsed_resume = generate_formatted_resume(text_f, chat_llm_text)
    status.update(f"Generating structured JSON resume for: (unknown)")
    parsed_json_resume = generate_json_structured_resume(text_f, chat_llm_json)

    return parsed_resume, parsed_json_resume, filename
35
+
36
# Define the Gradio interface
# NOTE(review): gr.StatusTracker is not part of the current Gradio public API —
# confirm the pinned Gradio version, or drop the tracker input and the matching
# `status` parameter of parse_resume.
demo = gr.Interface(
    fn=parse_resume,
    inputs=[
        gr.File(label="Upload your resume"),
        gr.StatusTracker()
    ],
    outputs=[
        gr.Textbox(label="Formatted Resume"),
        gr.JSON(label="Structured JSON Resume"),
        gr.Textbox(label="File Name", lines=1)
    ],
    title="Resume Parser",
    description="Upload a resume to parse it into formatted text and structured JSON."
)

if __name__ == "__main__":
    # share=True exposes a temporary public URL via Gradio's tunnel service.
    demo.launch(share=True)
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from typing import List
5
+ from langchain_community.chat_models import ChatOpenAI
6
+ from utils import process_file_with_dedoc, extract_text_from_all_levels, generate_formatted_resume, \
7
+ generate_json_structured_resume
8
+ import shutil
9
+ import os
10
+
11
app = FastAPI()

# CORS is wide open: any origin, method, and header is accepted.
# NOTE(review): allow_credentials=True combined with allow_origins=["*"] is
# rejected by browsers under the CORS spec — confirm whether credentialed
# requests are actually needed, otherwise drop one of the two.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
20
+
21
ALLOWED_EXTENSIONS = {"jpg", "jpeg", "png", "docx", "pdf", "html", "doc"}


def allowed_file(filename):
    """True when *filename* carries an extension from ALLOWED_EXTENSIONS."""
    stem, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
26
+
27
+
28
# Maximum attempts at coercing the LLM output into valid structured JSON.
_MAX_JSON_ATTEMPTS = 3


@app.post("/parse_resume/")
async def parse_resume(files: List[UploadFile] = File(...)):
    """Parse one or more uploaded resumes.

    Each file is run through Dedoc for text extraction and then through two
    LLM chains: one producing a human-readable formatted resume and one
    producing structured JSON.

    Returns:
        A list of {file_name, parsed_resume, parsed_json_resume} dicts, or a
        400 JSONResponse as soon as a file with a disallowed extension is
        encountered (results gathered before it are discarded — this matches
        the pre-existing API behavior).
    """
    parsed_resumes = []

    for uploaded_file in files:
        # Guard clause: reject the whole request on the first bad extension.
        if not allowed_file(uploaded_file.filename):
            return JSONResponse(status_code=400, content={
                "message": "Invalid file type. Allowed file types are: jpg, jpeg, png, docx, pdf, html, doc"})

        # Fresh model instances per file (temperature 0 for determinism).
        chat_llm_text = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)
        chat_llm_json = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.0)

        # Extract raw text from the uploaded document via Dedoc.
        text = await process_file_with_dedoc(uploaded_file)
        text_f = await extract_text_from_all_levels(text)

        parsed_resume = generate_formatted_resume(text_f, chat_llm_text)

        # BUG FIX: the original looped `while parsed_json_resume is None`,
        # which never terminates if the chain keeps returning None. Retry a
        # bounded number of times instead.
        parsed_json_resume = None
        for _ in range(_MAX_JSON_ATTEMPTS):
            parsed_json_resume = generate_json_structured_resume(text_f, chat_llm_json)
            if parsed_json_resume is not None:
                break

        parsed_resumes.append({
            "file_name": uploaded_file.filename,
            "parsed_resume": parsed_resume,
            "parsed_json_resume": parsed_json_resume,
        })

    return parsed_resumes
67
+
68
+
69
if __name__ == "__main__":
    # Local development entry point; inside the container, uvicorn is started
    # by the Dockerfile CMD (on port 7860) instead of this block.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
packages.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ binutils-dev
2
+ build-essential
3
+ ca-certificates
4
+ clang
5
+ g++
6
+ g++-multilib
7
+ gcc-multilib
8
+ libcairo2
9
+ libffi-dev
10
+ libgdk-pixbuf2.0-0
11
+ libglib2.0-dev
12
+ libjpeg-dev
13
+ libleptonica-dev
14
+ libpango-1.0-0
15
+ libpango1.0-dev
16
+ libpangocairo-1.0-0
17
+ libpng-dev
18
+ libsm6
19
+ libtesseract-dev
20
+ libtool
21
+ libxext6
22
+ make
23
+ pkg-config
24
+ poppler-utils
25
+ shared-mime-info
26
+ software-properties-common
27
+ swig
28
+ zlib1g-dev
29
+ tesseract-ocr
30
+ tesseract-ocr-rus
31
+ libgl1
prompt_template.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt for the plain-text (human-readable) resume summary chain.
# Fixes over the original: removed a duplicated "Standardized Job Title" line
# and corrected typos ("there" -> "their", "acsending" -> "ascending", a stray
# parenthesis) that degraded the instructions sent to the model.
template = """
For the following text, extract the following information:

Warning: Don't greet or write any introduction. Just start with the answer to the prompts. Do as per the instructions given in the prompt. If you don't know the answer, leave that part (keep blank) and move to the next part.

1. Education: Extract the names of all universities/colleges attended by the candidate with their CGPA.


2. Work: Extract all organization names where he/she has worked along with the position held, and the duration of employment.
Predicted Skills : Also extract skills based on the work experience.
Standardized Job Title: Identify the standardized job title for each work experience.

3. Projects: Extract the details of the projects the candidate has worked on.
Predicted Skills : Also extract skills based on each project.


4.Skills: Identify the technical and non-technical skills associated with each work experience and project.


5.Career Trajectory: Identify the career progression of the candidate based on their work experience.

Output them in the following format:
Warning: if there is no data for any of the fields, leave it blank.

"Education: " and separate multiple entries with new line .

"Work: " Organization Name, Location, Position, Start Date - End Date 'and separate multiple entries with a comma.
"Job Title: " Identify the job title for each work experience. Clean and strip them off suffixes, prefixes and seniority.

" Predicted Skills : " and separate multiple entries with a comma for each work experience.
Note: Separate each work experience with a new line.
Warning: Don't print this text - "Organization Name, Location, Position, Start Date - End Date" as it is in the output .


"Project Name, Start Date - End Date, Project Description " and separate multiple entries with a comma and a new line for each project.
" Predicted Skills : " and separate multiple entries with a comma for each project.
Note: Project Description should be in 30 to 40 words

Note: Separate each project with a new line.
Warning: Don't print "Project Name, Start Date - End Date, Project Description" as it is (text) in the output .

"Skills: " Skills under the skills section.
Classify them as technical and non-technical skills if possible.

"Career Trajectory: " and separate multiple entries with a -> . Career Trajectory should be in ascending order with respect to date of joining.
eg1 : "Data Analyst -> Data Scientist -> Senior Data Scientist"
eg2 : "School Name -> College Name -> University Name -> Job Title -> Job Title"

Resume: {text}

"""
+
54
+ template_format_instructions = """
55
+ 1. Education: Extract the name of all universities/colleges attended by the candidate with their CGPA. Separate multiple entries with a new line.
56
+ 2. Work: Extract all organization names where he/she has worked along with the position held and the duration of employment.
57
+ Predicted Skills: Also extract skills based on the work experience.
58
+ Standardized Job Title: Identify the standardized job title for each work experience.
59
+ Organization Name, Location, Position, Start Date - End Date. Separate multiple entries with a comma.
60
+ "Job Title:" Identify the job title for each work experience. Clean and strip them of suffixes, prefixes, and seniority.
61
+ 3. Projects: Extract the details of the projects the candidate has worked on.
62
+ "Project Name, Start Date - End Date, Project Description". Separate multiple entries with a comma and a new line for each project.
63
+ "Predicted Skills:" Separate multiple entries with a comma for each project.
64
+ Note: Project Description should be 30 to 40 words.
65
+ Note: Separate each project with a new line.
66
+ 4. Skills: Identify the technical and non-technical skills associated with each work experience and project.
67
+ 5. Techstack: Identify the technologies used from skills and predicted skills.
68
+ 6. Career Trajectory: Identify the career progression of the candidate based on their work experience.
69
+ Separate multiple entries with a "->". Career Trajectory should be in ascending order with respect to the date of joining.
70
+ eg1: "Data Analyst -> Data Scientist -> Senior Data Scientist"
71
+ eg2: "School Name -> College Name -> University Name -> Job Title -> Job Title"
72
+ Warning: Ensure consistent extraction of skills by providing clear context and examples in the resume. Use standardized terms for skills.
73
+ Resume: {text}
74
+ \n{format_instructions}\n
75
+ """
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ torch
3
+ dedoc
4
+ streamlit
5
+ tesseract
6
+ pytesseract
7
+ langchain-openai
8
+ unstructured
9
+ unstructured[pdf]
10
+ opencv-python
11
+ faiss-cpu
12
+ langchain-community
utils.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from dedoc import DedocManager
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.prompts import PromptTemplate
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ from ResumeStructure import ResumeStructure
8
+ from fastapi import UploadFile
9
+ from prompt_template import template_format_instructions, template
10
+ from typing import List
11
+
12
+
13
+
14
+ # Create a directory to store temporary files
15
+ TEMP_DIR = "/temp_files"
16
+ # if not os.path.exists(TEMP_DIR):
17
+ # os.makedirs(TEMP_DIR)
18
+
19
+
20
async def process_file_with_dedoc(file: UploadFile):
    """
    Process the file using Dedoc and return the output data.

    Args:
    - file: The UploadedFile object to be processed.

    Returns:
    - Output data (dict from Dedoc's API schema) if the file is processed
      successfully, None if the file format is unsupported.
    """
    manager = DedocManager()

    supported_formats = ['jpg', 'jpeg', 'png', 'docx', 'pdf', 'html', 'doc']

    print(f"Processing file '{file.filename}'...")

    # Validate the extension BEFORE writing to disk — the original saved the
    # upload first and leaked the temp file on the unsupported-format path.
    _, file_extension = os.path.splitext(file.filename)
    file_extension = file_extension[1:].lower()  # drop the leading dot

    if file_extension not in supported_formats:
        print(f"Cannot process file '{file.filename}'. Unsupported file format.")
        return None

    # Save the upload to a temporary location for Dedoc to read.
    # exist_ok guards against the directory missing outside the container.
    os.makedirs(TEMP_DIR, exist_ok=True)
    file_path = os.path.join(TEMP_DIR, file.filename)
    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    try:
        # Parse with Dedoc and serialize its structured output.
        output = manager.parse(file_path)
        return output.to_api_schema().model_dump()
    finally:
        # Always remove the temp file, even if parsing raises.
        os.remove(file_path)
59
+
60
+
61
async def extract_text_from_all_levels(data):
    """
    Extract text from all levels of subparagraphs in the JSON data.

    Args:
    - data: The JSON data containing subparagraphs.

    Returns:
    - A string containing the text from all levels of subparagraphs
      (empty when the structure has no 'subparagraphs' key).
    """
    structure = data['content']['structure']
    if 'subparagraphs' not in structure:
        return ""
    return await extract_text_from_subparagraphs(structure['subparagraphs'])
77
+
78
+
79
async def extract_text_from_subparagraphs(subparagraphs):
    """
    Recursively extract text from subparagraphs.

    Args:
    - subparagraphs: A list of subparagraph dicts, each with a 'text' key and
      optionally a nested 'subparagraphs' list.

    Returns:
    - A string containing the text from all subparagraphs, one line per node.
    """
    chunks = []
    for node in subparagraphs:
        chunks.append(node['text'] + "\n")
        # Depth-first descent into any nested structure.
        if 'subparagraphs' in node:
            chunks.append(await extract_text_from_subparagraphs(node['subparagraphs']))
    return "".join(chunks)
95
+
96
+
97
def generate_formatted_resume(resume, chat_llm):
    """Run the plain-text formatting prompt over *resume*.

    Returns the LLM response content (a string) produced by piping the
    `template` prompt into *chat_llm*.
    """
    formatting_prompt = PromptTemplate(
        template=template,
        input_variables=["text"],
    )
    pipeline = formatting_prompt | chat_llm
    response = pipeline.invoke({"text": resume})
    return response.content
107
+
108
+
109
def generate_json_structured_resume(resume, chat_llm):
    """Run the JSON-structured prompt over *resume*.

    The LLM output is parsed by a JsonOutputParser bound to the
    ResumeStructure schema, and the parsed result is returned.
    """
    json_parser = JsonOutputParser(pydantic_object=ResumeStructure)

    structured_prompt = PromptTemplate(
        template=template_format_instructions,
        input_variables=["text"],
        partial_variables={"format_instructions": json_parser.get_format_instructions()},
    )
    pipeline = structured_prompt | chat_llm | json_parser
    return pipeline.invoke({"text": resume})
122
+
123
+
124
def delete_files_in_directory(directory):
    """
    Deletes all files in the specified directory.

    Args:
        directory (str): The path to the directory containing files to be deleted.

    Returns:
        None
    """
    # Nothing to do for a missing directory — report and bail out.
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return

    # Remove plain files only; subdirectories are left untouched.
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path):
            os.remove(entry_path)
            print(f"Deleted file: {entry_path}")