# PDFReader / backend.py
# Revision 253659d ("Update") by thisisdev, 2.57 kB.
import os
import re
import pandas as pd
from pypdf import PdfReader
from typing import List, Dict
from langchain.prompts import PromptTemplate
from langchain_google_genai import GoogleGenerativeAI
# SECURITY: a live Google API key was hard-coded on this line. Secrets must
# never be committed to source control: treat that key as compromised, revoke
# it, and supply a fresh one through the GOOGLE_API_KEY environment variable
# (for example as a deployment/Space secret) before this module is used.
class InvoicePipeline:
def __init__(self, paths):
    """Set up the pipeline for a collection of invoice PDFs.

    Args:
        paths: Iterable of PDF file paths; ``run`` iterates over it.

    Raises:
        ValueError: If the ``GOOGLE_API_KEY`` environment variable is unset
            or empty.
    """
    # The key is read from the environment rather than from an undefined
    # ``api_key`` name, and its absence is reported up front instead of
    # surfacing later as an opaque authentication failure.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError(
            "GOOGLE_API_KEY environment variable is not set; "
            "it is required to call the Gemini model."
        )
    self._paths = paths
    self._llm = GoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=api_key)
    self._prompt_template = self._get_default_prompt_template()
# This function runs the extraction over every PDF and returns the results as a DataFrame.
def run(self) -> pd.DataFrame:
    """Extract invoice fields from every PDF and collect them in a DataFrame.

    For each path in ``self._paths`` the PDF text is read, sent to the LLM,
    and the parsed response is appended as one row.

    Returns:
        A DataFrame with one row per path. With no paths, an empty frame
        carrying the expected columns is returned.
    """
    # Column layout of the result. The keys are kept exactly as the prompt
    # names them.
    # NOTE(review): "Issue Data" is presumably meant to be "Issue Date";
    # left unchanged because the column name is part of the output.
    schema = pd.DataFrame(
        {
            "Invoice ID": pd.Series(dtype="int"),
            "DESCRIPTION": pd.Series(dtype="str"),
            "Issue Data": pd.Series(dtype="str"),
            "UNIT PRICE": pd.Series(dtype="str"),
            "AMOUNT": pd.Series(dtype="int"),
            "Bill For": pd.Series(dtype="str"),
            "From": pd.Series(dtype="str"),
            "Terms": pd.Series(dtype="str"),
        }
    )

    # Collect one single-row frame per invoice and concatenate once at the
    # end; concatenating inside the loop re-copies all earlier rows each time.
    frames = [schema]
    for path in self._paths:
        raw_text = self._get_raw_text_from_pdf(path)
        llm_resp = self._extract_data_from_llm(raw_text)
        data = self._parse_response(llm_resp)
        frames.append(pd.DataFrame([data]))

    if len(frames) == 1:
        return schema
    return pd.concat(frames, ignore_index=True)
# The default prompt template that is sent to the LLM.
def _get_default_prompt_template(self) -> PromptTemplate:
    """Build the prompt that asks the LLM to extract the invoice fields.

    The template has one input variable, ``pages``, which receives the raw
    invoice text. Doubled braces are literal braces in the rendered prompt.

    Returns:
        A ``PromptTemplate`` listing the fields to extract, followed by an
        example of the expected output.
    """
    # The example output uses exactly the requested field names, once each.
    # The previous example repeated "AMOUNT", used "Date" instead of
    # "Issue Data" and had "UNIT PRICE" as a value, so it contradicted the
    # instruction line above it.
    template = (
        "Extract all the following values: Invoice ID, DESCRIPTION, "
        "Issue Data,UNIT PRICE, AMOUNT, Bill For, From and Terms for: {pages}\n"
        "Expected Outcome: remove any dollar symbols "
        '{{"Invoice ID": "12341234", "DESCRIPTION": "Consulting services", '
        '"Issue Data": "2/1/2021", "UNIT PRICE": "3", "AMOUNT": "100", '
        '"Bill For": "Dev", "From": "Coca Cola", "Terms": "Net for 30 days"}}\n'
    )
    return PromptTemplate(input_variables=["pages"], template=template)
# Extract the text of every page of the PDF into a single string.
def _get_raw_text_from_pdf(self, path: str) -> str:
    """Return the concatenated text of every page in the PDF at ``path``.

    Args:
        path: Location of the PDF file, passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages joined in page order, with no
        separator. Pages that yield no text contribute an empty string.
    """
    pdf_reader = PdfReader(path)
    # A PdfReader is not iterable itself; its pages are exposed via ``.pages``.
    # ``or ""`` guards against an extractor returning None for a page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def _extract_data_from_llm(self, raw_data: str) -> str:
    """Send the invoice text to the LLM and return its response.

    Args:
        raw_data: Raw invoice text; substituted for ``pages`` in the prompt
            template.

    Returns:
        Whatever the LLM returns for the rendered prompt.
    """
    prompt = self._prompt_template.format(pages=raw_data)
    # Calling the model object directly is deprecated in current LangChain
    # releases; use ``invoke`` when available and fall back to the call form
    # for older versions.
    invoke = getattr(self._llm, "invoke", None)
    if invoke is not None:
        return invoke(prompt)
    return self._llm(prompt)