Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -29,6 +29,7 @@ def get_pdf_text(pdf_doc):
|
|
29 |
def extracted_data(pages_data):
|
30 |
template = """Please Extract all the following values : invoice no., Description, Quantity, date,
|
31 |
Unit price , Amount, Total, email, phone number and address from this data: {pages}
|
|
|
32 |
Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
|
33 |
"""
|
34 |
prompt_template = PromptTemplate(input_variables=["pages"], template=template)
|
@@ -44,7 +45,7 @@ def extracted_data(pages_data):
|
|
44 |
# input={"prompt":prompt_template.format(pages=pages_data) ,
|
45 |
# "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
|
46 |
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
|
47 |
-
output_text=llm(prompt_template.format(pages=pages_data)
|
48 |
|
49 |
full_response = ''
|
50 |
for item in output_text:
|
@@ -55,13 +56,13 @@ def extracted_data(pages_data):
|
|
55 |
return full_response
|
56 |
|
57 |
#print(raw_data)
|
58 |
-
# print("extracted raw
|
59 |
# llm_extracted_data=extracted_data(raw_data)
|
60 |
#print(llm_extracted_data)
|
61 |
|
62 |
# iterate over files in
|
63 |
# that user uploaded PDF files, one by one
|
64 |
-
def create_docs(filename):
|
65 |
|
66 |
df = pd.DataFrame({'Invoice no.': pd.Series(dtype='str'),
|
67 |
'Description': pd.Series(dtype='str'),
|
@@ -78,14 +79,15 @@ def create_docs(filename):
|
|
78 |
|
79 |
|
80 |
|
81 |
-
for filename in
|
82 |
|
83 |
print(filename)
|
84 |
raw_data=get_pdf_text(filename)
|
85 |
-
print(raw_data)
|
86 |
# print("extracted raw data")
|
87 |
|
88 |
llm_extracted_data=extracted_data(raw_data)
|
|
|
89 |
#print(llm_extracted_data)
|
90 |
#print("llm extracted data")
|
91 |
#Adding items to our list - Adding data & its metadata
|
|
|
29 |
def extracted_data(pages_data):
|
30 |
template = """Please Extract all the following values : invoice no., Description, Quantity, date,
|
31 |
Unit price , Amount, Total, email, phone number and address from this data: {pages}
|
32 |
+
|
33 |
Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
|
34 |
"""
|
35 |
prompt_template = PromptTemplate(input_variables=["pages"], template=template)
|
|
|
45 |
# input={"prompt":prompt_template.format(pages=pages_data) ,
|
46 |
# "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
|
47 |
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
|
48 |
+
output_text=llm(prompt_template.format(pages=pages_data))
|
49 |
|
50 |
full_response = ''
|
51 |
for item in output_text:
|
|
|
56 |
return full_response
|
57 |
|
58 |
#print(raw_data)
|
59 |
+
# print("extracted raw data")
|
60 |
# llm_extracted_data=extracted_data(raw_data)
|
61 |
#print(llm_extracted_data)
|
62 |
|
63 |
# iterate over files in
|
64 |
# that user uploaded PDF files, one by one
|
65 |
+
def create_docs(user_pdf_list):
|
66 |
|
67 |
df = pd.DataFrame({'Invoice no.': pd.Series(dtype='str'),
|
68 |
'Description': pd.Series(dtype='str'),
|
|
|
79 |
|
80 |
|
81 |
|
82 |
+
for filename in user_pdf_list:
|
83 |
|
84 |
print(filename)
|
85 |
raw_data=get_pdf_text(filename)
|
86 |
+
print("pdf_Data",raw_data)
|
87 |
# print("extracted raw data")
|
88 |
|
89 |
llm_extracted_data=extracted_data(raw_data)
|
90 |
+
print("llm_extracted_data",llm_extracted_data)
|
91 |
#print(llm_extracted_data)
|
92 |
#print("llm extracted data")
|
93 |
#Adding items to our list - Adding data & its metadata
|