Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
@@ -29,6 +29,7 @@ def get_pdf_text(pdf_doc):
|
|
29 |
def extracted_data(pages_data):
|
30 |
template = """Please Extract all the following values : invoice no., Description, Quantity, date,
|
31 |
Unit price , Amount, Total, email, phone number and address from this data: {pages}
|
|
|
32 |
Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
|
33 |
"""
|
34 |
prompt_template = PromptTemplate(input_variables=["pages"], template=template)
|
@@ -44,7 +45,7 @@ def extracted_data(pages_data):
|
|
44 |
# input={"prompt":prompt_template.format(pages=pages_data) ,
|
45 |
# "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
|
46 |
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
|
47 |
-
output_text=llm(prompt_template.format(pages=pages_data)
|
48 |
|
49 |
full_response = ''
|
50 |
for item in output_text:
|
@@ -55,13 +56,13 @@ def extracted_data(pages_data):
|
|
55 |
return full_response
|
56 |
|
57 |
#print(raw_data)
|
58 |
-
# print("extracted raw
|
59 |
# llm_extracted_data=extracted_data(raw_data)
|
60 |
#print(llm_extracted_data)
|
61 |
|
62 |
# iterate over files in
|
63 |
# that user uploaded PDF files, one by one
|
64 |
-
def create_docs(filename):
|
65 |
|
66 |
df = pd.DataFrame({'Invoice no.': pd.Series(dtype='str'),
|
67 |
'Description': pd.Series(dtype='str'),
|
@@ -78,14 +79,15 @@ def create_docs(filename):
|
|
78 |
|
79 |
|
80 |
|
81 |
-
for filename in
|
82 |
|
83 |
print(filename)
|
84 |
raw_data=get_pdf_text(filename)
|
85 |
-
print(raw_data)
|
86 |
# print("extracted raw data")
|
87 |
|
88 |
llm_extracted_data=extracted_data(raw_data)
|
|
|
89 |
#print(llm_extracted_data)
|
90 |
#print("llm extracted data")
|
91 |
#Adding items to our list - Adding data & its metadata
|
|
|
29 |
def extracted_data(pages_data):
|
30 |
template = """Please Extract all the following values : invoice no., Description, Quantity, date,
|
31 |
Unit price , Amount, Total, email, phone number and address from this data: {pages}
|
32 |
+
|
33 |
Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
|
34 |
"""
|
35 |
prompt_template = PromptTemplate(input_variables=["pages"], template=template)
|
|
|
45 |
# input={"prompt":prompt_template.format(pages=pages_data) ,
|
46 |
# "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
|
47 |
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
|
48 |
+
output_text=llm(prompt_template.format(pages=pages_data))
|
49 |
|
50 |
full_response = ''
|
51 |
for item in output_text:
|
|
|
56 |
return full_response
|
57 |
|
58 |
#print(raw_data)
|
59 |
+
# print("extracted raw data")
|
60 |
# llm_extracted_data=extracted_data(raw_data)
|
61 |
#print(llm_extracted_data)
|
62 |
|
63 |
# iterate over files in
|
64 |
# that user uploaded PDF files, one by one
|
65 |
+
def create_docs(user_pdf_list):
|
66 |
|
67 |
df = pd.DataFrame({'Invoice no.': pd.Series(dtype='str'),
|
68 |
'Description': pd.Series(dtype='str'),
|
|
|
79 |
|
80 |
|
81 |
|
82 |
+
for filename in user_pdf_list:
|
83 |
|
84 |
print(filename)
|
85 |
raw_data=get_pdf_text(filename)
|
86 |
+
print("pdf_Data",raw_data)
|
87 |
# print("extracted raw data")
|
88 |
|
89 |
llm_extracted_data=extracted_data(raw_data)
|
90 |
+
print("llm_extracted_data",llm_extracted_data)
|
91 |
#print(llm_extracted_data)
|
92 |
#print("llm extracted data")
|
93 |
#Adding items to our list - Adding data & its metadata
|