Hemasagar committed
Commit 2dc3e0e · verified · 1 Parent(s): e168eb0

Update utils.py

Files changed (1): utils.py (+7, -5)
utils.py CHANGED

@@ -29,6 +29,7 @@ def get_pdf_text(pdf_doc):
 def extracted_data(pages_data):
     template = """Please Extract all the following values : invoice no., Description, Quantity, date,
     Unit price , Amount, Total, email, phone number and address from this data: {pages}
+
     Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
     """
     prompt_template = PromptTemplate(input_variables=["pages"], template=template)
@@ -44,7 +45,7 @@ def extracted_data(pages_data):
     # input={"prompt":prompt_template.format(pages=pages_data) ,
     # "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
     llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
-    output_text=llm(prompt_template.format(pages=pages_data),temperature =0.1, top_p=0.9, repetition_penalty=1)
+    output_text=llm(prompt_template.format(pages=pages_data))

     full_response = ''
     for item in output_text:
@@ -55,13 +56,13 @@ def extracted_data(pages_data):
     return full_response

 #print(raw_data)
-# print("extracted raw data")
+# print("extracted raw data")
 # llm_extracted_data=extracted_data(raw_data)
 #print(llm_extracted_data)

 # iterate over files in
 # that user uploaded PDF files, one by one
-def create_docs(filename):
+def create_docs(user_pdf_list):

     df = pd.DataFrame({'Invoice no.': pd.Series(dtype='str'),
                        'Description': pd.Series(dtype='str'),
@@ -78,14 +79,15 @@ def create_docs(filename):



-    for filename in filename:
+    for filename in user_pdf_list:

         print(filename)
         raw_data=get_pdf_text(filename)
-        print(raw_data)
+        print("pdf_Data",raw_data)
         # print("extracted raw data")

         llm_extracted_data=extracted_data(raw_data)
+        print("llm_extracted_data",llm_extracted_data)
         #print(llm_extracted_data)
         #print("llm extracted data")
         #Adding items to our list - Adding data & its metadata
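
The diff ends at the "Adding items to our list" comment, so for context here is a minimal sketch, not part of this commit, of how the string returned by the updated extracted_data() could be parsed into a row of the DataFrame built in create_docs(). The parse_llm_output helper and the regex-plus-literal_eval approach are assumptions for illustration only:

import ast
import re

import pandas as pd  # only needed for the commented usage at the bottom

def parse_llm_output(full_response):
    """Find the first {...} block in the model's reply and parse it into a dict."""
    match = re.search(r"\{.*\}", full_response, re.DOTALL)
    if match is None:
        return {}
    try:
        # literal_eval handles the single-quoted dict shown in the prompt's expected output
        return ast.literal_eval(match.group(0))
    except (ValueError, SyntaxError):
        return {}

# Hypothetical usage at the end of the loop in create_docs():
# row = parse_llm_output(llm_extracted_data)
# df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

Pulling out only the first {...} block guards against any extra text the chat model emits around the dictionary before it is appended to the DataFrame.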