omdivyatej committed
Commit 85c096c · 1 Parent(s): 9d15510

requirements

Files changed (8)
  1. .env +1 -0
  2. .gitignore +5 -0
  3. ai_json.py +35 -0
  4. app.py +59 -0
  5. categories.csv +10 -0
  6. example_pg3.json +274 -0
  7. ocr_request.py +44 -0
  8. requirements.txt +0 -0
.env ADDED
@@ -0,0 +1 @@
+ OPENAI_API_KEY = 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
.gitignore ADDED
@@ -0,0 +1,5 @@
+ venv
+ flagged
+ __pycache__
+ .gitignore
+ .env
ai_json.py ADDED
@@ -0,0 +1,35 @@
+ import openai
+ import json
+ from dotenv import load_dotenv
+ import os
+
+
+ def handle_creating_json(output_1):
+     print("Before GPT: " , output_1)
+     #output_1= [{'invoice_number': '349136', 'table_data': [{'label': 'Product_Code', 'text': ''}, {'label': 'Description', 'text': '1ST FLOOR WALLS'}, {'label': 'Price', 'text': ''}, {'label': 'Line_Amount', 'text': ''}, {'label': 'Product_Code', 'text': 'CPL1216'}, {'label': 'Description', 'text': "11.875 X 16 ' Pro Lam 2.0 LVL 1.75 ( 7 @ 16 ' , 4 @\n8 ' )"}, {'label': 'Price', 'text': '139.09'}, {'label': 'Line_Amount', 'text': '1,251.81'}, {'label': 'Product_Code', 'text': 'CPL1210'}, {'label': 'Description', 'text': "COLUMN\n11.875 X 10 ' Pro Lam 2.0 LVL 1.75"}, {'label': 'Price', 'text': '87.56'}, {'label': 'Line_Amount', 'text': '525.36'}, {'label': 'Product_Code', 'text': 'CPCB35558'}, {'label': 'Description', 'text': "Power Column 3 1/2 X 5 1/2 - 08 '"}, {'label': 'Price', 'text': '82.51'}, {'label': 'Line_Amount', 'text': '330.04'}]}]
+     load_dotenv()
+     # Initialize OpenAI with your API key
+     openai.api_key = os.getenv("OPENAI_API_KEY")
+
+     prompt = f""" You are an excellent programmer and specialize in the construction industry, knowing everything about building a house. Given a JSON which resembles a table, you have two tasks:
+ 1. extract product description and determine or predict whether product descriptions is Exterior Door/Finish/ Framing/Siding/Windows or Roofing. Think well and do some self reflection. Do not share your thought process with me though.
+ 2. Once you have thought through, produce a json, easily convertible to a dataframe in python, which would contain invoice number, product description, predicted material, confidence ( b/w 0-1, your confidence score which shows how sure are you about your prediction)
+ Remember: You just have to share the output json, no thought process or extra words or anything else. If you are not able to identify the invoice number just write NA.
+ No apologies or regret. Always produce an output.
+
+
+ Here is the json: {json.dumps(output_1)}
+ """
+     messages=[{"role": "user", "content":prompt}]
+     # Use OpenAI to generate a completion using GPT-4 (replace 'gpt-4.0-turbo' with the correct engine ID once available)
+     response = openai.ChatCompletion.create(
+         model="gpt-4",
+         max_tokens=5000,
+         temperature=0,
+         messages = messages
+     )
+     # Extracting the result
+     result = response.choices[0]["message"]["content"]
+     print("After gpt")
+     print(json.loads(result))
+     return json.loads(result)
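Note that the call above targets the legacy 0.x openai SDK (a module-level api_key plus openai.ChatCompletion.create). If the pinned requirements ever move to openai>=1.0, the same request would look roughly like the sketch below; handle_creating_json_v1 is a hypothetical name and the prompt string is assumed to be built exactly as in ai_json.py.

# Hedged sketch only: openai>=1.0 client equivalent of the ChatCompletion call above.
# handle_creating_json_v1 is a hypothetical helper name; the prompt is assumed unchanged.
import json
from openai import OpenAI

def handle_creating_json_v1(output_1, prompt):
    client = OpenAI()  # reads OPENAI_API_KEY from the environment / .env
    response = client.chat.completions.create(
        model="gpt-4",
        temperature=0,
        messages=[{"role": "user", "content": prompt}],
    )
    # The prompt instructs the model to return bare JSON, so parse it directly
    return json.loads(response.choices[0].message.content)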
app.py ADDED
@@ -0,0 +1,59 @@
+ # app.py
+ import gradio as gr
+ import pandas as pd # Import pandas
+ from ocr_request import ocr_request
+ import io
+
+ def process_file(files):
+     response_arr = []
+     # Send the uploaded file to the function from ocr_request.py
+     for file in files:
+         response = ocr_request(file.name)
+         response_arr.append(response)
+
+     print("Main file :", response_arr)
+
+     #i= [[{'invoice_number': '349136', 'product_description': '1ST FLOOR WALLS', 'predicted_material': 'Framing', 'confidence': 0.8}, {'invoice_number': '349136', 'product_description': "11.875 X 16 ' Pro Lam 2.0 LVL 1.75 ( 7 @ 16 ' , 4 @\n8 ' )", 'predicted_material': 'Framing', 'confidence': 0.9}, {'invoice_number': '349136', 'product_description': "COLUMN\n11.875 X 10 ' Pro Lam 2.0 LVL 1.75", 'predicted_material': 'Framing', 'confidence': 0.9}, {'invoice_number': '3495565136', 'product_description': "Power Column 3 1/2 X 5 1/2 - 08 '", 'predicted_material': 'Framing', 'confidence': 0.9}],[{'invoice_number': '349136', 'product_description': ' FLOOR WALLS', 'predicted_material': 'Framing', 'confidence': 0.8}, {'invoice_number': '349136', 'product_description': "11.875 X 16 ' Pro Lam 2.0 LVL 1.75 ( 7 @ 16 ' , 4 @\n8 ' )", 'predicted_material': 'Framing', 'confidence': 0.9}, {'invoice_number': '349136', 'product_description': "COLUMN\n11.875 X 10 ' Pro Lam 2.0 LVL 1.75", 'predicted_material': 'Framing', 'confidence': 0.9}, {'invoice_number': '349136', 'product_description': "Power Column 3 1/2 X 5 1/2 - 08 '", 'predicted_material': 'Framing', 'confidence': 0.9}]]
+     flat_list = []
+
+     for item in response_arr:
+         invoice_number = item['invoice_number']
+
+         # Extracting product descriptions
+         products = item.get('predictions', []) or item.get('product_description', [])
+
+         for product in products:
+             # Rename 'description' key to 'product_description' for uniformity across all products
+             product_description = product.get('product_description', product.get('description'))
+             predicted_material = product['predicted_material']
+             confidence = product['confidence']
+
+             flat_list.append({
+                 'invoice_number': invoice_number,
+                 'product_description': product_description,
+                 'predicted_material': predicted_material,
+                 'confidence': confidence
+             })
+
+     df = pd.DataFrame(flat_list)
+
+     print("Df final : ", df)
+     # Save the dataframe to a CSV in-memory
+
+     result_csv = df.to_csv(index=False)
+
+     csv_filename = "categories.csv"
+     with open(csv_filename, "w") as f:
+         f.write(result_csv)
+
+     return df,csv_filename # Gradio will display this as a table
+
+
+
+ interface = gr.Interface(fn=process_file,
+                          inputs=gr.inputs.File(label="Upload a File", file_count='multiple'),
+                          outputs=["dataframe",gr.outputs.File(label="Download CSV")]) # Specify "dataframe" as output type
+
+ interface.launch(share=True)
+
+
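gr.inputs.File and gr.outputs.File are the pre-3.0 Gradio namespaces and were removed in later releases. A minimal sketch of the same interface wiring on Gradio 3.x+, assuming only the component classes change and process_file stays as above:

# Hedged sketch only: Gradio 3.x+ equivalent of the gr.Interface wiring above.
interface = gr.Interface(
    fn=process_file,
    inputs=gr.File(label="Upload a File", file_count="multiple"),
    outputs=[gr.Dataframe(label="Predictions"), gr.File(label="Download CSV")],
)
interface.launch(share=True)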
categories.csv ADDED
@@ -0,0 +1,10 @@
+ invoice_number,product_description,predicted_material,confidence
+ 349136,1ST FLOOR WALLS,Framing,0.8
+ 349136,"11.875 X 16 ' Pro Lam 2.0 LVL 1.75 ( 7 @ 16 ' , 4 @
+ 8 ' )",Framing,0.9
+ 349136,"COLUMN
+ 11.875 X 10 ' Pro Lam 2.0 LVL 1.75",Framing,0.9
+ 349136,Power Column 3 1/2 X 5 1/2 - 08 ',Framing,0.9
+ 351500,QUOTE # 4323800,NA,0.0
+ 351500,11.875 X 18 ' Pro Lam 2.0 LVL 1.75,Framing,0.9
+ 351500,11.875 X 22 ' Pro Lam 2.0 LVL 1.75,Framing,0.9
example_pg3.json ADDED
@@ -0,0 +1,274 @@
+ {
+   "message": "Success",
+   "result": [
+     {
+       "message": "Success",
+       "input": "page-3.pdf",
+       "prediction": [
+         {
+           "id": "d263e93d-be6b-4642-b246-5bb8a90b8ba2",
+           "label": "table",
+           "xmin": 216,
+           "ymin": 1137,
+           "xmax": 2445,
+           "ymax": 1594,
+           "score": 1,
+           "ocr_text": "table",
+           "type": "table",
+           "cells": [],
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": ""
+         },
+         {
+           "id": "57ef19ae-73b9-4e60-92af-5d15bbe13e32",
+           "label": "buyer_address",
+           "xmin": 100,
+           "ymin": 582,
+           "xmax": 491,
+           "ymax": 663,
+           "score": 0.766381,
+           "ocr_text": "174 DORCHESTER ST\nBOSTON , MA , 02127",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "12808214-6c3c-4c6f-bcff-01ce5a7596c6"
+         },
+         {
+           "id": "615d6c14-60fe-4f7c-9e84-9c5cbbc2649e",
+           "label": "subtotal",
+           "xmin": 2304,
+           "ymin": 2611,
+           "xmax": 2416,
+           "ymax": 2638,
+           "score": 0.48327795,
+           "ocr_text": "2,107.21",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "0ea778c5-d7e9-46cc-b212-577f099babd8"
+         },
+         {
+           "id": "fb669362-edbe-4f49-a4f9-126f7fe255d6",
+           "label": "total_tax",
+           "xmin": 2313,
+           "ymin": 2707,
+           "xmax": 2419,
+           "ymax": 2732,
+           "score": 0.8847264,
+           "ocr_text": "$ 131.70",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "cc9cf54f-e3f6-4898-b82b-3a10002bfdc6"
+         },
+         {
+           "id": "aa4c1de7-7c31-4b21-bd8a-8930adcf6cac",
+           "label": "invoice_date",
+           "xmin": 2299,
+           "ymin": 307,
+           "xmax": 2446,
+           "ymax": 329,
+           "score": 0.9948618,
+           "ocr_text": "04/12/2023",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "ded9e5da-118c-48dd-bae1-97cb95df0976"
+         },
+         {
+           "id": "d99687df-518a-4e7a-bbb3-38fcb4a192f7",
+           "label": "net_d",
+           "xmin": 2323,
+           "ymin": 358,
+           "xmax": 2444,
+           "ymax": 379,
+           "score": 0.47389212,
+           "ocr_text": "2nd EOM",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "790b65b6-25ef-41d7-a8ad-a93a36d02dd4"
+         },
+         {
+           "id": "6e73864b-4d34-4e49-9f4f-a010b2b771a7",
+           "label": "shipto_address",
+           "xmin": 850,
+           "ymin": 485,
+           "xmax": 1228,
+           "ymax": 562,
+           "score": 0.42755184,
+           "ocr_text": "BENNINGTON ST\n490 BENNINGTON ST",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "fc485f02-f2f1-4975-9096-579508235896"
+         },
+         {
+           "id": "3b148335-3c00-4559-8e45-901f66700ebe",
+           "label": "shipto_address",
+           "xmin": 1004,
+           "ymin": 581,
+           "xmax": 1013,
+           "ymax": 613,
+           "score": 0.4113207,
+           "ocr_text": ",",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "fc485f02-f2f1-4975-9096-579508235896"
+         },
+         {
+           "id": "2d7f8055-8a9d-435e-9c68-6bc86deb44bd",
+           "label": "currency",
+           "xmin": 0,
+           "ymin": 0,
+           "xmax": 0,
+           "ymax": 0,
+           "score": 1,
+           "ocr_text": "USD",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "7f166b2d-ed55-47eb-bc4b-510121dd1772"
+         },
+         {
+           "id": "e11052f5-eb47-473c-914b-ced66972a18c",
+           "label": "seller_name",
+           "xmin": 951,
+           "ymin": 145,
+           "xmax": 1420,
+           "ymax": 183,
+           "score": 0.97490394,
+           "ocr_text": "Dartmouth Building Supply",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "47b8254f-2bdf-4834-89d2-8c4e824f5351"
+         },
+         {
+           "id": "4b4b2b95-50ac-4ca2-bba4-2366b6cd7d15",
+           "label": "invoice_number",
+           "xmin": 2252,
+           "ymin": 231,
+           "xmax": 2446,
+           "ymax": 274,
+           "score": 0.99667007,
+           "ocr_text": "349136",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "59b94f5f-ac8f-4cb6-8c54-41cb14528f58"
+         },
+         {
+           "id": "0a9f5423-bfd7-4f0a-a904-fd87fe98d8ad",
+           "label": "payment_due_date",
+           "xmin": 461,
+           "ymin": 2612,
+           "xmax": 620,
+           "ymax": 2635,
+           "score": 0.98759484,
+           "ocr_text": "06/30/2023",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "cd67bda9-5ec4-44b6-a93f-67319f4b9105"
+         },
+         {
+           "id": "14c59f5a-86bc-420b-95b6-e199c9c4a7e1",
+           "label": "buyer_name",
+           "xmin": 100,
+           "ymin": 485,
+           "xmax": 460,
+           "ymax": 561,
+           "score": 0.95362574,
+           "ocr_text": "Elevated Construction\nLLC",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "59ca9238-c882-4108-842b-3ac09ea1ec32"
+         },
+         {
+           "id": "ab0bff8d-8d25-42b1-8d45-ada931b131f1",
+           "label": "invoice_amount",
+           "xmin": 2288,
+           "ymin": 2614,
+           "xmax": 2303,
+           "ymax": 2638,
+           "score": 0.42010704,
+           "ocr_text": "$",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "39a22a11-370c-401c-bf6e-b4f4fb14bdea"
+         },
+         {
+           "id": "ffa71a20-d784-48bb-9f5e-0af113c22375",
+           "label": "invoice_amount",
+           "xmin": 2289,
+           "ymin": 2808,
+           "xmax": 2416,
+           "ymax": 2833,
+           "score": 0.80036026,
+           "ocr_text": "$ 2,238.91",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "39a22a11-370c-401c-bf6e-b4f4fb14bdea"
+         },
+         {
+           "id": "e3c8f3db-b8d1-426e-a0f6-3e793a52da25",
+           "label": "seller_address",
+           "xmin": 949,
+           "ymin": 211,
+           "xmax": 1425,
+           "ymax": 288,
+           "score": 0.72980326,
+           "ocr_text": "958 Reed Road\nNorth Dartmouth , MA 02747",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "22eda91b-4613-4122-afbe-e1db97103f83"
+         },
+         {
+           "id": "7c41042b-42a4-4e6b-83ea-ecb3aab57f87",
+           "label": "seller_phone",
+           "xmin": 949,
+           "ymin": 308,
+           "xmax": 1181,
+           "ymax": 334,
+           "score": 0.6113706,
+           "ocr_text": "508-990-2389",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "5e5d4507-bdab-4d74-b215-b1abf1834caf"
+         },
+         {
+           "id": "b1f5e03d-8d9a-4637-b261-01017200f3ab",
+           "label": "client_id",
+           "xmin": 2383,
+           "ymin": 408,
+           "xmax": 2449,
+           "ymax": 429,
+           "score": 0.81289464,
+           "ocr_text": "6529",
+           "type": "field",
+           "status": "correctly_predicted",
+           "page_no": 0,
+           "label_id": "913a5f4d-30f0-41fa-80fd-fe5cd49c8660"
+         }
+       ],
+       "page": 0,
+       "request_file_id": "6f6d4af0-9b10-4df4-96ab-d2d60273d9bc",
+       "filepath": "uploadedfiles/99a96f48-fa67-461d-a17d-8475af701b17/PredictionImages/8f144b01-7377-4fd9-8cd3-474ce8fa5ec1-1.jpeg",
+       "id": "12d51dd1-6155-11ee-b209-f2f6095ec3b8",
+       "rotation": 0,
+       "file_url": "uploadedfiles/99a96f48-fa67-461d-a17d-8475af701b17/RawPredictions/6f6d4af0-9b10-4df4-96ab-d2d60273d9bc.pdf",
+       "request_metadata": "",
+       "processing_type": "sync",
+       "size": { "width": 2550, "height": 3300 }
+     }
+   ]
+
+ }
ocr_request.py ADDED
@@ -0,0 +1,44 @@
+ # t.py
+ import requests
+ import openai
+ import json
+ import ai_json
+
+
+ def ocr_request(file_path):
+     url = 'https://app.nanonets.com/api/v2/OCR/Model/99a96f48-fa67-461d-a17d-8475af701b17/LabelFile/?async=false'
+     data = {'file': open(file_path, 'rb')}
+     response = requests.post(url, auth=requests.auth.HTTPBasicAuth('12ac2745-5e44-11ee-bb98-ea6b2bf28c31', ''), files=data)
+     response = response.json()
+     response_data = response["result"][0]["prediction"]
+
+     for element in response_data:
+         if element['label'] == 'table':
+             table_data = element['cells']
+         elif element['label'] == 'invoice_number':
+             invoice_number = element['ocr_text']
+
+     output_1 = {
+         'invoice_number': invoice_number,
+         'table_data': table_data
+     }
+
+     keys_to_remove = [
+         'xmin', 'ymin', 'xmax', 'ymax', 'id', 'label_id', 'verification_status',
+         'failed_validation', 'status', 'score', 'row_label', 'col_span',
+         'row_span', 'row', 'col']
+
+     for item in output_1["table_data"]:
+         for key in keys_to_remove:
+             item.pop(key, None)
+
+     print("Before sending to gpt", output_1)
+     gpt_response = ai_json.handle_creating_json(output_1)
+
+     return gpt_response
+
+
+
+
+
+
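As written, ocr_request never closes the uploaded file handle and raises a NameError whenever the Nanonets response contains no table or invoice_number prediction. A hedged sketch of the same request with a context manager and defaults; ocr_request_safe is a hypothetical name, while the endpoint, auth key, and field names are taken from the diff above:

# Hedged sketch only: same Nanonets call with the file handle closed and missing fields tolerated.
import requests

def ocr_request_safe(file_path):
    url = 'https://app.nanonets.com/api/v2/OCR/Model/99a96f48-fa67-461d-a17d-8475af701b17/LabelFile/?async=false'
    with open(file_path, 'rb') as f:
        response = requests.post(
            url,
            auth=requests.auth.HTTPBasicAuth('12ac2745-5e44-11ee-bb98-ea6b2bf28c31', ''),
            files={'file': f},
        )
    response.raise_for_status()
    predictions = response.json()["result"][0]["prediction"]

    invoice_number = "NA"  # matches the prompt's fallback when no invoice number is found
    table_data = []
    for element in predictions:
        if element['label'] == 'table':
            table_data = element['cells']
        elif element['label'] == 'invoice_number':
            invoice_number = element['ocr_text']
    return {'invoice_number': invoice_number, 'table_data': table_data}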
requirements.txt ADDED
Binary file (2.44 kB).