gosign committed on
Commit
f19b84d
·
verified ·
1 Parent(s): 378c2e9

Update analyzePdf.py

Browse files
Files changed (1) hide show
  1. analyzePdf.py +239 -110
analyzePdf.py CHANGED
@@ -1,124 +1,253 @@
1
- from flask import Flask, request, jsonify
2
- import requests
3
- import time
4
  import json
5
- import supabase
 
 
 
 
 
 
 
 
6
  import logging
7
 
8
  # Configure logging
9
  logging.basicConfig(level=logging.INFO)
10
 
11
- # Azure Document Intelligence setup
12
- AZURE_ENDPOINT = "https://gosignpdf.cognitiveservices.azure.com/"
13
- AZURE_KEY = "2nUifMPmbS35qkiFr5OjgzDw7ooE5Piw5892GQgyWZHe0oNRIBJHJQQJ99AKACfhMk5XJ3w3AAALACOGkANC"
14
-
15
- # Supabase setup
16
- SUPABASE_URL = "https://dtzuqtvroalrjhgdcowq.supabase.co/"
17
- SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImR0enVxdHZyb2FscmpoZ2Rjb3dxIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjU0NDk3MzIsImV4cCI6MjA0MTAyNTczMn0.WrIvwEOq4CqCb8IkU8G4jiWkf9DM1JxGd2_aTN4vlV4"
18
- supabase_client = supabase.create_client(SUPABASE_URL, SUPABASE_KEY)
19
 
20
  app = Flask(__name__)
21
 
22
- def log_debug(message, **kwargs):
23
- """Log debug messages for tracking."""
24
- print(f"[DEBUG] {message}")
25
- if kwargs:
26
- for key, value in kwargs.items():
27
- print(f" - {key}: {value}")
28
-
29
- def download_file_from_supabase(file_path):
30
- """Download file from Supabase storage."""
31
- log_debug("Downloading file from Supabase", file_path=file_path)
32
- response = supabase_client.storage.from_("files").download(file_path)
33
- log_debug("Supabase download response", status_code=response.status_code, text=response.text)
34
- if response.status_code != 200:
35
- raise Exception(f"Failed to download file from Supabase: {response.text}")
36
- return response.content
37
-
38
- def analyze_pdf_layout(file_content):
39
- """Send PDF to Azure and get layout data."""
40
- log_debug("Sending PDF to Azure for analysis")
41
- url = f"{AZURE_ENDPOINT}/formrecognizer/documentModels/prebuilt-layout:analyze?api-version=2023-07-31"
42
- headers = {
43
- "Ocp-Apim-Subscription-Key": AZURE_KEY,
44
- "Content-Type": "application/pdf",
45
- }
46
-
47
- response = requests.post(url, headers=headers, data=file_content)
48
- log_debug("Azure response", status_code=response.status_code, headers=response.headers)
49
- if response.status_code != 202:
50
- raise Exception(f"Azure request failed: {response.text}")
51
-
52
- operation_location = response.headers.get("Operation-Location")
53
- log_debug("Azure operation location", operation_location=operation_location)
54
- if not operation_location:
55
- raise Exception("Operation-Location header not found in response.")
56
-
57
- while True:
58
- result_response = requests.get(operation_location, headers={"Ocp-Apim-Subscription-Key": AZURE_KEY})
59
- result = result_response.json()
60
- log_debug("Azure polling result", status=result.get("status"))
61
-
62
- if result.get("status") == "succeeded":
63
- log_debug("Azure analysis succeeded")
64
- return result["analyzeResult"]
65
- elif result.get("status") == "failed":
66
- raise Exception("Analysis failed.")
67
- time.sleep(8)
68
-
69
- @app.route("/analyze", methods=["POST"])
70
- def analyze():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  try:
72
- # Get file ID from request
73
- file_id = request.json.get("file_id")
74
- log_debug("Received API request", file_id=file_id)
75
- if not file_id:
76
- return jsonify({"error": "File ID is required"}), 400
77
-
78
- # Fetch file path from Supabase
79
- file_data = supabase_client.table("files").select("filePath").eq("id", file_id).single().execute()
80
- log_debug("Supabase file data response", status_code=file_data.status_code, data=file_data.data)
81
- if file_data.status_code != 200 or not file_data.data:
82
- return jsonify({"error": "File not found"}), 404
83
-
84
- file_path = file_data.data["filePath"]
85
- log_debug("File path retrieved from Supabase", file_path=file_path)
86
-
87
- # Download the file from Supabase
88
- file_content = download_file_from_supabase(file_path)
89
-
90
- # Analyze the PDF layout with Azure
91
- layout_data = analyze_pdf_layout(file_content)
92
- log_debug("Layout data retrieved", layout_data=layout_data)
93
-
94
- # Extract required layout values
95
- page_data = layout_data.get("pages", [])[0] # Assuming single-page PDF for simplicity
96
- first_word = page_data.get("words", [])[0]
97
- last_word = page_data.get("words", [])[-1]
98
-
99
- page_height = page_data["height"]
100
- page_width = page_data["width"]
101
- x1 = first_word["polygon"][0] # X1 of first word
102
- y4 = last_word["polygon"][-1] # Y4 of last word
103
- log_debug("Extracted layout values", page_height=page_height, page_width=page_width, x1=x1, y4=y4)
104
-
105
- # Update the `files` table in Supabase
106
- update_response = supabase_client.table("files").update({
107
- "page_height": page_height,
108
- "page_width": page_width,
109
- "x1": x1,
110
- "y4": y4,
111
- }).eq("id", file_id).execute()
112
- log_debug("Supabase update response", status_code=update_response.status_code, data=update_response.data)
113
-
114
- if update_response.status_code != 200:
115
- return jsonify({"error": "Failed to update file layout data"}), 500
116
-
117
- return jsonify({"message": "Layout data successfully updated"}), 200
118
 
 
 
 
 
 
119
  except Exception as e:
120
- log_debug("Error occurred", error=str(e))
121
  return jsonify({"error": str(e)}), 500
122
 
123
- if __name__ == "__main__":
124
- app.run(host="0.0.0.0", port=8000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
+ import os
3
+ import magic
4
+ from dotenv import load_dotenv
5
+ from docx import Document
6
+ from docx.shared import Inches
7
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
8
+ from flask_cors import CORS
9
+ from flask import Flask, request, jsonify
10
+ from supabase import create_client, Client
11
  import logging
12
 
13
  # Configure logging
14
  logging.basicConfig(level=logging.INFO)
15
 
16
# load_dotenv(dotenv_path='.env.local')
load_dotenv()

app = Flask(__name__)

CORS(app)

# Supabase connection settings. Values from the environment (populated by
# load_dotenv() above) take precedence; the literals are kept only as a
# backward-compatible fallback.
# NOTE(review): the fallback key is committed to the repository -- move it to
# the environment and drop the literal once deployments set SUPABASE_KEY.
url: str = os.environ.get("SUPABASE_URL", 'https://dtzuqtvroalrjhgdcowq.supabase.co/')
key: str = os.environ.get("SUPABASE_KEY", 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImR0enVxdHZyb2FscmpoZ2Rjb3dxIiwicm9sZSI6ImFub24iLCJpYXQiOjE3MjU0NDk3MzIsImV4cCI6MjA0MTAyNTczMn0.WrIvwEOq4CqCb8IkU8G4jiWkf9DM1JxGd2_aTN4vlV4')

supabase: Client = create_client(url, key)
27
+
28
def get_file_by_id(file_id):
    """Fetch a file's metadata row and its stored content from Supabase.

    Selects the single row of the ``files`` table whose ``id`` equals
    *file_id*, then downloads the object stored under that row's
    ``file_path`` from the ``files`` storage bucket.

    Args:
        file_id: Value matched against the ``id`` column of ``files``.

    Returns:
        A ``(file_name, file_data)`` tuple: the row's ``name`` value and
        whatever the storage client's ``download`` call returns.

    Raises:
        ValueError: If no row is returned or the row has no ``file_path``.
        Exception: Any other client or storage error, logged and re-raised.
    """
    try:
        response = supabase.table("files").select("*").eq("id", file_id).single().execute()
        record = response.data

        if not record:
            # ``error`` is read defensively: the attribute may be absent on
            # the response object.
            error = getattr(response, "error", None)
            raise ValueError(getattr(error, "message", None) or "File not found.")

        file_path = record.get("file_path")
        file_name = record.get("name")

        if not file_path:
            raise ValueError("File path is missing in the metadata.")

        # Fetch the actual file content from Supabase storage
        file_data = supabase.storage.from_('files').download(file_path)

        return file_name, file_data
    except ValueError:
        # Propagate unchanged so the caller can distinguish "not found" from
        # other failures instead of receiving a response tuple to unpack.
        raise
    except Exception:
        logging.exception("Error fetching file")
        raise
49
+
50
def get_file_type(file_path):
    """Return the MIME type of the file at *file_path* as a string.

    Content sniffing with python-magic is tried first. If that fails for any
    reason, the type is guessed from the file extension, and
    ``"application/octet-stream"`` is used when no guess is available. The
    result is therefore always a string that callers can store or send as a
    content type.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The detected or guessed MIME type.
    """
    import mimetypes

    try:
        # Use python-magic to detect the MIME type of the file
        mime = magic.Magic(mime=True)
        return mime.from_file(file_path)
    except Exception:
        logging.exception("Error detecting file type for %s", file_path)
        guessed, _ = mimetypes.guess_type(str(file_path))
        return guessed or "application/octet-stream"
59
+
60
def insert_file_record(user_id, doc):
    """Insert a ``files`` table row describing the local document *doc*.

    The row's name is ``"letterhead-"`` plus the base name of *doc*, its size
    is read from disk, and its type comes from ``get_file_type``. The
    ``file_path`` column is inserted empty.

    Args:
        user_id: Value stored in the row's ``user_id`` column.
        doc: Path of the local document file.

    Returns:
        The response object returned by the insert query's ``execute()``.

    Raises:
        Exception: Any filesystem or client error, logged and re-raised so
            the caller never receives an error response in place of the
            insert result.
    """
    try:
        file_record = {
            "user_id": user_id,
            "description": "",
            "file_path": "",
            "name": "letterhead-" + os.path.basename(doc),
            "size": os.path.getsize(doc),
            "tokens": 0,
            "type": get_file_type(doc),
        }

        return supabase.table("files").insert(file_record).execute()
    except Exception:
        logging.exception("Error inserting file record")
        raise
79
+
80
def upload_file_to_storage(file, metadata):
    """Upload the local file at path *file* to the ``files`` storage bucket.

    The object is stored under ``"<user_id>/<file_id>"``, built from
    *metadata*, with its content type taken from ``get_file_type``.

    Args:
        file: Path of the local file to upload.
        metadata: Mapping that provides ``user_id`` and ``file_id``.

    Returns:
        The ``path`` attribute of the storage client's upload response.

    Raises:
        Exception: Any filesystem or storage error, logged and re-raised so
            an error response is never mistaken for a storage path.
    """
    storage_path = f"{metadata['user_id']}/{metadata['file_id']}"

    try:
        file_type = get_file_type(file)
        with open(file, 'rb') as f:
            response = supabase.storage.from_("files").upload(
                file=f,
                path=storage_path,
                file_options={"cache-control": "3600", "content-type": file_type, "upsert": "false"},
            )

        return response.path
    except Exception:
        logging.exception("Error uploading file")
        raise
99
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+
102
def update_file_record(file_id, updates):
    """Apply *updates* to the ``files`` table row whose ``id`` is *file_id*.

    Args:
        file_id: Value matched against the ``id`` column.
        updates: Mapping of column names to new values.

    Returns:
        The response object returned by the update query's ``execute()``.

    Raises:
        Exception: Any client error, logged and re-raised so a failed update
            is not silently treated as success by the caller.
    """
    try:
        return supabase.table("files").update(updates).eq("id", file_id).execute()
    except Exception:
        logging.exception("Error while updating record")
        raise
109
 
110
def insert_text_and_image_at_end(full_path, full_image_path, text_to_insert, include_signature, signature_position, letterhead_address):
    """Fill in a letterhead document and save it back to *full_path*.

    Replaces the ``<<SENDER_ADDRESS>>`` placeholder with *letterhead_address*
    (when one is given), appends *text_to_insert* as a new paragraph and,
    when requested, appends the signature image in its own paragraph.

    Args:
        full_path: Path of the .docx file; it is modified in place.
        full_image_path: Path of the signature image, or a falsy value.
        text_to_insert: Text appended as the last text paragraph.
        include_signature: Whether the signature image should be added.
        signature_position: ``'left'``, ``'right'`` or ``'center'``; any
            other value leaves the image paragraph's alignment untouched.
        letterhead_address: Replacement for the placeholder; when falsy the
            placeholder is left as is.

    Returns:
        *full_path*, after the document has been saved.

    Raises:
        Exception: Any document or filesystem error, logged and re-raised so
            an error response is never handed on as a document path.
    """
    placeholder = "<<SENDER_ADDRESS>>"

    try:
        doc = Document(full_path)

        # Replace placeholder <<SENDER_ADDRESS>> with the letterhead address
        if letterhead_address:
            for paragraph in doc.paragraphs:
                if placeholder not in paragraph.text:
                    continue

                runs = paragraph.runs
                # Only rewrite runs that actually hold the placeholder, so
                # the remaining runs are left exactly as they were.
                for run in runs:
                    if placeholder in run.text:
                        run.text = run.text.replace(placeholder, letterhead_address)

                # The placeholder may be split across several runs, in which
                # case no single run matched above. Merge the run texts into
                # the first run so the replacement still happens.
                joined = "".join(run.text for run in runs)
                if runs and placeholder in joined:
                    runs[0].text = joined.replace(placeholder, letterhead_address)
                    for run in runs[1:]:
                        run.text = ""

        # Add the new text at the end of the document
        doc.add_paragraph(text_to_insert)

        # Add the image at the end of the document with position adjustment
        if include_signature and full_image_path:
            image_paragraph = doc.add_paragraph()
            image_paragraph.add_run().add_picture(full_image_path, width=Inches(1), height=Inches(1))

            # Adjust the alignment based on signature_position
            alignments = {
                'left': WD_ALIGN_PARAGRAPH.LEFT,
                'right': WD_ALIGN_PARAGRAPH.RIGHT,
                'center': WD_ALIGN_PARAGRAPH.CENTER,
            }
            if signature_position in alignments:
                image_paragraph.alignment = alignments[signature_position]

        # Save the document with the inserted text and image
        doc.save(full_path)

        return full_path
    except Exception:
        logging.exception("Error while inserting text")
        raise
144
+
145
+
146
def fetch_image(bucket_name: str, image_path: str):
    """Download an image from Supabase storage into the ``letterhead`` folder.

    The file is written next to this module, in a ``letterhead``
    sub-directory that is created if needed. Its name is the last ``/``
    segment of *image_path* followed by ``.`` and the subtype of the MIME
    type detected from the downloaded bytes.

    Args:
        bucket_name: Storage bucket to download from.
        image_path: Object path inside the bucket.

    Returns:
        The absolute path of the file that was written.

    Raises:
        Exception: Any storage, detection or filesystem error, logged and
            re-raised so an error response is never used as an image path.
    """
    try:
        # Download file from Supabase storage
        file_data = supabase.storage.from_(bucket_name).download(image_path)

        # Use python-magic to detect MIME type from the file data
        mime_type = magic.Magic(mime=True).from_buffer(file_data)

        current_directory = os.path.dirname(os.path.abspath(__file__))
        letterhead_directory = os.path.join(current_directory, "letterhead")

        # Create the directory that is actually written to; a bare relative
        # 'letterhead' would resolve against the process working directory.
        os.makedirs(letterhead_directory, exist_ok=True)

        letterhead_image_name = image_path.split('/')[-1] + "." + mime_type.split('/')[-1]
        full_letterhead_image_path = os.path.join(letterhead_directory, letterhead_image_name)

        with open(full_letterhead_image_path, 'wb') as f:
            f.write(file_data)

        return full_letterhead_image_path
    except Exception:
        logging.exception("Error fetching image %s from bucket %s", image_path, bucket_name)
        raise
169
+
170
def delete_all_files(directory):
    """Delete every regular file directly inside *directory* except the marker.

    The file named ``WARNING-DO-NOT-DELETE.txt`` is always kept, and
    sub-directories are not touched. Cleanup is best effort: a failure to
    list the directory or to remove one file is logged, and the remaining
    files are still processed.

    Args:
        directory: Path of the directory to clean.

    Returns:
        None.
    """
    keep_file = "WARNING-DO-NOT-DELETE.txt"

    try:
        with os.scandir(directory) as entries:
            targets = [
                entry.path
                for entry in entries
                if entry.is_file() and entry.name != keep_file
            ]
    except OSError:
        logging.exception("Could not list directory %s", directory)
        return

    for file_path in targets:
        try:
            os.remove(file_path)
        except OSError:
            # Keep going: one undeletable file must not stop the cleanup.
            logging.exception("Could not delete %s", file_path)
183
+
184
@app.route("/api/letterhead", methods=["POST"])
def letterhead():
    """Build a letterhead document from a stored template and register it.

    Expects a JSON object with ``chatSettings`` (``letterheadFileId``,
    ``letterheadSignatureImagePath``), ``profile`` (``user_id``) and
    ``letterheadData`` (``letterheadContent``, ``includeSignature``,
    ``signaturePosition``, ``letterheadAddress``).

    The template is downloaded into the local ``letterhead`` directory,
    filled in, recorded in the ``files`` table, uploaded to storage, and the
    new row's ``file_path`` is updated with the uploaded path. The local
    directory is cleaned afterwards, whether or not the request succeeded.

    Returns:
        A JSON response: 200 with a ``message`` on success, 400 for a
        malformed request body, 404 when a ``ValueError`` is raised (for
        example, file not found), and 500 for any other failure.
    """
    data = request.get_json()
    # Log the data instead of saving it to a file
    logging.info("Received Data: %s", data)

    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object."}), 400

    # Extract data up front so a malformed body is reported as a client error.
    try:
        chat_settings = data["chatSettings"]
        profile = data["profile"]
        letterhead_data = data["letterheadData"]

        letterhead_file_id = chat_settings["letterheadFileId"]
        signature_image_path = chat_settings["letterheadSignatureImagePath"]
        user_id = profile["user_id"]
        text_to_insert = letterhead_data["letterheadContent"]
        include_signature = letterhead_data["includeSignature"]
        signature_position = letterhead_data["signaturePosition"]
        letterhead_address = letterhead_data["letterheadAddress"]
    except (KeyError, TypeError) as e:
        return jsonify({"error": f"Missing or invalid request field: {e}"}), 400

    current_directory = os.path.dirname(os.path.abspath(__file__))
    letterhead_directory = os.path.join(current_directory, "letterhead")

    try:
        # The working directory must exist before the template is written.
        os.makedirs(letterhead_directory, exist_ok=True)

        file_name, file_data = get_file_by_id(letterhead_file_id)
        # basename() keeps a stored name containing path separators from
        # escaping the working directory.
        full_letterhead_file_path = os.path.join(letterhead_directory, os.path.basename(file_name))

        full_letterhead_signature_path = None
        if include_signature and signature_image_path:
            full_letterhead_signature_path = fetch_image("assistant_images", signature_image_path)

        with open(full_letterhead_file_path, "wb") as f:
            if hasattr(file_data, "read"):
                f.write(file_data.read())
            else:  # If it's raw bytes
                f.write(file_data)

        modified_doc = insert_text_and_image_at_end(
            full_letterhead_file_path,
            full_letterhead_signature_path,
            text_to_insert,
            include_signature,
            signature_position,
            letterhead_address,
        )

        created_file = insert_file_record(user_id, modified_doc)
        # Read the inserted row straight from the response data instead of
        # serialising the response to JSON and parsing it back.
        created_row = created_file.data[0]
        logging.debug("Created file row: %s", created_row)

        file_path = upload_file_to_storage(modified_doc, {
            "name": created_row["name"],
            "user_id": created_row["user_id"],
            "file_id": created_row["id"],
        })
        logging.debug("Uploaded file path: %s", file_path)

        update_file_record(created_row["id"], {"file_path": file_path})

        message = f"letterheadFileId:{created_row['id']} Your letterhead is successfully created."
        return jsonify({"message": message}), 200
    except ValueError as e:
        return jsonify({"error": str(e)}), 404
    except Exception as e:
        logging.exception("Letterhead generation failed")
        return jsonify({"error": str(e)}), 500
    finally:
        # NOTE(review): this wipes the whole shared directory, so files of a
        # request running at the same time would be removed too -- confirm
        # whether per-request temporary directories are needed.
        delete_all_files(letterhead_directory)
250
+
251
if __name__ == '__main__':
    # app.run() blocks until the server stops, so the start-up message has to
    # be printed before it to be of any use.
    print('working')
    # NOTE(review): debug=True enables the interactive debugger and the
    # reloader; confirm this entry point is only used for local development.
    app.run(debug=True)