msmhmorsi committed on
Commit 68f98f8 · 1 Parent(s): 756da27

change to v1

.env ADDED
@@ -0,0 +1 @@
+ AZURE_FORM_RECOGNIZER_KEY=8PyYQxSy5oOghAYincAL95bIdJ6ppPaZHiOydPgyW8V66mOPJEz7JQQJ99ALAC3pKaRXJ3w3AAALACOGVy59
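
pdf_route.py (added below) reads this key at runtime through python-dotenv before creating its Azure client. A minimal sketch of that lookup, assuming the .env file sits in the working directory:

import os
from dotenv import load_dotenv

# Pull AZURE_FORM_RECOGNIZER_KEY from .env into the process environment,
# the same way pdf_route.py does before building DocumentAnalysisClient.
load_dotenv()
key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
if key is None:
    raise RuntimeError("AZURE_FORM_RECOGNIZER_KEY is not set")
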
__pycache__/image_enhance.cpython-310.pyc ADDED
Binary file (3.79 kB).
__pycache__/image_route.cpython-310.pyc ADDED
Binary file (3.79 kB).
__pycache__/pdf_route.cpython-310.pyc ADDED
Binary file (11.5 kB).
__pycache__/pdf_to_md.cpython-310.pyc ADDED
Binary file (5.6 kB).

app.py CHANGED
@@ -1,13 +1,9 @@
- import cv2
- import fitz
- import numpy as np
- from io import BytesIO
- import matplotlib.pyplot as plt
- from skimage.color import rgb2gray
- from skimage.measure import label, regionprops
- from fastapi.responses import StreamingResponse
- from fastapi.middleware.cors import CORSMiddleware
  from fastapi import FastAPI, UploadFile, File, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # Import routers
+ from image_route import router as image_enhance_router
+ from pdf_route import router as pdf_to_md_router

  app = FastAPI(
      title="PDF Processing API",
@@ -24,133 +20,9 @@ app.add_middleware(
      allow_headers=["*"], # Allows all headers
  )

-
- def convert_and_process_pdf(pdf_content: bytes, area_threshold: int = 100) -> BytesIO:
-     """
-     Convert the first page of a PDF to a PNG and apply image enhancement.
-     Args:
-         pdf_content: The PDF file content as bytes.
-         area_threshold: Threshold for area filtering (default: 100).
-     Returns:
-         BytesIO: Enhanced PNG image content.
-     """
-     # Open the PDF from bytes
-     doc = fitz.open(stream=pdf_content, filetype="pdf")
-
-     # Load the first page
-     page = doc.load_page(0)
-
-     # Render the page as an image
-     pix = page.get_pixmap(dpi=300)
-     png_image = pix.tobytes("png")
-
-     # Load the image with OpenCV
-     np_array = np.frombuffer(png_image, dtype=np.uint8)
-     img = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
-
-     # Convert to grayscale
-     img_gray = rgb2gray(img)
-
-     # Convert grayscale to binary using Otsu's threshold
-     _, img_binary = cv2.threshold((img_gray * 255).astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-
-     # Invert the binary image
-     img_binary = ~img_binary
-
-     # Label connected components
-     label_img = label(img_binary)
-     regions = regionprops(label_img)
-
-     # Filter by area threshold
-     valid_labels = [region.label for region in regions if region.area >= area_threshold]
-     img_filtered = np.isin(label_img, valid_labels)
-
-     # Save enhanced image to memory
-     output_buffer = BytesIO()
-     plt.imsave(output_buffer, ~img_filtered, cmap="gray", format="png")
-     output_buffer.seek(0)
-     return output_buffer
-
- @app.post("/process-pdf/")
- async def process_pdf(
-     file: UploadFile = File(...),
-     area_threshold: int = 100
- ):
-     """
-     Process a PDF file and return an enhanced PNG image.
-     Args:
-         file: The PDF file to process
-         area_threshold: Threshold for area filtering (default: 100)
-     Returns:
-         StreamingResponse: Enhanced PNG image
-     """
-     try:
-         # Read PDF file content
-         pdf_content = await file.read()
-
-         # Process the PDF and get the enhanced image
-         enhanced_image = convert_and_process_pdf(pdf_content, area_threshold)
-
-         # Return the processed image as a StreamingResponse
-         return StreamingResponse(
-             enhanced_image,
-             media_type="image/png",
-             headers={"Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}_enhanced.png"}
-         )
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
-
- @app.post("/process-image/")
- async def process_image(
-     file: UploadFile = File(...),
-     area_threshold: int = 100
- ):
-     """
-     Process an image file and return an enhanced image.
-     Args:
-         file: The image file to process
-         area_threshold: Threshold for area filtering (default: 100)
-     Returns:
-         StreamingResponse: Enhanced image
-     """
-     try:
-         # Read image file content
-         image_content = await file.read()
-
-         # Convert to numpy array
-         np_array = np.frombuffer(image_content, dtype=np.uint8)
-         img = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
-
-         # Convert to grayscale
-         img_gray = rgb2gray(img)
-
-         # Convert grayscale to binary using Otsu's threshold
-         _, img_binary = cv2.threshold((img_gray * 255).astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
-
-         # Invert the binary image
-         img_binary = ~img_binary
-
-         # Label connected components
-         label_img = label(img_binary)
-         regions = regionprops(label_img)
-
-         # Filter by area threshold
-         valid_labels = [region.label for region in regions if region.area >= area_threshold]
-         img_filtered = np.isin(label_img, valid_labels)
-
-         # Save enhanced image to memory
-         output_buffer = BytesIO()
-         plt.imsave(output_buffer, ~img_filtered, cmap="gray", format="png")
-         output_buffer.seek(0)
-
-         # Return the processed image as a StreamingResponse
-         return StreamingResponse(
-             output_buffer,
-             media_type="image/png",
-             headers={"Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}_enhanced.png"}
-         )
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
+ # Include routers
+ app.include_router(image_enhance_router, prefix="/image", tags=["image"])
+ app.include_router(pdf_to_md_router, prefix="/pdf", tags=["pdf"])

  if __name__ == "__main__":
      import uvicorn
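
With this change app.py is composition only: the processing endpoints now live on the two routers and are mounted under the /image and /pdf prefixes. A quick smoke-test sketch of the resulting paths (sample.pdf is a placeholder file name; the /pdf call also needs the Azure key from .env and network access to the Form Recognizer endpoint):

from fastapi.testclient import TestClient
from app import app

client = TestClient(app)

# Image-enhancement router, mounted under /image
with open("sample.pdf", "rb") as f:
    resp = client.post(
        "/image/process-pdf/",
        files={"file": ("sample.pdf", f, "application/pdf")},
        params={"area_threshold": 100},
    )
assert resp.status_code == 200
assert resp.headers["content-type"] == "image/png"

# PDF-to-markdown router, mounted under /pdf (calls the live Azure service)
with open("sample.pdf", "rb") as f:
    resp = client.post(
        "/pdf/convert-to-markdown",
        files={"file": ("sample.pdf", f, "application/pdf")},
    )
assert resp.status_code == 200
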
image_route.py ADDED
@@ -0,0 +1,138 @@
1
+ import cv2
2
+ import fitz
3
+ import numpy as np
4
+ from io import BytesIO
5
+ import matplotlib.pyplot as plt
6
+ from skimage.color import rgb2gray
7
+ from skimage.measure import label, regionprops
8
+ from fastapi import APIRouter, UploadFile, File, HTTPException
9
+ from fastapi.responses import StreamingResponse
10
+
11
+ router = APIRouter()
12
+
13
+ def convert_and_process_pdf(pdf_content: bytes, area_threshold: int = 100) -> BytesIO:
14
+ """
15
+ Convert the first page of a PDF to a PNG and apply image enhancement.
16
+ Args:
17
+ pdf_content: The PDF file content as bytes.
18
+ area_threshold: Threshold for area filtering (default: 100).
19
+ Returns:
20
+ BytesIO: Enhanced PNG image content.
21
+ """
22
+ # Open the PDF from bytes
23
+ doc = fitz.open(stream=pdf_content, filetype="pdf")
24
+
25
+ # Load the first page
26
+ page = doc.load_page(0)
27
+
28
+ # Render the page as an image
29
+ pix = page.get_pixmap(dpi=300)
30
+ png_image = pix.tobytes("png")
31
+
32
+ # Load the image with OpenCV
33
+ np_array = np.frombuffer(png_image, dtype=np.uint8)
34
+ img = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
35
+
36
+ # Convert to grayscale
37
+ img_gray = rgb2gray(img)
38
+
39
+ # Convert grayscale to binary using Otsu's threshold
40
+ _, img_binary = cv2.threshold((img_gray * 255).astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
41
+
42
+ # Invert the binary image
43
+ img_binary = ~img_binary
44
+
45
+ # Label connected components
46
+ label_img = label(img_binary)
47
+ regions = regionprops(label_img)
48
+
49
+ # Filter by area threshold
50
+ valid_labels = [region.label for region in regions if region.area >= area_threshold]
51
+ img_filtered = np.isin(label_img, valid_labels)
52
+
53
+ # Save enhanced image to memory
54
+ output_buffer = BytesIO()
55
+ plt.imsave(output_buffer, ~img_filtered, cmap="gray", format="png")
56
+ output_buffer.seek(0)
57
+ return output_buffer
58
+
59
+ @router.post("/process-pdf/")
60
+ async def process_pdf(
61
+ file: UploadFile = File(...),
62
+ area_threshold: int = 100
63
+ ):
64
+ """
65
+ Process a PDF file and return an enhanced PNG image.
66
+ Args:
67
+ file: The PDF file to process
68
+ area_threshold: Threshold for area filtering (default: 100)
69
+ Returns:
70
+ StreamingResponse: Enhanced PNG image
71
+ """
72
+ try:
73
+ # Read PDF file content
74
+ pdf_content = await file.read()
75
+
76
+ # Process the PDF and get the enhanced image
77
+ enhanced_image = convert_and_process_pdf(pdf_content, area_threshold)
78
+
79
+ # Return the processed image as a StreamingResponse
80
+ return StreamingResponse(
81
+ enhanced_image,
82
+ media_type="image/png",
83
+ headers={"Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}_enhanced.png"}
84
+ )
85
+ except Exception as e:
86
+ raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
87
+
88
+ @router.post("/process-image/")
89
+ async def process_image(
90
+ file: UploadFile = File(...),
91
+ area_threshold: int = 100
92
+ ):
93
+ """
94
+ Process an image file and return an enhanced image.
95
+ Args:
96
+ file: The image file to process
97
+ area_threshold: Threshold for area filtering (default: 100)
98
+ Returns:
99
+ StreamingResponse: Enhanced image
100
+ """
101
+ try:
102
+ # Read image file content
103
+ image_content = await file.read()
104
+
105
+ # Convert to numpy array
106
+ np_array = np.frombuffer(image_content, dtype=np.uint8)
107
+ img = cv2.imdecode(np_array, cv2.IMREAD_COLOR)
108
+
109
+ # Convert to grayscale
110
+ img_gray = rgb2gray(img)
111
+
112
+ # Convert grayscale to binary using Otsu's threshold
113
+ _, img_binary = cv2.threshold((img_gray * 255).astype(np.uint8), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
114
+
115
+ # Invert the binary image
116
+ img_binary = ~img_binary
117
+
118
+ # Label connected components
119
+ label_img = label(img_binary)
120
+ regions = regionprops(label_img)
121
+
122
+ # Filter by area threshold
123
+ valid_labels = [region.label for region in regions if region.area >= area_threshold]
124
+ img_filtered = np.isin(label_img, valid_labels)
125
+
126
+ # Save enhanced image to memory
127
+ output_buffer = BytesIO()
128
+ plt.imsave(output_buffer, ~img_filtered, cmap="gray", format="png")
129
+ output_buffer.seek(0)
130
+
131
+ # Return the processed image as a StreamingResponse
132
+ return StreamingResponse(
133
+ output_buffer,
134
+ media_type="image/png",
135
+ headers={"Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}_enhanced.png"}
136
+ )
137
+ except Exception as e:
138
+ raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
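
Both endpoints above share one enhancement pipeline: Otsu binarization, inversion, connected-component labeling, and removal of components smaller than area_threshold. A self-contained sketch of that filtering step on a synthetic binary image (sizes chosen only for illustration):

import numpy as np
from skimage.measure import label, regionprops

# Synthetic binary image: one large blob (400 px) and one speck (9 px)
img_binary = np.zeros((50, 50), dtype=bool)
img_binary[5:25, 5:25] = True
img_binary[40:43, 40:43] = True

area_threshold = 100
label_img = label(img_binary)                 # label connected components
regions = regionprops(label_img)              # measure each component
valid_labels = [r.label for r in regions if r.area >= area_threshold]
img_filtered = np.isin(label_img, valid_labels)

print(int(img_filtered.sum()))  # 400 -> only the large component survives
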
pdf_route.py ADDED
@@ -0,0 +1,425 @@
1
+ import os
2
+ from io import BytesIO
3
+ import pandas as pd
4
+ from fastapi import APIRouter, UploadFile, File, HTTPException
5
+ from fastapi.responses import StreamingResponse, JSONResponse
6
+ from azure.core.credentials import AzureKeyCredential
7
+ from azure.ai.formrecognizer import DocumentAnalysisClient
8
+ from dotenv import load_dotenv
9
+ from docx import Document
10
+ import re
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ router = APIRouter()
16
+
17
+ @router.post("/convert-to-markdown")
18
+ async def convert_to_markdown(file: UploadFile = File(...)):
19
+ """
20
+ Convert a PDF file to markdown format.
21
+ Args:
22
+ file: The PDF file to convert
23
+ Returns:
24
+ StreamingResponse: Markdown file
25
+ """
26
+ try:
27
+ # Read the uploaded file content
28
+ content = await file.read()
29
+
30
+ # Save the content to a temporary file
31
+ temp_pdf_path = "temp.pdf"
32
+ with open(temp_pdf_path, "wb") as f:
33
+ f.write(content)
34
+
35
+ # Analyze the document
36
+ result = analyze_document(temp_pdf_path)
37
+
38
+ # Create markdown file
39
+ temp_md_path = "temp.md"
40
+ create_markdown_file(result, temp_md_path)
41
+
42
+ # Read the markdown file
43
+ with open(temp_md_path, "rb") as f:
44
+ markdown_content = f.read()
45
+
46
+ # Clean up temporary files
47
+ os.remove(temp_pdf_path)
48
+ os.remove(temp_md_path)
49
+
50
+ # Return the markdown file as a download
51
+ return StreamingResponse(
52
+ BytesIO(markdown_content),
53
+ media_type="text/markdown",
54
+ headers={
55
+ "Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}.md"
56
+ }
57
+ )
58
+
59
+ except Exception as e:
60
+ raise HTTPException(status_code=500, detail=str(e))
61
+
62
+ @router.post("/convert-to-excel")
63
+ async def convert_to_excel(file: UploadFile = File(...)):
64
+ """
65
+ Convert tables from markdown to Excel format.
66
+ Args:
67
+ file: The markdown file to convert
68
+ Returns:
69
+ StreamingResponse: Excel file containing all tables
70
+ """
71
+ try:
72
+ # Read the markdown content
73
+ content = await file.read()
74
+ markdown_text = content.decode('utf-8')
75
+
76
+ # Extract tables from markdown
77
+ tables = extract_tables_from_markdown(markdown_text)
78
+
79
+ if not tables:
80
+ raise HTTPException(status_code=400, detail="No tables found in the markdown content")
81
+
82
+ # Create Excel file
83
+ excel_buffer = create_excel_from_markdown_tables(tables)
84
+
85
+ # Return the Excel file as a download
86
+ return StreamingResponse(
87
+ excel_buffer,
88
+ media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
89
+ headers={
90
+ "Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}_tables.xlsx"
91
+ }
92
+ )
93
+
94
+ except Exception as e:
95
+ raise HTTPException(status_code=500, detail=str(e))
96
+
97
+ @router.post("/convert-to-word")
98
+ async def convert_to_word(file: UploadFile = File(...)):
99
+ """
100
+ Convert markdown to Word document format.
101
+ Args:
102
+ file: The markdown file to convert
103
+ Returns:
104
+ StreamingResponse: Word document file
105
+ """
106
+ try:
107
+ # Read the markdown content
108
+ content = await file.read()
109
+ markdown_text = content.decode('utf-8')
110
+
111
+ # Create Word file
112
+ temp_docx_path = "temp.docx"
113
+ create_word_from_markdown(markdown_text, temp_docx_path)
114
+
115
+ # Read the Word file
116
+ with open(temp_docx_path, "rb") as f:
117
+ word_content = f.read()
118
+
119
+ # Clean up temporary file
120
+ os.remove(temp_docx_path)
121
+
122
+ # Return the Word file as a download
123
+ return StreamingResponse(
124
+ BytesIO(word_content),
125
+ media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
126
+ headers={
127
+ "Content-Disposition": f"attachment; filename={file.filename.rsplit('.', 1)[0]}.docx"
128
+ }
129
+ )
130
+
131
+ except Exception as e:
132
+ raise HTTPException(status_code=500, detail=str(e))
133
+
134
+ def analyze_document(file_path):
135
+ """Analyze document using Azure Form Recognizer"""
136
+ endpoint = "https://aal-ocr-ai-azureapi.cognitiveservices.azure.com/"
137
+ key = os.getenv("AZURE_FORM_RECOGNIZER_KEY")
138
+
139
+ document_analysis_client = DocumentAnalysisClient(
140
+ endpoint=endpoint, credential=AzureKeyCredential(key)
141
+ )
142
+
143
+ with open(file_path, "rb") as f:
144
+ poller = document_analysis_client.begin_analyze_document(
145
+ "prebuilt-layout", document=f
146
+ )
147
+
148
+ result = poller.result()
149
+ return result
150
+
151
+ def extract_tables_from_markdown(markdown_text):
152
+ """Extract tables from markdown text"""
153
+ tables = []
154
+ current_table = []
155
+
156
+ lines = markdown_text.split('\n')
157
+ in_table = False
158
+
159
+ for line in lines:
160
+ if '|' in line:
161
+ # Skip separator lines (e.g., |---|---|)
162
+ if re.match(r'^[\s|:-]+$', line):
163
+ continue
164
+
165
+ # Process table row
166
+ cells = [cell.strip() for cell in line.split('|')[1:-1]]
167
+ if cells:
168
+ if not in_table:
169
+ in_table = True
170
+ current_table.append(cells)
171
+ else:
172
+ if in_table:
173
+ if current_table:
174
+ tables.append(current_table)
175
+ current_table = []
176
+ in_table = False
177
+
178
+ # Add the last table if exists
179
+ if current_table:
180
+ tables.append(current_table)
181
+
182
+ return tables
183
+
184
+ def create_excel_from_markdown_tables(tables):
185
+ """Create Excel file from markdown tables"""
186
+ excel_buffer = BytesIO()
187
+
188
+ with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
189
+ for i, table in enumerate(tables):
190
+ if table:
191
+ # Convert table to DataFrame
192
+ df = pd.DataFrame(table[1:], columns=table[0])
193
+
194
+ # Save to Excel sheet
195
+ sheet_name = f"Table_{i+1}"
196
+ df.to_excel(writer, sheet_name=sheet_name, index=False)
197
+
198
+ excel_buffer.seek(0)
199
+ return excel_buffer
200
+
201
+ def create_word_from_markdown(markdown_text, output_file):
202
+ """Create Word document from markdown text"""
203
+ doc = Document()
204
+
205
+ lines = markdown_text.split('\n')
206
+ current_table = []
207
+ in_table = False
208
+
209
+ for line in lines:
210
+ # Handle headers
211
+ if line.startswith('#'):
212
+ level = len(line.split()[0]) # Count the number of '#'
213
+ text = line.lstrip('#').strip()
214
+ doc.add_heading(text, level=min(level, 9))
215
+
216
+ # Handle tables
217
+ elif '|' in line:
218
+ # Skip separator lines
219
+ if re.match(r'^[\s|:-]+$', line):
220
+ continue
221
+
222
+ # Process table row
223
+ cells = [cell.strip() for cell in line.split('|')[1:-1]]
224
+ if cells:
225
+ if not in_table:
226
+ in_table = True
227
+ current_table = []
228
+ current_table.append(cells)
229
+
230
+ # Handle end of table
231
+ elif in_table:
232
+ if current_table:
233
+ table = doc.add_table(rows=len(current_table), cols=len(current_table[0]))
234
+ table.style = 'Table Grid'
235
+
236
+ for i, row in enumerate(current_table):
237
+ for j, cell in enumerate(row):
238
+ table.cell(i, j).text = cell
239
+
240
+ doc.add_paragraph() # Add space after table
241
+ current_table = []
242
+ in_table = False
243
+
244
+ # Handle checkbox lists
245
+ elif line.strip().startswith('- ['):
246
+ p = doc.add_paragraph()
247
+ run = p.add_run()
248
+ if 'x' in line or 'X' in line:
249
+ run.add_text("☑ " + line[5:].strip())
250
+ else:
251
+ run.add_text("☐ " + line[5:].strip())
252
+
253
+ # Handle regular paragraphs
254
+ elif line.strip():
255
+ doc.add_paragraph(line.strip())
256
+
257
+ # Handle the last table if exists
258
+ if in_table and current_table:
259
+ table = doc.add_table(rows=len(current_table), cols=len(current_table[0]))
260
+ table.style = 'Table Grid'
261
+
262
+ for i, row in enumerate(current_table):
263
+ for j, cell in enumerate(row):
264
+ table.cell(i, j).text = cell
265
+
266
+ doc.save(output_file)
267
+
268
+ def create_markdown_file(result, output_file):
269
+ """Create markdown file from analysis result"""
270
+ with open(output_file, 'w', encoding='utf-8') as md_file:
271
+ for page in result.pages:
272
+ # md_file.write(f"### Page {page.page_number}\n\n")
273
+
274
+ elements = []
275
+ elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.05, 'paragraph', paragraph)
276
+ for paragraph in result.paragraphs if paragraph.bounding_regions[0].page_number == page.page_number])
277
+ elements.sort(key=lambda x: x[0])
278
+
279
+ page_width = page.width / 2
280
+ min_distance = float('inf')
281
+ title_paragraph = None
282
+
283
+ for element in elements[:5]:
284
+ if element[1] == 'paragraph':
285
+ paragraph = element[2]
286
+ midpoint_x = (paragraph.bounding_regions[0].polygon[0].x + paragraph.bounding_regions[0].polygon[1].x) / 2
287
+ midpoint_y = paragraph.bounding_regions[0].polygon[0].y
288
+ distance = ((midpoint_x - page_width) ** 2 + midpoint_y ** 2) ** 0.5
289
+ if distance < min_distance:
290
+ min_distance = distance
291
+ title_paragraph = paragraph
292
+
293
+ if title_paragraph:
294
+ elements = [element for element in elements if element[2] != title_paragraph]
295
+ md_file.write(f"# {title_paragraph.content}\n\n")
296
+
297
+ elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.05, 'table', table)
298
+ for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
299
+ elements.extend([(mark.polygon[0].y + mark.polygon[0].x*0.05, 'selection_mark', mark) for mark in page.selection_marks])
300
+
301
+ elements.sort(key=lambda x: x[0])
302
+
303
+ table_cells = set()
304
+ for _, element_type, element in elements:
305
+ if element_type == 'paragraph':
306
+ if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
307
+ continue
308
+ md_file.write(f"{element.content}\n\n")
309
+
310
+ elif element_type == 'table':
311
+ for row_idx in range(element.row_count):
312
+ row_content = "| "
313
+ for col_idx in range(element.column_count):
314
+ cell_content = ""
315
+ for cell in element.cells:
316
+ if cell.row_index == row_idx and cell.column_index == col_idx:
317
+ cell_content = cell.content
318
+ table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
319
+ break
320
+ row_content += f"{cell_content} | "
321
+ md_file.write(row_content + "\n")
322
+ md_file.write("\n")
323
+
324
+ elif element_type == 'selection_mark':
325
+ if element.state == "selected":
326
+ md_file.write("- [x] \n\n")
327
+ else:
328
+ md_file.write("- [ ] \n\n")
329
+
330
+ def create_word_file(result, output_file):
331
+ """Create Word document from analysis result"""
332
+ # Create a new Word document
333
+ doc = Document()
334
+
335
+ # Analyze pages
336
+ for page in result.pages:
337
+ # Combine paragraphs, tables, and selection marks in the order they appear on the page
338
+ elements = []
339
+ elements.extend([(paragraph.bounding_regions[0].polygon[0].y + paragraph.bounding_regions[0].polygon[0].x*0.01, 'paragraph', paragraph)
340
+ for paragraph in result.paragraphs if paragraph.bounding_regions[0].page_number == page.page_number])
341
+ elements.sort(key=lambda x: x[0])
342
+
343
+ # Find the paragraph which is possible to be document title
344
+ page_width = page.width / 2
345
+ min_distance = float('inf')
346
+ title_paragraph = None
347
+
348
+ for element in elements[:5]:
349
+ if element[1] == 'paragraph':
350
+ paragraph = element[2]
351
+ midpoint_x = (paragraph.bounding_regions[0].polygon[0].x + paragraph.bounding_regions[0].polygon[1].x) / 2
352
+ midpoint_y = paragraph.bounding_regions[0].polygon[0].y
353
+ distance = ((midpoint_x - page_width) ** 2 + midpoint_y ** 2) ** 0.5
354
+ if distance < min_distance:
355
+ min_distance = distance
356
+ title_paragraph = paragraph
357
+
358
+ if title_paragraph:
359
+ elements = [element for element in elements if element[2] != title_paragraph]
360
+ doc.add_heading(title_paragraph.content, level=1)
361
+
362
+ # Continuous combine paragraphs, tables, and selection marks in the order they appear on the page
363
+ elements.extend([(table.bounding_regions[0].polygon[0].y + table.bounding_regions[0].polygon[0].x*0.01, 'table', table)
364
+ for table in result.tables if table.bounding_regions[0].page_number == page.page_number])
365
+ elements.extend([(mark.polygon[0].y + mark.polygon[0].x*0.01, 'selection_mark', mark)
366
+ for mark in page.selection_marks])
367
+
368
+ # Sort elements by the sum of their horizontal and vertical positions on the page
369
+ elements.sort(key=lambda x: x[0])
370
+
371
+ # Track table cells to avoid duplicating content
372
+ table_cells = set()
373
+ for _, element_type, element in elements:
374
+ if element_type == 'paragraph':
375
+ # Skip lines that are part of a table
376
+ if any(is_element_inside_table(element, get_table_max_polygon(table)) for table in result.tables):
377
+ continue
378
+ doc.add_paragraph(element.content)
379
+ elif element_type == 'table':
380
+ table = doc.add_table(rows=element.row_count, cols=element.column_count)
381
+ table.style = 'Table Grid'
382
+ for row_idx in range(element.row_count):
383
+ row_cells = table.rows[row_idx].cells
384
+ for col_idx in range(element.column_count):
385
+ cell_content = ""
386
+ for cell in element.cells:
387
+ if cell.row_index == row_idx and cell.column_index == col_idx:
388
+ cell_content = cell.content
389
+ table_cells.add((cell.bounding_regions[0].polygon[0].x, cell.bounding_regions[0].polygon[0].y))
390
+ break
391
+ row_cells[col_idx].text = cell_content
392
+ elif element_type == 'selection_mark':
393
+ p = doc.add_paragraph()
394
+ run = p.add_run()
395
+ if element.state == "selected":
396
+ run.add_text("☑ ")
397
+ else:
398
+ run.add_text("☐ ")
399
+
400
+ # Save Word document
401
+ doc.save(output_file)
402
+
403
+ def format_polygon(polygon):
404
+ """Format polygon coordinates to string"""
405
+ if not polygon:
406
+ return "N/A"
407
+ return ", ".join([f"[{p.x}, {p.y}]" for p in polygon])
408
+
409
+ def get_table_max_polygon(table):
410
+ """Get the maximum polygon coordinates for a table"""
411
+ first_cell = table.cells[0]
412
+ first_coordinate = first_cell.bounding_regions[0].polygon[0]
413
+ last_cell = table.cells[-1]
414
+ last_coordinate = last_cell.bounding_regions[0].polygon[-1]
415
+ return [first_coordinate, last_coordinate]
416
+
417
+ def is_element_inside_table(element, table_max_polygon):
418
+ """Check if an element is inside a table"""
419
+ element_x = element.bounding_regions[0].polygon[0].x
420
+ element_y = element.bounding_regions[0].polygon[0].y
421
+ first_coordinate = table_max_polygon[0]
422
+ last_coordinate = table_max_polygon[1]
423
+
424
+ return (first_coordinate.x <= element_x <= last_coordinate.x and
425
+ first_coordinate.y <= element_y <= last_coordinate.y)
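
The three endpoints above are designed to chain: the markdown produced by /convert-to-markdown can be posted back to /convert-to-excel or /convert-to-word. An illustrative client-side sketch of that flow (base URL and file names are placeholders; the paths assume the /pdf prefix from app.py):

import requests

BASE = "http://localhost:8000"

# Step 1: PDF -> markdown via Azure Form Recognizer layout analysis
with open("scan.pdf", "rb") as f:
    md_resp = requests.post(
        f"{BASE}/pdf/convert-to-markdown",
        files={"file": ("scan.pdf", f, "application/pdf")},
    )
md_resp.raise_for_status()
markdown_bytes = md_resp.content  # contents of scan.md

# Step 2: markdown tables -> one Excel workbook, one sheet per table
# (the endpoint errors if the markdown contains no tables)
xlsx_resp = requests.post(
    f"{BASE}/pdf/convert-to-excel",
    files={"file": ("scan.md", markdown_bytes, "text/markdown")},
)
xlsx_resp.raise_for_status()
with open("scan_tables.xlsx", "wb") as out:
    out.write(xlsx_resp.content)
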
requirements.txt CHANGED
@@ -6,3 +6,6 @@ opencv-python==4.8.1.78
  numpy==1.26.2
  scikit-image==0.22.0
  matplotlib==3.8.2
+ azure-ai-formrecognizer==3.3.0
+ python-dotenv==1.0.0
+ python-docx==1.1.0