Spaces:

zmbfeng
/

locked_pdf_ingestion

Sleeping

App Files Files Community

zmbfeng committed on May 25, 2024

Commit

c6269e3

1 Parent(s): 8dfad76

segments divided by line now with green outlines

Browse files

Files changed (1) hide show

utils.py +78 -1

utils.py CHANGED Viewed

@@ -208,6 +208,55 @@ def extract_bounding_boxes_from_image_np(image_np, bounding_boxes_list, above_ch
     # else:
     #   print("box="+str(box)+"filled")
   return rect_content_list,above_rect_content_list,  figures_image_list, tables_image_list, image_np_copy
 def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
     bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
     bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
@@ -240,4 +289,32 @@ def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
             print(table[0])
             # st.write(table[0])#to_be_displayed
             # st.image(Image.fromarray(table[1]))#to_be_displayed
-        st.image(Image.fromarray(cropped_image))

     # else:
     #   print("box="+str(box)+"filled")
   return rect_content_list,above_rect_content_list,  figures_image_list, tables_image_list, image_np_copy
+def find_hor_lines_in_image_np(min_width, min_height,image_np):
+    """Detect line segments in an image after a wide morphological closing.
+
+    Args:
+        min_width: Width in pixels of the rectangular closing kernel.
+        min_height: Height in pixels of the rectangular closing kernel.
+        image_np: Image array handed to OpenCV (the visible caller passes a
+            cropped grayscale page image).
+
+    Returns:
+        The result of ``cv2.HoughLinesP``. The caller checks it against
+        ``None`` before use, and consumers unpack each entry as
+        ``x1, y1, x2, y2 = line[0]``.
+    """
+    # The kernel size now comes from the arguments; it used to be hard-coded
+    # to (1050, 5), which silently ignored whatever the caller passed in.
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (min_width, min_height))
+    morphed = cv2.morphologyEx(image_np, cv2.MORPH_CLOSE, kernel)
+    # Edge map of the closed image feeds the probabilistic Hough transform.
+    edges = cv2.Canny(morphed, 50, 150, apertureSize=3)
+    # Hough thresholds are tuned by hand; adjust them to the document at hand.
+    return cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100, minLineLength=100, maxLineGap=10)
+def draw_colored_lines_on_image_np(image, lines,color_tuple):
+    """Draw every line of ``lines`` onto ``image`` in place.
+
+    Args:
+        image: Image array that ``cv2.line`` draws into (modified in place).
+        lines: Iterable whose entries unpack as ``x1, y1, x2, y2 = line[0]``,
+            or ``None`` when no lines were detected.
+        color_tuple: Colour passed straight through to ``cv2.line``.
+
+    Returns:
+        None. The drawing happens on ``image`` itself.
+    """
+    # The line detector's result is compared against None by its caller, so
+    # treat None as "nothing to draw" instead of failing on iteration.
+    if lines is None:
+        return
+    for line in lines:
+        x1, y1, x2, y2 = line[0]
+        cv2.line(image, (x1, y1), (x2, y2), color_tuple, 3)
+def segment_image_np(image_np,hor_lines_list):
+    """Split an image into horizontal bands separated by the given lines.
+
+    Args:
+        image_np: Array with at least two dimensions; rows are sliced.
+        hor_lines_list: Iterable whose entries unpack as
+            ``x1, y1, x2, y2 = line[0]``, or ``None`` for "no lines".
+
+    Returns:
+        A list of row slices of ``image_np``: one slice ending at each line,
+        in top-to-bottom order, plus a final slice from the last line to the
+        bottom of the image. Slices may be empty (for example when two lines
+        share a row); the list always has ``len(lines) + 1`` entries.
+    """
+    if hor_lines_list is None:
+        # Nothing to cut on: the whole image is the only segment.
+        return [image_np]
+    # Reduce each line to its (top, bottom) row span so a slightly sloped
+    # line (y1 != y2) is cut at its upper end and resumed at its lower end,
+    # whichever endpoint happens to come first.
+    spans = sorted(
+        (min(int(line[0][1]), int(line[0][3])), max(int(line[0][1]), int(line[0][3])))
+        for line in hor_lines_list
+    )
+    segments = []
+    cursor = 0
+    for top, bottom in spans:
+        segments.append(image_np[cursor:top, :])
+        # Never move the cursor upwards, otherwise consecutive segments
+        # would contain the same rows twice.
+        cursor = max(cursor, bottom)
+    # Remaining rows below the last line.
+    segments.append(image_np[cursor:, :])
+    return segments
+def filter_segments_by_min_height(segments, min_height):
+    """Keep only the segments that are strictly taller than ``min_height``.
+
+    Args:
+        segments: Iterable of arrays; the height is read from ``shape[0]``.
+        min_height: Exclusive lower bound on the number of rows.
+
+    Returns:
+        A new list with the qualifying segments in their original order.
+    """
+    tall_enough = []
+    for candidate in segments:
+        row_count = candidate.shape[0]
+        # A segment exactly min_height rows tall is rejected.
+        if row_count > min_height:
+            tall_enough.append(candidate)
+    return tall_enough
+def draw_edges(np_image):
+    """Draw a green rectangular outline along the border of ``np_image``.
+
+    The image is modified in place through ``cv2.rectangle``; nothing is
+    returned.
+
+    Args:
+        np_image: Image array; its first two dimensions are read as
+            ``(height, width)``.
+    """
+    color = (0, 255, 0)  # Green
+    # Thickness of the outline stroke, in pixels.
+    thickness = 5
+    height, width = np_image.shape[:2]
+    # The stroke is centred on the rectangle's coordinates, so inset both
+    # corners by half the thickness. The last valid pixel index is
+    # width - 1 / height - 1, hence the extra -1 on the far corner; without
+    # it the right and bottom strokes extend one pixel past the image and
+    # come out thinner than the left and top ones.
+    inset = thickness // 2
+    top_left = (inset, inset)
+    bottom_right = (width - 1 - inset, height - 1 - inset)
+    cv2.rectangle(np_image, top_left, bottom_right, color, thickness)
 def gray_pdf_image_np_to_text(image_index,gray_pdf_image_np, debug=False):
     bounding_boxes_list = extract_rectangle_from_image(gray_pdf_image_np, 500, 20)
     bounding_boxes_list = remove_close_boxes (bounding_boxes_list, 10)
             print(table[0])
             # st.write(table[0])#to_be_displayed
             # st.image(Image.fromarray(table[1]))#to_be_displayed
+        st.image(Image.fromarray(cropped_image))#to_be_displayed
+    found_hor_lines_list = find_hor_lines_in_image_np(1050, 5, cropped_image)
+    if found_hor_lines_list is not None:
+        bgr_image = cv2.cvtColor(gray_pdf_image_np, cv2.COLOR_GRAY2BGR)
+        draw_colored_lines_on_image_np(bgr_image, found_hor_lines_list, (0, 255, 0))
+        print("detected Lines start")
+        st.image(Image.fromarray(bgr_image)) #to_be_displayed
+        print("detected lines end")
+        page_segment_np_list = segment_image_np(cropped_image, found_hor_lines_list)
+        if debug:
+            debug_page_segment_index = 0
+            for element in page_segment_np_list:
+                print("element start")
+                bgr_image = cv2.cvtColor(element, cv2.COLOR_GRAY2BGR)
+                draw_edges(bgr_image)
+                st.image(Image.fromarray(bgr_image))#to_be_displayed
+                debug_page_segment_index = debug_page_segment_index + 1
+                print("element end")
+        min_height_filtered_page_segment_np_list = filter_segments_by_min_height(page_segment_np_list, 50)
+        max_height_image = max(min_height_filtered_page_segment_np_list, key=lambda image: image.shape[0])
+        if debug:
+            print("max height image start")
+            st.image(Image.fromarray(max_height_image))#to_be_displayed
+            print("max height image end")
+    else:
+        max_height_image = cropped_image.copy()
+    st.write("selected segment")