import streamlit as st from PIL import Image import os import easyocr import numpy as np import fitz # PyMuPDF import io from pdf2image import convert_from_bytes #from st_btn_group import st_btn_group #from streamlit_option_menu import option_menu import docx from docx.shared import Pt from io import BytesIO #import streamlit.components.v1 as components import base64 #def downloadTxt(): def generateTxtLink(result): result_txt = "" print(result) for para in result: result_txt += para[1]+"\n" result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8') result_txt_link = "TXT" return result_txt_link def generateMultiPageTxtLink(result): result_txt = "" print(result) for para in result: result_txt += para+"\n" result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8') result_txt_link = "TXT" return result_txt_link def generateDocLink(result): doc = docx.Document() for para in result: doc.add_paragraph(para[1]) target_stream = BytesIO() result_doc = doc.save(target_stream) base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8') stlyeCss = "" doc_link = "DOCX" return doc_link def generateMultiPageDocLink(pages_result): doc = docx.Document() #print(pages_result) for page in pages_result: page_split = page.split("\n") for para in page_split: doc.add_paragraph(para) doc.add_page_break() target_stream = BytesIO() result_doc = doc.save(target_stream) base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8') doc_link = "DOCX" return doc_link def generateButtonGroup(result): txtLink = generateTxtLink(result) docLink = generateDocLink(result) return txtLink+"\n"+docLink def generateButtonGroupForPDF(pages_result): #result = "\n\n".join(pages_result) txtLink = generateMultiPageTxtLink(pages_result) docLink = generateMultiPageDocLink(pages_result) return txtLink+"\n"+docLink def local_css(file_name): with open(file_name) as f: st.markdown(f'', unsafe_allow_html=True) models_dir = "./models" output_dir = "./output" dirs = [models_dir, output_dir] for d in dirs: if not os.path.exists(output_dir): os.makedirs(output_dir) font_path = models_dir + "/Ubuntu-Regular.ttf" reader = easyocr.Reader( ['en'], gpu=True, recog_network='best_norm_ED', detect_network="craft", user_network_directory=models_dir, model_storage_directory=models_dir, ) # this needs to run only once to load the model into memory # main title st.set_page_config(layout="wide",page_title="Қазақша OCR, суреттегі текстті тану") local_css("app.css") #st.markdown("DOCX жүктеп ал",unsafe_allow_html=True) st.title("Сурет немесе пдф файлдан текст алу") # subtitle #st.markdown("## Qazaq OCR") uploaded_file = st.file_uploader("Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",help="aaa", type=['png', 'jpeg', 'jpg', 'pdf']) col1, col2 = st.columns(2) #def process_page(page): # image_matrix = fitz.Matrix(fitz.Identity) # pixmap = page.get_pixmap(matrix=image_matrix, dpi=300) # image_data = pixmap.samples# This is a bytes object # image = Image.from("RGB",(pixmap.width, pixmap.height),image_data) # image = Image.from("RGB", (pixmap.width, pixmap.height), image_data) # result = reader.readtext(np.array(image),paragraph=True) # return image, result import time max_page = 5 def recognize_page_image(image): start = time.time() result = [[0,"Sample 1"],[1,"Sample 2"]] result = reader.readtext(np.array(image), paragraph=False) result = get_paragraph(result) end = time.time() return result,(end-start) def process_pdf(uploaded_file): pdf_document = fitz.open(temp_pdf_file) total_pages = len(pdf_document) progress_bar = col2.progress(0, text="Жүктеліп жатыр") button_group = col2.container() # clear the container button_group.empty() pages = range(min(max_page,total_pages)) tabs = col1.tabs([f"Бет {page+1}" for page in pages]) pages_result = [] for count, page_num in enumerate(range(min(total_pages,max_page))): page = pdf_document.load_page(page_num) image_matrix = fitz.Matrix(fitz.Identity) pixmap = page.get_pixmap(matrix=image_matrix, dpi=300) image_data = pixmap.samples # This is a bytes object image = Image.frombytes("RGB", (pixmap.width, pixmap.height), image_data) imageSmaller = image.resize((int(pixmap.width/10), int(pixmap.height/10))) tabs[count].image(imageSmaller) #buffered = BytesIO() #imageSmaller.save(buffered,format="JPEG") #col1.write(f'

Бет {page_num + 1}/{total_pages}

',unsafe_allow_html=True) #col1.write(f'',unsafe_allow_html=True) #col1.subheader(f'Бет {page_num + 1}/{total_pages}') #col1.image(imageSmaller, caption=f'Бет {page_num + 1}') result,time_elapsed = recognize_page_image(image) expander = col2.expander(f'{result[0][1][:100]} ... **:orange[{time_elapsed:.3f} секундта таңылды]**') expander.write(f'{result[0][1]}') result_text = "\n\n".join([item[1] for item in result]) pages_result.append(result_text) #col2.markdown(result_text) progress_bar.progress((count + 1) / min(total_pages,max_page),text=f'Жүктеліп жатыр {count+1}/{min(total_pages,max_page)}') button_group_html = generateButtonGroupForPDF(pages_result) button_group.write(button_group_html,unsafe_allow_html=True) #col1.write("",unsafe_allow_html=True) progress_bar.progress(0.99,text=f'{min(total_pages,max_page)} бет жүктелді') def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode = 'ltr'): # create basic attributes box_group = [] for box in raw_result: all_x = [int(coord[0]) for coord in box[0]] all_y = [int(coord[1]) for coord in box[0]] min_x = min(all_x) max_x = max(all_x) min_y = min(all_y) max_y = max(all_y) height = max_y - min_y box_group.append([box[1], min_x, max_x, min_y, max_y, height, 0.5*(min_y+max_y), 0]) # last element indicates group # cluster boxes into paragraph current_group = 1 while len([box for box in box_group if box[7]==0]) > 0: box_group0 = [box for box in box_group if box[7]==0] # group0 = non-group # new group if len([box for box in box_group if box[7]==current_group]) == 0: box_group0[0][7] = current_group # assign first box to form new group # try to add group else: current_box_group = [box for box in box_group if box[7]==current_group] mean_height = np.mean([box[5] for box in current_box_group]) min_gx = min([box[1] for box in current_box_group]) - x_ths*mean_height max_gx = max([box[2] for box in current_box_group]) + x_ths*mean_height min_gy = min([box[3] for box in current_box_group]) - y_ths*mean_height max_gy = max([box[4] for box in current_box_group]) + y_ths*mean_height add_box = False for box in box_group0: same_horizontal_level = (min_gx<=box[1]<=max_gx) or (min_gx<=box[2]<=max_gx) same_vertical_level = (min_gy<=box[3]<=max_gy) or (min_gy<=box[4]<=max_gy) if same_horizontal_level and same_vertical_level: box[7] = current_group add_box = True break # cannot add more box, go to next group if add_box==False: current_group += 1 # arrage order in paragraph result = [] for i in set(box[7] for box in box_group): current_box_group = [box for box in box_group if box[7]==i] mean_height = np.mean([box[5] for box in current_box_group]) min_gx = min([box[1] for box in current_box_group]) max_gx = max([box[2] for box in current_box_group]) min_gy = min([box[3] for box in current_box_group]) max_gy = max([box[4] for box in current_box_group]) text = '' while len(current_box_group) > 0: highest = min([box[6] for box in current_box_group]) candidates = [box for box in current_box_group if box[6]