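# NOTE(review): the next three lines are residue from the hosting web page,
# not program code; kept as comments so the file parses.
# BMukhtar's picture
# new changes
# e3a55fa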
import streamlit as st
from PIL import Image
import os
import easyocr
import numpy as np
import fitz # PyMuPDF
import io
from pdf2image import convert_from_bytes
#from st_btn_group import st_btn_group
#from streamlit_option_menu import option_menu
import docx
from docx.shared import Pt
from io import BytesIO
#import streamlit.components.v1 as components
import base64
#def downloadTxt():
def generateTxtLink(result):
    """Build an HTML download link carrying the OCR text as a .txt file.

    Args:
        result: Iterable of OCR entries; the text of each entry is read
            from index 1 (``entry[1]``).

    Returns:
        An ``<a class='button'>`` element whose ``href`` is a base64
        ``data:text/plain`` URI. Every entry's text is followed by a
        newline; the suggested file name is ``document.txt``.
    """
    # One join instead of repeated "+=" keeps this linear in the text size.
    result_txt = "".join(para[1] + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    return (
        "<a class='button' href='data:text/plain;base64,"
        + result_b64
        + "' download='document.txt'>TXT</a>"
    )
def generateMultiPageTxtLink(result):
    """Build an HTML download link carrying multi-page text as a .txt file.

    Args:
        result: Iterable of strings, one per page.

    Returns:
        An ``<a class='button'>`` element whose ``href`` is a base64
        ``data:text/plain`` URI. Every page string is followed by a
        newline; the suggested file name is ``document.txt``.
    """
    # One join instead of repeated "+=" keeps this linear in the text size.
    result_txt = "".join(para + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    return (
        "<a class='button' href='data:text/plain;base64,"
        + result_b64
        + "' download='document.txt'>TXT</a>"
    )
def generateDocLink(result):
    """Build an HTML download link carrying the OCR text as a .docx file.

    Args:
        result: Iterable of OCR entries; the text of each entry is read
            from index 1 and becomes one paragraph of the document.

    Returns:
        An ``<a class='button'>`` element whose ``href`` is a base64 data
        URI holding the generated document; the suggested file name is
        ``document.docx``.
    """
    doc = docx.Document()
    for para in result:
        doc.add_paragraph(para[1])
    # Serialize into memory; the document is never written to disk.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')
    # The payload is a Word document, so the URI must carry the OOXML
    # word-processing media type rather than application/pdf.
    return (
        "<a class='button' href='data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateMultiPageDocLink(pages_result):
    """Build an HTML download link carrying multi-page text as a .docx file.

    Args:
        pages_result: Iterable of strings, one per page. Each string is
            split on ``"\\n"`` and every piece (including empty ones)
            becomes its own paragraph.

    Returns:
        An ``<a class='button'>`` element whose ``href`` is a base64 data
        URI holding the generated document; the suggested file name is
        ``document.docx``.
    """
    doc = docx.Document()
    for index, page in enumerate(pages_result):
        # Break *between* pages only, so the document does not end with
        # an empty trailing page.
        if index:
            doc.add_page_break()
        for para in page.split("\n"):
            doc.add_paragraph(para)
    # Serialize into memory; the document is never written to disk.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')
    # The payload is a Word document, so the URI must carry the OOXML
    # word-processing media type rather than application/pdf.
    return (
        "<a class='button' href='data:application/vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateButtonGroup(result):
    """Return the TXT and DOCX download links for a single-image result.

    The two HTML anchors are produced by generateTxtLink and
    generateDocLink (in that order) and separated by one newline.
    """
    links = (generateTxtLink(result), generateDocLink(result))
    return "\n".join(links)
def generateButtonGroupForPDF(pages_result):
    """Return the TXT and DOCX download links for a multi-page result.

    The two HTML anchors are produced by generateMultiPageTxtLink and
    generateMultiPageDocLink (in that order) and separated by one newline.
    """
    links = (
        generateMultiPageTxtLink(pages_result),
        generateMultiPageDocLink(pages_result),
    )
    return "\n".join(links)
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    Args:
        file_name: Path of the stylesheet to read.

    The file content is wrapped in a ``<style>`` tag and emitted through
    ``st.markdown`` with ``unsafe_allow_html=True``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    # NOTE(review): assumes the stylesheet is UTF-8 encoded - confirm.
    with open(file_name, encoding="utf-8") as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
models_dir = "./models"
output_dir = "./output"
dirs = [models_dir, output_dir]
# Create each working directory. The loop variable is what must be
# checked and created; exist_ok makes the call safe on every rerun.
for d in dirs:
    os.makedirs(d, exist_ok=True)
font_path = models_dir + "/Ubuntu-Regular.ttf"

# main title
# Page configuration is issued before any other Streamlit call.
st.set_page_config(layout="wide",page_title="Қазақша OCR, суреттегі текстті тану")


@st.cache_resource(show_spinner=False)
def _load_reader():
    """Create the EasyOCR reader once and reuse it across script reruns.

    Streamlit re-executes this script on every interaction; caching the
    reader keeps the model loaded in memory instead of rebuilding it.
    NOTE(review): the cached reader is shared between sessions - confirm
    concurrent ``readtext`` calls are acceptable for this deployment.
    """
    return easyocr.Reader(
        ['en'],
        gpu=True,
        recog_network='best_norm_ED',
        detect_network="craft",
        user_network_directory=models_dir,
        model_storage_directory=models_dir,
    )


reader = _load_reader()
local_css("app.css")
#st.markdown("<a class='button' href='lenta.ru'>DOCX жүктеп ал</a>",unsafe_allow_html=True)
st.title("Сурет немесе пдф файлдан текст алу")
# subtitle
#st.markdown("## Qazaq OCR")
# NOTE(review): help="aaa" looks like placeholder tooltip text.
uploaded_file = st.file_uploader("Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",help="aaa", type=['png', 'jpeg', 'jpg', 'pdf'])
# Left column shows the input preview, right column the recognized text.
col1, col2 = st.columns(2)
#def process_page(page):
# image_matrix = fitz.Matrix(fitz.Identity)
# pixmap = page.get_pixmap(matrix=image_matrix, dpi=300)
# image_data = pixmap.samples# This is a bytes object
# image = Image.from("RGB",(pixmap.width, pixmap.height),image_data)
# image = Image.from("RGB", (pixmap.width, pixmap.height), image_data)
# result = reader.readtext(np.array(image),paragraph=True)
# return image, result
import time
# Upper bound on the number of PDF pages that process_pdf renders and
# recognizes for one upload; pages beyond this count are not processed.
max_page = 5
def recognize_page_image(image):
    """Run OCR on one page image and group the detections into paragraphs.

    Args:
        image: Page image; it is converted with ``np.array`` before being
            handed to the module-level EasyOCR ``reader``.

    Returns:
        A ``(result, seconds)`` tuple: ``result`` is the list returned by
        ``get_paragraph`` for the raw detections, ``seconds`` is the time
        spent on recognition plus grouping.
    """
    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments.
    start = time.perf_counter()
    raw_result = reader.readtext(np.array(image), paragraph=False)
    result = get_paragraph(raw_result)
    elapsed = time.perf_counter() - start
    return result, elapsed
def process_pdf(uploaded_file):
    """Render, recognize and display the first pages of an uploaded PDF.

    Args:
        uploaded_file: The uploaded PDF; its complete content is obtained
            with ``getvalue()``.

    At most ``max_page`` pages are handled. For every page a reduced
    preview is shown in its own tab of ``col1``; the recognized text is
    shown in an expander in ``col2``. A progress bar and the TXT/DOCX
    download links (covering all pages recognized so far) are kept up to
    date in ``col2``.
    """
    # getvalue() returns the whole upload regardless of how far the
    # stream has already been read by the caller.
    pdf_bytes = uploaded_file.getvalue()
    # The context manager closes the document even if a page fails.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        total_pages = len(pdf_document)
        page_count = min(max_page, total_pages)
        progress_bar = col2.progress(0, text="Жүктеліп жатыр")
        button_group = col2.container()
        # A single placeholder inside the container: each update replaces
        # the previous links instead of stacking a new pair under them.
        links_placeholder = button_group.empty()
        tabs = col1.tabs([f"Бет {page+1}" for page in range(page_count)])
        pages_result = []
        for count in range(page_count):
            page = pdf_document.load_page(count)
            image_matrix = fitz.Matrix(fitz.Identity)
            pixmap = page.get_pixmap(matrix=image_matrix, dpi=300)
            # pixmap.samples is the raw pixel buffer (bytes).
            image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
            # One tenth of the render size is enough for the preview tab;
            # never let a dimension collapse to zero.
            preview_size = (max(1, pixmap.width // 10), max(1, pixmap.height // 10))
            tabs[count].image(image.resize(preview_size))
            result, time_elapsed = recognize_page_image(image)
            result_text = "\n\n".join(item[1] for item in result)
            # A page without any recognized text yields an empty result.
            preview = result[0][1][:100] if result else ""
            expander = col2.expander(f'{preview} ... **:orange[{time_elapsed:.3f} секундта таңылды]**')
            # Show the full page text, as the single-image path does.
            expander.write(result_text)
            pages_result.append(result_text)
            progress_bar.progress((count + 1) / page_count, text=f'Жүктеліп жатыр {count+1}/{page_count}')
            button_group_html = generateButtonGroupForPDF(pages_result)
            links_placeholder.markdown(button_group_html, unsafe_allow_html=True)
        progress_bar.progress(0.99, text=f'{page_count} бет жүктелді')
def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode='ltr'):
    """Cluster OCR detections into paragraphs and order the text in each.

    Args:
        raw_result: Iterable of detections. ``item[0]`` is a sequence of
            ``(x, y)`` corner points and ``item[1]`` is the recognized
            text; further elements are ignored.
        x_ths: Horizontal tolerance, as a multiple of the group's mean box
            height, by which a group's extent is widened when testing
            whether another box belongs to it.
        y_ths: Vertical tolerance, same unit as ``x_ths``.
        mode: ``'ltr'`` picks the left-most box of a line first, ``'rtl'``
            the right-most.

    Returns:
        A list of ``[bbox, text]`` entries, one per paragraph, in order of
        group creation. ``bbox`` is the four corners of the paragraph's
        bounding rectangle (top-left, top-right, bottom-right,
        bottom-left); ``text`` is the member texts joined by one space.

    Raises:
        ValueError: If ``mode`` is neither ``'ltr'`` nor ``'rtl'``.
    """
    if mode not in ('ltr', 'rtl'):
        raise ValueError(f"mode must be 'ltr' or 'rtl', got {mode!r}")

    # Per-box record:
    # [text, min_x, max_x, min_y, max_y, height, center_y, group]
    # group 0 means "not assigned to any paragraph yet".
    box_group = []
    for box in raw_result:
        all_x = [int(coord[0]) for coord in box[0]]
        all_y = [int(coord[1]) for coord in box[0]]
        min_x, max_x = min(all_x), max(all_x)
        min_y, max_y = min(all_y), max(all_y)
        box_group.append(
            [box[1], min_x, max_x, min_y, max_y, max_y - min_y, 0.5 * (min_y + max_y), 0]
        )

    # Cluster boxes into paragraphs, growing one group at a time.
    current_group = 1
    while True:
        ungrouped = [box for box in box_group if box[7] == 0]
        if not ungrouped:
            break
        members = [box for box in box_group if box[7] == current_group]
        if not members:
            # Seed a new group with the first box that is still free.
            ungrouped[0][7] = current_group
            continue
        # Widen the group's extent by the thresholds, then adopt the first
        # free box that has a corner coordinate inside it on both axes.
        mean_height = np.mean([box[5] for box in members])
        min_gx = min(box[1] for box in members) - x_ths * mean_height
        max_gx = max(box[2] for box in members) + x_ths * mean_height
        min_gy = min(box[3] for box in members) - y_ths * mean_height
        max_gy = max(box[4] for box in members) + y_ths * mean_height
        for box in ungrouped:
            same_horizontal_level = (min_gx <= box[1] <= max_gx) or (min_gx <= box[2] <= max_gx)
            same_vertical_level = (min_gy <= box[3] <= max_gy) or (min_gy <= box[4] <= max_gy)
            if same_horizontal_level and same_vertical_level:
                box[7] = current_group
                break
        else:
            # Nothing more fits this group: move on to the next one.
            current_group += 1

    # Arrange the reading order inside every paragraph.
    result = []
    # sorted() makes the paragraph order explicit instead of relying on
    # set iteration order.
    for group_id in sorted({box[7] for box in box_group}):
        members = [box for box in box_group if box[7] == group_id]
        mean_height = np.mean([box[5] for box in members])
        min_gx = min(box[1] for box in members)
        max_gx = max(box[2] for box in members)
        min_gy = min(box[3] for box in members)
        max_gy = max(box[4] for box in members)
        words = []
        while members:
            # Boxes whose centre lies close to the top-most remaining
            # centre are treated as the current text line.
            highest = min(box[6] for box in members)
            candidates = [box for box in members if box[6] < highest + 0.4 * mean_height]
            if not candidates:
                # Zero-height boxes make the strict "<" exclude everything;
                # fall back to the boxes sitting exactly on the top line.
                candidates = [box for box in members if box[6] == highest]
            if mode == 'ltr':
                most_left = min(box[1] for box in candidates)
                for box in candidates:
                    if box[1] == most_left:
                        best_box = box  # on ties the last candidate wins
            else:
                most_right = max(box[2] for box in candidates)
                for box in candidates:
                    if box[2] == most_right:
                        best_box = box  # on ties the last candidate wins
            words.append(best_box[0])
            members.remove(best_box)
        bbox = [[min_gx, min_gy], [max_gx, min_gy], [max_gx, max_gy], [min_gx, max_gy]]
        result.append([bbox, ' '.join(words)])
    return result
import contextlib
import tempfile

# Entry point of the page: dispatch the upload to the PDF or image path.
if uploaded_file is not None:
    if uploaded_file.type == "application/pdf":
        placeholder = col2.empty()
        with placeholder, st.spinner('PDF өңделуде ...'):
            # A unique temporary file per run: a fixed path would be
            # overwritten when two sessions upload at the same time.
            # temp_pdf_file stays a module-level name so code that opens
            # the PDF by path keeps working.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                # getvalue() does not depend on the current read position.
                f.write(uploaded_file.getvalue())
                temp_pdf_file = f.name
            try:
                process_pdf(uploaded_file)
            finally:
                # Best-effort cleanup; removal can fail while the file is
                # still held open on some platforms.
                with contextlib.suppress(OSError):
                    os.remove(temp_pdf_file)
    else:
        placeholder = col2.empty()
        with placeholder,st.spinner('Сурет өңделуде ...'):
            image = Image.open(uploaded_file)
            #with open(os.path.join("tempDir",image_file))
            col1.image(image)
            # Normalize palette / CMYK / alpha images to plain RGB pixels
            # before OCR.
            result = reader.readtext(np.array(image.convert("RGB")), paragraph=True)
            result_text = "\n\n".join([item[1] for item in result])
            button_group_html = generateButtonGroup(result)
            col2.write(button_group_html, unsafe_allow_html=True)
            col2.markdown(result_text)