Spaces:
Sleeping
Sleeping
import streamlit as st | |
from PIL import Image | |
import os | |
import easyocr | |
import numpy as np | |
import fitz # PyMuPDF | |
import io | |
from pdf2image import convert_from_bytes | |
#from st_btn_group import st_btn_group | |
#from streamlit_option_menu import option_menu | |
import docx | |
from io import BytesIO | |
#import streamlit.components.v1 as components | |
import base64 | |
# Separator placed between recognised paragraphs when they are joined into one text.
line_separator = "\n\n"
#def downloadTxt(): | |
def generateTxtLink(result):
    """Build an HTML download link offering the recognised text as a TXT file.

    Args:
        result: Iterable of OCR paragraphs; the text of each paragraph is
            taken from index 1 of the item.

    Returns:
        A string holding an ``<a>`` element whose ``href`` is a base64
        ``data:`` URI with every paragraph followed by a newline, offered for
        download as ``document.txt``.
    """
    # One paragraph per line; every paragraph, including the last, ends in "\n".
    result_txt = "".join(para[1] + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    return (
        "<a class='button' href='data:text/plain;base64,"
        + result_b64
        + "' download='document.txt'>TXT</a>"
    )
def generateMultiPageTxtLink(result):
    """Build an HTML download link offering multi-page text as a TXT file.

    Args:
        result: Iterable of strings, one per page.

    Returns:
        A string holding an ``<a>`` element whose ``href`` is a base64
        ``data:`` URI with every page text followed by a newline, offered for
        download as ``document.txt``.
    """
    # Each page text is terminated by a single "\n", the last one included.
    result_txt = "".join(para + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    return (
        "<a class='button' href='data:text/plain;base64,"
        + result_b64
        + "' download='document.txt'>TXT</a>"
    )
def generateDocLink(result):
    """Build an HTML download link offering the recognised text as a DOCX file.

    Args:
        result: Iterable of OCR paragraphs; index 1 of each item is added to
            the document as its own paragraph.

    Returns:
        A string holding an ``<a>`` element whose ``href`` is a base64
        ``data:`` URI with the generated document, offered for download as
        ``document.docx``.
    """
    doc = docx.Document()
    for para in result:
        doc.add_paragraph(para[1])

    # The document is serialised into memory only; nothing is written to disk.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')

    # The payload is a Word document, so the data URI must carry the .docx
    # media type rather than the PDF one.
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    return (
        "<a class='button' href='data:" + docx_mime + ";base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateMultiPageDocLink(pages_result):
    """Build an HTML download link offering multi-page text as a DOCX file.

    Args:
        pages_result: Iterable of strings, one per page. Each page is split
            on ``"\\n"`` and every piece becomes its own paragraph, so blank
            lines in the page text become empty paragraphs.

    Returns:
        A string holding an ``<a>`` element whose ``href`` is a base64
        ``data:`` URI with the generated document, offered for download as
        ``document.docx``.
    """
    doc = docx.Document()
    for page_index, page in enumerate(pages_result):
        # Break only *between* pages, so the document does not end with an
        # empty trailing page.
        if page_index:
            doc.add_page_break()
        for para in page.split("\n"):
            doc.add_paragraph(para)

    # The document is serialised into memory only; nothing is written to disk.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')

    # The payload is a Word document, so the data URI must carry the .docx
    # media type rather than the PDF one.
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    return (
        "<a class='button' href='data:" + docx_mime + ";base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateButtonGroup(result):
    """Return the TXT link and the DOCX link for *result*, separated by a newline."""
    # The TXT link is built first, then the DOCX link.
    links = [generateTxtLink(result), generateDocLink(result)]
    return "\n".join(links)
def generateButtonGroupForPDF(pages_result):
    """Return the multi-page TXT and DOCX links for *pages_result*, separated by a newline."""
    # The TXT link is built first, then the DOCX link.
    links = [
        generateMultiPageTxtLink(pages_result),
        generateMultiPageDocLink(pages_result),
    ]
    return "\n".join(links)
def local_css(file_name):
    """Read the stylesheet at *file_name* and emit it to the page inside a <style> tag."""
    with open(file_name) as css_file:
        css_rules = css_file.read()
        # unsafe_allow_html is required for the raw <style> element to be rendered.
        st.markdown("<style>" + css_rules + "</style>", unsafe_allow_html=True)
# Working directories: model files and generated output.
models_dir = "./models"
output_dir = "./output"
dirs = [models_dir, output_dir]
for d in dirs:
    # Create every listed directory; one that already exists is left untouched.
    os.makedirs(d, exist_ok=True)
font_path = models_dir + "/Ubuntu-Regular.ttf"
# NOTE(review): Streamlit re-executes the script on reruns, so this constructor
# presumably runs again each time; consider caching the reader (for example
# with st.cache_resource) if the model should really be loaded once — confirm.
reader = easyocr.Reader(
    ['en'],
    gpu=True,
    recog_network='best_norm_ED',
    detect_network="craft",
    user_network_directory=models_dir,
    model_storage_directory=models_dir,
)  # this needs to run only once to load the model into memory
# Page-level configuration and stylesheet.
st.set_page_config(
    page_title="Қазақша OCR, суреттегі текстті тану",
    layout="wide",
)
local_css("app.css")

# Main title.
st.title("Сурет немесе пдф файлдан текст алу")

# Upload widget restricted to the supported image formats and PDF.
uploaded_file = st.file_uploader(
    "Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",
    type=['png', 'jpeg', 'jpg', 'pdf'],
    help="aaa",
)

# Two side-by-side columns: col1 shows the input preview, col2 the results.
col1, col2 = st.columns(2)

import time

# Upper bound on the number of PDF pages that are processed.
max_page = 5
def recognize_page_image(image):
    """Run OCR on one image and group the detections into paragraphs.

    Args:
        image: The image to recognise; it is converted with ``np.array``
            before being passed to the OCR reader.

    Returns:
        A ``(paragraphs, seconds)`` tuple: the value returned by
        ``get_paragraph`` for the detections, and the time in seconds that
        recognition plus grouping took.
    """
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    start = time.perf_counter()
    detections = reader.readtext(
        np.array(image),
        batch_size=64,
        paragraph=False,
        y_ths=0,
        width_ths=0,
    )
    paragraphs = get_paragraph(detections)
    return paragraphs, time.perf_counter() - start
def process_pdf(uploaded_file):
    """OCR the first ``max_page`` pages of a PDF and render the results.

    For every processed page a reduced preview is shown in its own tab in
    ``col1``, and an expander with the first recognised paragraph and the
    recognition time is added to ``col2``. A progress bar in ``col2`` is
    advanced after each page, and the TXT/DOCX download links are written once
    all pages are done.

    Args:
        uploaded_file: The uploaded PDF. It is not read here: the document is
            opened from the path held in the module-level ``temp_pdf_file``,
            which must have been written before this function is called.
    """
    pdf_document = fitz.open(temp_pdf_file)
    try:
        total_pages = len(pdf_document)
        # Number of pages actually processed (capped by max_page).
        page_count = min(max_page, total_pages)

        progress_bar = col2.progress(0, text="Жүктеліп жатыр")
        button_group = col2.container()
        # NOTE(review): the original intent was "clear the container"; on a
        # container, .empty() presumably inserts a placeholder instead — confirm.
        button_group.empty()

        tabs = col1.tabs([f"Бет {page+1}" for page in range(page_count)])
        pages_result = []
        for page_num in range(page_count):
            page = pdf_document.load_page(page_num)
            image_matrix = fitz.Matrix(fitz.Identity)
            pixmap = page.get_pixmap(matrix=image_matrix, dpi=300)
            # pixmap.samples is the raw pixel buffer (a bytes object).
            image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)

            # Preview at one tenth of the rendered size, never below 1 pixel
            # so that resize() cannot be asked for an empty image.
            preview_size = (max(1, pixmap.width // 10), max(1, pixmap.height // 10))
            tabs[page_num].image(image.resize(preview_size))

            result, time_elapsed = recognize_page_image(image)

            # A page without any recognised text yields an empty result; fall
            # back to an empty preview instead of failing on result[0].
            first_paragraph = result[0][1] if result else ""
            expander = col2.expander(f'{first_paragraph[:100]} ... **:orange[{time_elapsed:.3f} секундта таңылды]**')
            # NOTE(review): only the first paragraph is shown here, while the
            # download links contain every paragraph — confirm this is intended.
            expander.write(f'{first_paragraph}')

            result_text = line_separator.join([item[1] for item in result])
            pages_result.append(result_text)
            progress_bar.progress((page_num + 1) / page_count, text=f'Жүктеліп жатыр {page_num+1}/{page_count}')

        button_group_html = generateButtonGroupForPDF(pages_result)
        button_group.write(button_group_html, unsafe_allow_html=True)
        progress_bar.progress(0.99, text=f'{page_count} бет жүктелді')
    finally:
        # Release the document handle even when a page fails to process.
        pdf_document.close()
class TextBox:
    """A single OCR detection: its text and the axis-aligned extent of its corners."""

    def __init__(self, text, coordinates):
        # Only the extremes of the corner points are used, so the order in
        # which the corners are listed does not matter here.
        xs = list(map(int, (corner[0] for corner in coordinates)))
        ys = list(map(int, (corner[1] for corner in coordinates)))

        self.text = text
        self.min_x, self.max_x = min(xs), max(xs)
        self.min_y, self.max_y = min(ys), max(ys)
        self.height = self.max_y - self.min_y
        self.center_y = 0.5 * (self.min_y + self.max_y)
        # 0 means "not assigned to any paragraph group yet".
        self.group_id = 0

    def __repr__(self):
        return "TextBox(text={}, group_id={})".format(self.text, self.group_id)
def get_paragraph(ocr_results, horizontal_threshold=1, vertical_threshold=0.0, reading_mode='ltr'):
    """Cluster OCR detections into paragraphs and order the text inside each.

    Args:
        ocr_results: Iterable of detections; index 0 of each item holds the
            corner coordinates and index 1 the recognised text.
        horizontal_threshold: Horizontal tolerance for joining a box to a
            group, as a multiple of the group's mean box height.
        vertical_threshold: Vertical tolerance for joining a box to a group,
            as a multiple of the group's mean box height.
        reading_mode: ``'ltr'`` to read each line from its left-most box,
            ``'rtl'`` to read it from its right-most box.

    Returns:
        A list with one ``[bounding_box, text]`` entry per paragraph, ordered
        by group number. ``bounding_box`` is the list of four corner points of
        the group's extent and ``text`` is the space-joined, stripped text of
        its boxes.

    Raises:
        ValueError: If ``reading_mode`` is neither ``'ltr'`` nor ``'rtl'``.
    """
    if reading_mode not in ('ltr', 'rtl'):
        raise ValueError(f"reading_mode must be 'ltr' or 'rtl', got {reading_mode!r}")

    # Convert raw OCR results into TextBox objects.
    text_boxes = [TextBox(box[1], box[0]) for box in ocr_results]

    # --- Phase 1: assign every box to a group (group_id 0 means ungrouped) ---
    current_group_id = 1
    while True:
        ungrouped_boxes = [box for box in text_boxes if box.group_id == 0]
        if not ungrouped_boxes:
            break

        current_group_boxes = [box for box in text_boxes if box.group_id == current_group_id]
        if not current_group_boxes:
            # The current group is still empty: seed it with the first ungrouped box.
            ungrouped_boxes[0].group_id = current_group_id
            continue

        # Try to grow the current group. The mean height is computed once per
        # pass and scales both tolerances.
        average_height = np.mean([box.height for box in current_group_boxes])
        added_to_group = False
        for group_box in current_group_boxes:
            min_group_x = group_box.min_x - horizontal_threshold * average_height
            max_group_x = group_box.max_x + horizontal_threshold * average_height
            min_group_y = group_box.min_y - vertical_threshold * average_height
            max_group_y = group_box.max_y + vertical_threshold * average_height
            for ungrouped_box in ungrouped_boxes:
                horizontally_aligned = (
                    min_group_x <= ungrouped_box.min_x <= max_group_x
                    or min_group_x <= ungrouped_box.max_x <= max_group_x
                )
                vertically_aligned = min_group_y <= ungrouped_box.center_y <= max_group_y
                if horizontally_aligned and vertically_aligned:
                    ungrouped_box.group_id = current_group_id
                    added_to_group = True
                    # At most one box is taken per member of the group per pass.
                    break

        # Nothing could be added: the current group is complete.
        if not added_to_group:
            current_group_id += 1

    # --- Phase 2: order the text inside each group ---
    paragraphs = []
    # sorted() makes the paragraph order explicit instead of relying on the
    # iteration order of a set.
    for group_id in sorted({box.group_id for box in text_boxes}):
        boxes_in_group = [box for box in text_boxes if box.group_id == group_id]
        average_height = np.mean([box.height for box in boxes_in_group])
        min_group_x = min(box.min_x for box in boxes_in_group)
        max_group_x = max(box.max_x for box in boxes_in_group)
        min_group_y = min(box.min_y for box in boxes_in_group)
        max_group_y = max(box.max_y for box in boxes_in_group)

        ordered_texts = []
        while boxes_in_group:
            # Boxes whose centre lies close to the top-most remaining centre
            # are treated as one text line.
            highest_y = min(box.center_y for box in boxes_in_group)
            line_limit = highest_y + 0.4 * average_height
            line_candidates = [box for box in boxes_in_group if box.center_y < line_limit]
            if not line_candidates:
                # With a mean height of zero the strict comparison matches
                # nothing; fall back to the top-most boxes so the loop advances.
                line_candidates = [box for box in boxes_in_group if box.center_y == highest_y]

            # Iterating in reverse makes ties resolve to the last candidate in
            # list order, as min()/max() return the first extreme they meet.
            if reading_mode == 'ltr':
                selected_box = min(reversed(line_candidates), key=lambda box: box.min_x)
            else:
                selected_box = max(reversed(line_candidates), key=lambda box: box.max_x)

            ordered_texts.append(selected_box.text)
            boxes_in_group.remove(selected_box)

        # Append the bounding box and text for the paragraph.
        bounding_box = [
            [min_group_x, min_group_y],
            [max_group_x, min_group_y],
            [max_group_x, max_group_y],
            [min_group_x, max_group_y],
        ]
        paragraphs.append([bounding_box, ' '.join(ordered_texts).strip()])
    return paragraphs
import tempfile

# Dispatch on the uploaded file: PDFs are processed page by page, anything
# else is treated as a single image.
if uploaded_file is not None:
    if uploaded_file.type == "application/pdf":
        placeholder = col2.empty()
        with placeholder, st.spinner('PDF өңделуде ...'):
            # A unique temporary file per upload keeps concurrent sessions from
            # overwriting each other's PDF. Its path is published under the
            # module-level name temp_pdf_file.
            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
                f.write(uploaded_file.read())
                temp_pdf_file = f.name
            try:
                process_pdf(uploaded_file)
            finally:
                # Best-effort cleanup: a file that cannot be removed (for
                # example because it is still open) must not fail the request.
                try:
                    os.remove(temp_pdf_file)
                except OSError:
                    pass
    else:
        placeholder = col2.empty()
        with placeholder, st.spinner('Сурет өңделуде ...'):
            image = Image.open(uploaded_file)
            col1.image(image)
            # OCR runs on a 3-channel RGB copy, the same pixel layout the PDF
            # path produces, so palette, CMYK or alpha uploads are normalised
            # first. The preview above still shows the image as uploaded.
            result, _ = recognize_page_image(image.convert("RGB"))
            result_text = line_separator.join([item[1] for item in result])
            button_group_html = generateButtonGroup(result)
            col2.write(button_group_html, unsafe_allow_html=True)
            col2.markdown(result_text)