Spaces:

BMukhtar
/

BookRecognitionKz

Sleeping

App Files Files Community

BookRecognitionKz / app.py

BMukhtar

enable GPU

aebb77d 4 months ago

raw

history blame contribute delete

11.5 kB

	import streamlit as st
	from PIL import Image
	import os
	import easyocr
	import numpy as np
	import fitz # PyMuPDF
	import io
	from pdf2image import convert_from_bytes
	#from st_btn_group import st_btn_group
	#from streamlit_option_menu import option_menu
	import docx
	from io import BytesIO
	#import streamlit.components.v1 as components
	import base64

	# Separator inserted between recognised paragraphs when they are joined
	# into a single string for display and for the multi-page exports.
	line_separator = "\n\n"

	#def downloadTxt():
	def generateTxtLink(result):
	    """Build an HTML download link holding the recognised text as a .txt file.

	    Args:
	        result: iterable of paragraph entries; the text of each entry is
	            read from index 1 (index 0 is ignored here).

	    Returns:
	        An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI with one
	        paragraph per line, downloadable as ``document.txt``.
	    """
	    # One paragraph per line, each terminated by a newline.
	    result_txt = "".join(para[1] + "\n" for para in result)
	    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
	    return (
	        "<a class='button' href='data:text/plain;base64,"
	        + result_b64
	        + "' download='document.txt'>TXT</a>"
	    )

	def generateMultiPageTxtLink(result):
	    """Build an HTML download link holding multi-page text as a .txt file.

	    Args:
	        result: iterable of strings, one per page.

	    Returns:
	        An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI containing
	        every page followed by a newline, downloadable as ``document.txt``.
	    """
	    # Each page's text is terminated by a newline.
	    result_txt = "".join(page + "\n" for page in result)
	    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
	    return (
	        "<a class='button' href='data:text/plain;base64,"
	        + result_b64
	        + "' download='document.txt'>TXT</a>"
	    )

	def generateDocLink(result):
	    """Build an HTML download link holding the recognised text as a .docx file.

	    Args:
	        result: iterable of paragraph entries; the text of each entry is
	            read from index 1 and becomes one Word paragraph.

	    Returns:
	        An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI of the
	        generated document, downloadable as ``document.docx``.
	    """
	    doc = docx.Document()
	    for para in result:
	        doc.add_paragraph(para[1])

	    # Serialise the document in memory so it can be embedded in the link.
	    target_stream = BytesIO()
	    doc.save(target_stream)
	    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')

	    # The payload is a Word document, so advertise the DOCX media type
	    # (the link previously declared it as a PDF).
	    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	    return (
	        "<a class='button' href='data:" + docx_mime + ";base64,"
	        + base64_doc
	        + "' download='document.docx'>DOCX</a>"
	    )

	def generateMultiPageDocLink(pages_result):
	    """Build an HTML download link holding multi-page text as a .docx file.

	    Args:
	        pages_result: iterable of strings, one per page. Each page is split
	            on newlines and every resulting line becomes a Word paragraph.

	    Returns:
	        An ``<a>`` tag whose ``href`` is a base64 ``data:`` URI of the
	        generated document, downloadable as ``document.docx``.
	    """
	    doc = docx.Document()
	    for index, page in enumerate(pages_result):
	        # Break only between pages; a break after the last page would
	        # leave an empty trailing page in the document.
	        if index:
	            doc.add_page_break()
	        for para in page.split("\n"):
	            doc.add_paragraph(para)

	    # Serialise the document in memory so it can be embedded in the link.
	    target_stream = BytesIO()
	    doc.save(target_stream)
	    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')

	    # The payload is a Word document, so advertise the DOCX media type
	    # (the link previously declared it as a PDF).
	    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	    return (
	        "<a class='button' href='data:" + docx_mime + ";base64,"
	        + base64_doc
	        + "' download='document.docx'>DOCX</a>"
	    )

	def generateButtonGroup(result):
	    """Return the TXT and DOCX download links for a single-image result.

	    The two anchors are produced by generateTxtLink and generateDocLink
	    (in that order) and separated by a newline.
	    """
	    links = (generateTxtLink(result), generateDocLink(result))
	    return "\n".join(links)

	def generateButtonGroupForPDF(pages_result):
	    """Return the TXT and DOCX download links for a multi-page result.

	    The two anchors are produced by generateMultiPageTxtLink and
	    generateMultiPageDocLink (in that order) and separated by a newline.
	    """
	    links = (
	        generateMultiPageTxtLink(pages_result),
	        generateMultiPageDocLink(pages_result),
	    )
	    return "\n".join(links)

	def local_css(file_name):
	    """Inject the contents of a CSS file into the page.

	    The file is read in full and emitted through st.markdown wrapped in a
	    <style> element, with raw HTML rendering enabled.
	    """
	    with open(file_name) as css_file:
	        stylesheet = css_file.read()
	    st.markdown(f'<style>{stylesheet}</style>', unsafe_allow_html=True)


	# Working directories: OCR model files and generated output.
	models_dir = "./models"
	output_dir = "./output"
	dirs = [models_dir, output_dir]
	# Create every directory in the list. The loop previously tested and
	# created output_dir on each iteration, so models_dir was never created.
	for d in dirs:
	    os.makedirs(d, exist_ok=True)

	font_path = models_dir + "/Ubuntu-Regular.ttf"


	@st.cache_resource(show_spinner=False)
	def _load_reader():
	    """Create the EasyOCR reader, cached across Streamlit script reruns.

	    Streamlit re-executes this script on every interaction, so building
	    the reader at module level would reload the model each time. Caching
	    it as a resource makes the load happen only once per server process.
	    show_spinner is disabled so that this call does not render any UI
	    element before st.set_page_config runs further down the script.
	    """
	    return easyocr.Reader(
	        ['en'],
	        gpu=True,
	        recog_network='best_norm_ED',
	        detect_network="craft",
	        user_network_directory=models_dir,
	        model_storage_directory=models_dir,
	    )


	# Shared reader instance used by the OCR calls below.
	reader = _load_reader()




	# Page setup: wide layout and browser-tab title, then the custom stylesheet
	# (local_css wraps the contents of app.css in a <style> element).
	st.set_page_config(layout="wide",page_title="Қазақша OCR, суреттегі текстті тану")
	local_css("app.css")
	#st.markdown("<a class='button' href='lenta.ru'>DOCX жүктеп ал</a>",unsafe_allow_html=True)
	# Main heading shown at the top of the page.
	st.title("Сурет немесе пдф файлдан текст алу")
	# subtitle (currently disabled)
	#st.markdown("## Qazaq OCR")

	# Upload widget restricted to PNG/JPEG images and PDF files.
	# NOTE(review): help="aaa" looks like placeholder tooltip text - confirm
	# the intended wording.
	uploaded_file = st.file_uploader("Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",help="aaa", type=['png', 'jpeg', 'jpg', 'pdf'])

	# Two-column layout: the code below puts the uploaded image / page previews
	# in col1 and the progress bar, download links and recognised text in col2.
	col1, col2 = st.columns(2)


	import time

	# Upper bound on how many pages of an uploaded PDF process_pdf renders and
	# recognises; pages beyond this count are not processed.
	max_page = 5
	def recognize_page_image(image):
	    """Run OCR on one image and group the detections into paragraphs.

	    Args:
	        image: image object accepted by ``np.array`` (the callers pass a
	            PIL image).

	    Returns:
	        A ``(paragraphs, seconds)`` tuple: the list produced by
	        get_paragraph and the wall-clock time spent on recognition plus
	        grouping.
	    """
	    # perf_counter is monotonic, so the measured interval cannot be skewed
	    # by system clock adjustments.
	    start = time.perf_counter()
	    detections = reader.readtext(
	        np.array(image),
	        batch_size=64,
	        paragraph=False,  # raw boxes; paragraphs are built by get_paragraph
	        y_ths=0,
	        width_ths=0,
	    )
	    paragraphs = get_paragraph(detections)
	    elapsed = time.perf_counter() - start
	    return paragraphs, elapsed


	def process_pdf(uploaded_file):
	    """Render, recognise and display the first pages of an uploaded PDF.

	    At most ``max_page`` pages are processed. For each page a reduced
	    preview is shown in its own tab in ``col1``; the recognised text is
	    shown in an expander in ``col2`` together with the recognition time.
	    When all pages are done, TXT/DOCX download links for the combined text
	    are written to ``col2``.

	    Args:
	        uploaded_file: the uploaded PDF; its bytes are obtained with
	            ``getvalue()`` (the Streamlit uploader returns a BytesIO-like
	            object).

	    Uses the module-level ``col1``, ``col2``, ``max_page`` and
	    ``line_separator``.
	    """
	    # Open the document from the uploaded bytes instead of relying on a
	    # module-level temp-file path, and close it when processing is over.
	    pdf_bytes = uploaded_file.getvalue()
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
	        total_pages = len(pdf_document)
	        page_count = min(total_pages, max_page)

	        progress_bar = col2.progress(0, text="Жүктеліп жатыр")
	        button_group = col2.container()
	        # NOTE(review): the original comment said this clears the container;
	        # on a fresh container it appears to only add an empty placeholder.
	        # Kept so the element layout is unchanged - confirm intent.
	        button_group.empty()

	        if page_count == 0:
	            # Nothing to render: avoid creating an empty tab set and
	            # dividing by zero in the progress computation.
	            progress_bar.progress(0.99, text=f'{page_count} бет жүктелді')
	            return

	        tabs = col1.tabs([f"Бет {page+1}" for page in range(page_count)])
	        pages_result = []
	        for page_num in range(page_count):
	            page = pdf_document.load_page(page_num)
	            # Render at 300 dpi; an explicit identity matrix is unnecessary
	            # because the dpi argument determines the scaling.
	            pixmap = page.get_pixmap(dpi=300)
	            image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)

	            # Tenth-size preview for the tab; never smaller than 1x1.
	            preview_size = (max(1, pixmap.width // 10), max(1, pixmap.height // 10))
	            tabs[page_num].image(image.resize(preview_size))

	            result, time_elapsed = recognize_page_image(image)
	            result_text = line_separator.join([item[1] for item in result])
	            pages_result.append(result_text)

	            # A page without any recognised text yields an empty result;
	            # fall back to an empty preview instead of indexing into it.
	            preview = result[0][1][:100] if result else ""
	            expander = col2.expander(f'{preview} ... :orange[{time_elapsed:.3f} секундта таңылды]')
	            # Show the whole page text, as the single-image branch does,
	            # rather than only the first paragraph.
	            expander.write(result_text)

	            progress_bar.progress(
	                (page_num + 1) / page_count,
	                text=f'Жүктеліп жатыр {page_num+1}/{page_count}',
	            )

	        button_group_html = generateButtonGroupForPDF(pages_result)
	        button_group.write(button_group_html, unsafe_allow_html=True)
	        progress_bar.progress(0.99, text=f'{page_count} бет жүктелді')

	class TextBox:
	    """One OCR text fragment with its axis-aligned extent and a group label."""

	    def __init__(self, text, coordinates):
	        # Only the extremes of the corner points are kept, so the order in
	        # which the corners are listed does not matter. Coordinates are
	        # truncated to integers.
	        xs = [int(point[0]) for point in coordinates]
	        ys = [int(point[1]) for point in coordinates]
	        left, right = min(xs), max(xs)
	        top, bottom = min(ys), max(ys)

	        self.text = text
	        self.min_x = left
	        self.max_x = right
	        self.min_y = top
	        self.max_y = bottom
	        self.height = bottom - top
	        self.center_y = 0.5 * (top + bottom)
	        # 0 marks a box that has not been assigned to any group yet.
	        self.group_id = 0

	    def __repr__(self):
	        return "TextBox(text={}, group_id={})".format(self.text, self.group_id)


	def get_paragraph(ocr_results, horizontal_threshold=1, vertical_threshold=0.0, reading_mode='ltr'):
	    """Group OCR boxes into paragraphs and order the text inside each one.

	    Args:
	        ocr_results: iterable of detections; item[0] is the list of corner
	            points and item[1] the recognised text.
	        horizontal_threshold: how far outside a group member's horizontal
	            extent a box edge may lie and still join the group, in
	            multiples of the group's mean box height.
	        vertical_threshold: how far outside a group member's vertical extent
	            a box's vertical centre may lie and still join the group, in
	            multiples of the group's mean box height.
	        reading_mode: 'ltr' to read each line left to right, 'rtl' for right
	            to left.

	    Returns:
	        A list of ``[bounding_box, text]`` entries in group-creation order.
	        ``bounding_box`` holds the four corners of the group's extent as
	        ``[x, y]`` pairs: (min_x, min_y), (max_x, min_y), (max_x, max_y),
	        (min_x, max_y).

	    Raises:
	        ValueError: if ``reading_mode`` is neither 'ltr' nor 'rtl'.
	    """
	    # Reject unknown modes up front; otherwise no box would ever be selected
	    # in the ordering loop below.
	    if reading_mode not in ('ltr', 'rtl'):
	        raise ValueError(f"reading_mode must be 'ltr' or 'rtl', got {reading_mode!r}")

	    # Convert raw OCR results into TextBox objects
	    text_boxes = [TextBox(box[1], box[0]) for box in ocr_results]

	    # Group the boxes into paragraphs. Each pass either seeds a new group
	    # with the first ungrouped box or tries to grow the current group.
	    current_group_id = 1
	    while any(box.group_id == 0 for box in text_boxes):
	        ungrouped_boxes = [box for box in text_boxes if box.group_id == 0]
	        current_group_boxes = [box for box in text_boxes if box.group_id == current_group_id]

	        if not current_group_boxes:
	            # No box carries the current id yet: start the group.
	            ungrouped_boxes[0].group_id = current_group_id
	            continue

	        average_height = np.mean([box.height for box in current_group_boxes])
	        x_margin = horizontal_threshold * average_height
	        y_margin = vertical_threshold * average_height
	        added_to_group = False

	        for group_box in current_group_boxes:
	            min_group_x = group_box.min_x - x_margin
	            max_group_x = group_box.max_x + x_margin
	            min_group_y = group_box.min_y - y_margin
	            max_group_y = group_box.max_y + y_margin

	            # At most one box is attached per group member and pass; the
	            # snapshot list is deliberately not refreshed inside the pass.
	            for ungrouped_box in ungrouped_boxes:
	                horizontally_aligned = (
	                    min_group_x <= ungrouped_box.min_x <= max_group_x
	                    or min_group_x <= ungrouped_box.max_x <= max_group_x
	                )
	                vertically_aligned = min_group_y <= ungrouped_box.center_y <= max_group_y

	                if horizontally_aligned and vertically_aligned:
	                    ungrouped_box.group_id = current_group_id
	                    added_to_group = True
	                    break

	        # If no box was added to the current group, move to the next group
	        if not added_to_group:
	            current_group_id += 1

	    # Bucket the boxes by group once, keeping their original relative order.
	    groups = {}
	    for box in text_boxes:
	        groups.setdefault(box.group_id, []).append(box)

	    # Arrange the text order within each group to form paragraphs. Group ids
	    # are sorted so the output order does not depend on set/dict ordering.
	    paragraphs = []
	    for group_id in sorted(groups):
	        boxes_in_group = groups[group_id]
	        average_height = np.mean([box.height for box in boxes_in_group])
	        min_group_x = min(box.min_x for box in boxes_in_group)
	        max_group_x = max(box.max_x for box in boxes_in_group)
	        min_group_y = min(box.min_y for box in boxes_in_group)
	        max_group_y = max(box.max_y for box in boxes_in_group)

	        words = []
	        while boxes_in_group:
	            # Boxes whose centre is within 0.4 mean heights of the topmost
	            # remaining centre are treated as lying on the same line.
	            highest_y = min(box.center_y for box in boxes_in_group)
	            line_limit = highest_y + 0.4 * average_height
	            line_candidates = [box for box in boxes_in_group if box.center_y < line_limit]
	            if not line_candidates:
	                # Zero mean height makes the strict window empty; fall back
	                # to the topmost boxes so the loop always makes progress.
	                line_candidates = [box for box in boxes_in_group if box.center_y == highest_y]

	            # Pick the left-most or right-most box based on reading mode.
	            # On ties the last candidate wins, as in the original scan.
	            if reading_mode == 'ltr':
	                left_most_x = min(box.min_x for box in line_candidates)
	                selected_box = [box for box in line_candidates if box.min_x == left_most_x][-1]
	            else:
	                right_most_x = max(box.max_x for box in line_candidates)
	                selected_box = [box for box in line_candidates if box.max_x == right_most_x][-1]

	            words.append(selected_box.text)
	            boxes_in_group.remove(selected_box)

	        paragraph_text = ' '.join(words).strip()

	        # Append the bounding box and text for the paragraph
	        paragraphs.append([
	            [
	                [min_group_x, min_group_y],
	                [max_group_x, min_group_y],
	                [max_group_x, max_group_y],
	                [min_group_x, max_group_y],
	            ],
	            paragraph_text,
	        ])

	    return paragraphs


	# Main flow: runs once a file has been uploaded.
	import tempfile

	if uploaded_file is not None:
	    # Placeholder in the right column that hosts the spinner while working.
	    placeholder = col2.empty()
	    if uploaded_file.type == "application/pdf":
	        with placeholder, st.spinner('PDF өңделуде ...'):
	            # Write the upload to a uniquely named temp file so concurrent
	            # sessions cannot overwrite each other's PDF. getvalue() returns
	            # the whole buffer regardless of the current stream position.
	            # temp_pdf_file stays a module-level name in case process_pdf
	            # opens the document from that path.
	            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
	                f.write(uploaded_file.getvalue())
	            temp_pdf_file = f.name
	            try:
	                process_pdf(uploaded_file)
	            finally:
	                # Best-effort cleanup; a failure to delete the temp file
	                # must not mask the result or an earlier error.
	                try:
	                    os.remove(temp_pdf_file)
	                except OSError:
	                    pass
	    else:
	        with placeholder, st.spinner('Сурет өңделуде ...'):
	            image = Image.open(uploaded_file)
	            col1.image(image)
	            # Recognise an RGB copy: palette, bilevel and similar modes
	            # would otherwise reach the OCR as raw index/boolean arrays.
	            result, _ = recognize_page_image(image.convert("RGB"))
	            result_text = line_separator.join([item[1] for item in result])
	            button_group_html = generateButtonGroup(result)
	            col2.write(button_group_html, unsafe_allow_html=True)
	            col2.markdown(result_text)