Spaces:

Edgar404
/

Candy_Prototype

Sleeping

App Files Files Community

Candy_Prototype / app.py

Edgar404

Update app.py

f93c354 verified 5 months ago

raw

history blame

3.44 kB

	# -*- coding: utf-8 -*-
	"""Demo.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1Icb8zeoaudyTDOKM1QySNay1cXzltRAp
	"""

	import gradio as gr
	from PIL import Image
	import re

	import torch
	import torch.nn as nn
	from warnings import simplefilter

	# Suppress every Python warning for the lifetime of the process.
	simplefilter('ignore')

	# Run on the GPU when torch reports one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = 'cuda'
	else:
	    device = 'cpu'

	# Setting up the models
	from transformers import DonutProcessor, VisionEncoderDecoderModel

	# Hub ids of the two checkpoints served by the demo.
	_BASE_CHECKPOINT = 'Edgar404/donut-shivi-recognition'
	_OPTIMIZED_CHECKPOINT = 'Edgar404/donut-shivi-cheques_KD_320'

	print('Loading the base model ....')
	base_model = VisionEncoderDecoderModel.from_pretrained(_BASE_CHECKPOINT)
	base_processor = DonutProcessor.from_pretrained(_BASE_CHECKPOINT)
	print('Loading complete')

	print('Loading the optimized model ....')
	# The optimized checkpoint is loaded with bfloat16 weights.
	optimized_model = VisionEncoderDecoderModel.from_pretrained(
	    _OPTIMIZED_CHECKPOINT,
	    torch_dtype=torch.bfloat16,
	)
	optimized_processor = DonutProcessor.from_pretrained(_OPTIMIZED_CHECKPOINT)
	print('Loading complete')

	# setting


	def process_image(image, mode='optimized'):
	    """Run OCR on an image with a Donut model via the document parsing task.

	    Parameters
	    ----------
	    image : a machine readable image of class PIL or numpy
	    mode : 'optimized' selects the optimized model/processor pair; any
	        other value selects the base pair.

	    Returns
	    -------
	    The result of ``processor.token2json`` applied to the decoded sequence.
	    """
	    use_optimized = mode == 'optimized'
	    model = optimized_model if use_optimized else base_model
	    processor = optimized_processor if use_optimized else base_processor

	    model.to(device)
	    model.eval()

	    # Feed the encoder pixels in the dtype its weights are stored in.
	    # Deriving the dtype from mode and device can disagree with the loaded
	    # weights (e.g. bfloat16 weights on a CPU-only host), which makes the
	    # forward pass fail with a dtype mismatch.
	    pixel_dtype = model.dtype

	    # The task prompt is passed as the initial decoder input.
	    task_prompt = "<s_cord-v2>"
	    decoder_input_ids = processor.tokenizer(
	        task_prompt, add_special_tokens=False, return_tensors="pt"
	    ).input_ids

	    pixel_values = processor(image, return_tensors="pt").pixel_values

	    outputs = model.generate(
	        pixel_values.to(device, dtype=pixel_dtype),
	        decoder_input_ids=decoder_input_ids.to(device),
	        max_length=model.decoder.config.max_position_embeddings,
	        pad_token_id=processor.tokenizer.pad_token_id,
	        eos_token_id=processor.tokenizer.eos_token_id,
	        use_cache=True,
	        # Never let the decoder emit the unknown token.
	        bad_words_ids=[[processor.tokenizer.unk_token_id]],
	        return_dict_in_generate=True,
	    )

	    sequence = processor.batch_decode(outputs.sequences)[0]
	    # Strip end-of-sequence and padding markers from the decoded text.
	    sequence = sequence.replace(processor.tokenizer.eos_token, "")
	    sequence = sequence.replace(processor.tokenizer.pad_token, "")
	    # Remove only the first tag, i.e. the leading task prompt token.
	    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()

	    return processor.token2json(sequence)


	def image_classifier(image , mode):
	    """Forward the image and the selected mode to process_image and return its result."""
	    result = process_image(image=image, mode=mode)
	    return result



	# Ten sample images (test_0 .. test_9), each paired with the "base" mode.
	examples_list = [[f'./test_images/test_{index}.jpg', 'base'] for index in range(10)]

	# Radio selector that supplies the second argument of image_classifier.
	mode_selector = gr.Radio(["base" , "optimized"], label="mode")

	demo = gr.Interface(
	    fn=image_classifier,
	    inputs=["image", mode_selector],
	    outputs="text",
	    examples=examples_list,
	)

	demo.launch(share = True , debug = True)