# app.py — Herbaria AI: Document AI OCR + translation Gradio Space.
# (Hugging Face file-viewer page text removed; it was not part of the source.)
import os

# Upload credential json file from default compute service account.
# Must be set before any Google Cloud client is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

# Standard library
import io
import shutil
import tempfile
import zipfile

# Third-party
import gradio as gr
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import translate_v2 as translate
from google.cloud.documentai_v1.types import RawDocument
# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "de954414712822b3"
def translate_text(text, target_language="en"):
translate_client = translate.Client()
result = translate_client.translate(text, target_language=target_language)
return result["translatedText"]
def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)
with open(file_path, "rb") as file_stream:
raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
name = client.processor_path(project_id, location, processor_id)
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
result = client.process_document(request=request)
extracted_text = result.document.text
translated_text = translate_text(extracted_text)
return extracted_text, translated_text
def unzip_and_find_jpgs(file_path):
extract_path = "extracted_files"
if os.path.exists(extract_path):
# Remove the directory and its contents to start fresh
for root, dirs, files in os.walk(extract_path, topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
os.rmdir(extract_path)
os.makedirs(extract_path, exist_ok=True)
jpg_files = []
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(extract_path)
for root, dirs, files in os.walk(extract_path):
if '__MACOSX' in root:
continue
for file in files:
if file.lower().endswith('.jpg'):
full_path = os.path.join(root, file)
jpg_files.append(full_path)
return jpg_files
def process_images(uploaded_file):
# Reinitialize the DataFrame each time this function is called
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
file_path = uploaded_file.name # Gradio provides the file path through the .name attribute
try:
image_files = unzip_and_find_jpgs(file_path)
if not image_files:
return "No JPG files found in the zip."
for file_path in image_files:
extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
new_row = pd.DataFrame([{
"Filename": os.path.basename(file_path),
"Extracted Text": extracted_text,
"Translated Text": translated_text
}])
results_df = pd.concat([results_df, new_row], ignore_index=True)
except Exception as e:
return f"An error occurred: {str(e)}"
html_output = results_df.to_html()
# Save DataFrame to a temporary CSV file for download
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv") # Create a temp file
results_df.to_csv(temp_file.name, index=False) # Save DataFrame to CSV
# Read the contents back to verify
temp_file.seek(0) # Move file pointer to the beginning of the file
print(pd.read_csv(temp_file.name)) # Read and print the CSV file to verify its contents
temp_file.close() # Close the file
# Return HTML and the path to the CSV file
return html_output, temp_file.name
with gr.Blocks() as interface:
with gr.Row():
gr.Markdown("# Document AI Translation")
gr.Markdown("Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.")
with gr.Row():
file_input = gr.File(label="Upload ZIP File")
with gr.Row():
html_output = gr.HTML()
with gr.Row():
file_output = gr.File()
file_input.change(process_images, inputs=file_input, outputs=[html_output, file_output])
if __name__ == "__main__":
interface.launch(debug=True)