File size: 4,522 Bytes
7b79b85 b42d6ec 9ac2440 e4387b3 b42d6ec 444b42f b42d6ec 4d57e5c 4bf21c7 4d57e5c b42d6ec 4d57e5c b42d6ec eea98e7 b42d6ec 4d57e5c eb822d4 4d57e5c b42d6ec 4d57e5c 07e2341 4d57e5c 7b79b85 4d57e5c 37bfbd1 7b79b85 4d57e5c 7b79b85 c8197d8 4d57e5c 4bf21c7 9ac2440 4d57e5c 9ac2440 4d57e5c 9ac2440 1346633 4d57e5c 9ac2440 bc9779f 7b79b85 4d57e5c 9ac2440 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import io
import os
import shutil
import tempfile
import zipfile

import gradio as gr
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud import translate_v2 as translate
from google.cloud.documentai_v1.types import RawDocument

# Point the Google client libraries at the service-account key file.
# NOTE(review): hard-coded credential filename — consider ambient
# Application Default Credentials or configuration instead.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "de954414712822b3"
def translate_text(text, target_language="en"):
    """Translate *text* into *target_language* using Cloud Translation.

    Returns the translated string from the API response.
    """
    client = translate.Client()
    response = client.translate(text, target_language=target_language)
    return response["translatedText"]
def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
    """Run one document through the configured Document AI processor.

    Reads the file at *file_path*, submits it for OCR, and returns
    ``(extracted_text, translated_text)``.
    """
    endpoint = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    docai_client = documentai.DocumentProcessorServiceClient(client_options=endpoint)

    with open(file_path, "rb") as fh:
        raw_doc = RawDocument(content=fh.read(), mime_type=file_mime_type)

    processor_name = docai_client.processor_path(project_id, location, processor_id)
    response = docai_client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=raw_doc)
    )

    extracted = response.document.text
    return extracted, translate_text(extracted)
def unzip_and_find_jpgs(file_path):
    """Extract the ZIP archive at *file_path* and return JPEG image paths.

    The archive is unpacked into a fresh ``extracted_files`` directory
    (any previous contents are removed first). Both ``.jpg`` and ``.jpeg``
    extensions are matched case-insensitively — the UI advertises
    "JPEG/JPG images". macOS metadata under ``__MACOSX`` is skipped.

    Args:
        file_path: path to a ZIP archive on disk.

    Returns:
        list[str]: full paths of the JPEG files found.
    """
    extract_path = "extracted_files"
    # Start from a clean slate so stale files from a previous upload
    # cannot leak into this run's results.
    shutil.rmtree(extract_path, ignore_errors=True)
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)

    jpg_files = []
    for root, _dirs, files in os.walk(extract_path):
        if "__MACOSX" in root:  # macOS resource-fork metadata, not real images
            continue
        for name in files:
            if name.lower().endswith((".jpg", ".jpeg")):
                jpg_files.append(os.path.join(root, name))
    return jpg_files
def process_images(uploaded_file):
    """Gradio handler: OCR + translate every JPEG inside an uploaded ZIP.

    Args:
        uploaded_file: Gradio file object; its ``.name`` attribute is the
            path of the uploaded file on disk.

    Returns:
        tuple: ``(html, csv_path)`` — an HTML results table and the path to
        a downloadable CSV on success; ``(message, None)`` on failure. Both
        slots are always filled because the Gradio wiring binds this
        handler to two output components.
    """
    file_path = uploaded_file.name  # Gradio exposes the temp path via .name
    rows = []
    try:
        image_files = unzip_and_find_jpgs(file_path)
        if not image_files:
            return "No JPG files found in the zip.", None
        for image_path in image_files:
            extracted_text, translated_text = batch_process_documents(image_path, "image/jpeg")
            rows.append({
                "Filename": os.path.basename(image_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })
    except Exception as e:
        # Surface the failure in the HTML slot; the file slot stays empty.
        return f"An error occurred: {str(e)}", None

    # Build the DataFrame once rather than concatenating row by row.
    results_df = pd.DataFrame(rows, columns=["Filename", "Extracted Text", "Translated Text"])

    # Persist the results to a temp CSV so the user can download them.
    # delete=False keeps the file alive after close for Gradio to serve.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    results_df.to_csv(temp_file.name, index=False)
    temp_file.close()

    return results_df.to_html(), temp_file.name
with gr.Blocks() as interface:
with gr.Row():
gr.Markdown("# Document AI Translation")
gr.Markdown("Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.")
with gr.Row():
file_input = gr.File(label="Upload ZIP File")
with gr.Row():
html_output = gr.HTML()
with gr.Row():
file_output = gr.File()
file_input.change(process_images, inputs=file_input, outputs=[html_output, file_output])
if __name__ == "__main__":
interface.launch(debug=True) |