File size: 4,137 Bytes
7b79b85
b42d6ec
 
 
e4387b3
 
 
 
b42d6ec
 
 
 
60d2516
c8e2f83
444b42f
4d57e5c
 
 
b42d6ec
4d57e5c
 
a05285c
4d57e5c
 
b42d6ec
4d57e5c
 
 
 
b42d6ec
eea98e7
b42d6ec
4d57e5c
 
eb822d4
4d57e5c
 
 
b42d6ec
4d57e5c
 
 
 
 
 
 
 
 
 
7b79b85
4d57e5c
37bfbd1
7b79b85
4d57e5c
7b79b85
c8197d8
4d57e5c
 
 
 
88a3c99
1346633
88a3c99
3b55d2c
1346633
 
3b55d2c
70266cf
4d57e5c
 
 
1346633
 
4d57e5c
 
 
 
 
 
 
 
 
1346633
 
2a9e87f
70266cf
88a3c99
70266cf
 
58e491e
db0a2cb
330c121
 
 
 
 
 
 
 
 
60d2516
330c121
7b79b85
4d57e5c
60d2516
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os

# The Document AI / Translate clients resolve service-account credentials via
# this environment variable; it only needs to be set before a client is
# constructed, but we set it first to be safe.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

# Standard library (duplicate "import os" removed — os is imported above).
import zipfile
import io
import tempfile

# Third-party
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
from google.cloud import translate_v2 as translate
import gradio as gr
import pandas as pd

# Global accumulator for per-image OCR + translation results; rebuilt from
# scratch by process_images on every upload.
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

# Google Cloud Document AI processor coordinates used by batch_process_documents.
project_id = "herbaria-ai"
location = "us"
processor_id = "de954414712822b3"

def translate_text(text, target_language="en"):
    """Translate *text* into *target_language* via the Cloud Translate v2 API.

    Returns the translated string from the API response.
    """
    client = translate.Client()
    translation = client.translate(text, target_language=target_language)
    return translation["translatedText"]

def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
    """OCR a single file with Document AI, then translate the extracted text.

    Args:
        file_path: Path of the file to send to the processor.
        file_mime_type: MIME type of that file (e.g. "image/jpeg").

    Returns:
        tuple: (extracted_text, translated_text).
    """
    # The Document AI endpoint is regional; it must match the processor's location.
    endpoint_opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    docai_client = documentai.DocumentProcessorServiceClient(client_options=endpoint_opts)

    with open(file_path, "rb") as fh:
        payload = fh.read()
    raw_document = RawDocument(content=payload, mime_type=file_mime_type)

    processor_name = docai_client.processor_path(project_id, location, processor_id)
    response = docai_client.process_document(
        request=documentai.ProcessRequest(name=processor_name, raw_document=raw_document)
    )

    extracted = response.document.text
    return extracted, translate_text(extracted)

def unzip_and_find_jpgs(file_path):
    """Extract a ZIP archive and return the paths of all JPEG images in it.

    The archive is unpacked into the local ``extracted_files`` directory
    (created if missing). Both ``.jpg`` and ``.jpeg`` extensions are matched
    case-insensitively; macOS metadata folders (``__MACOSX``) are skipped.

    Args:
        file_path: Path to the ZIP file to extract.

    Returns:
        list[str]: Full paths of the JPEG files found under the extraction
        directory.
    """
    extract_path = "extracted_files"
    os.makedirs(extract_path, exist_ok=True)
    # NOTE(review): extraction accumulates into a shared directory across
    # uploads; files from earlier archives may still be present.
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    jpg_files = []
    for root, _dirs, files in os.walk(extract_path):
        # Skip resource-fork directories that macOS embeds in zip archives.
        if '__MACOSX' in root:
            continue
        for name in files:
            # Bug fix: the UI advertises JPEG/JPG support, but previously
            # only ".jpg" was matched, silently dropping ".jpeg" files.
            if name.lower().endswith(('.jpg', '.jpeg')):
                jpg_files.append(os.path.join(root, name))
    return jpg_files

def process_images(uploaded_file):
    """Gradio callback: OCR + translate every JPEG in an uploaded ZIP.

    Args:
        uploaded_file: Gradio file object (exposes the path via ``.name``),
            or ``None`` when the file input is cleared.

    Returns:
        tuple[str, str]: (HTML table of results, path to a CSV of the same
        data). On error or empty input the first element is a message (or
        empty string) and the second is "".
    """
    global results_df
    # Start from a clean slate on every invocation so rows from a previous
    # upload never leak into the new report.
    results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
    if uploaded_file is None:
        return "", ""  # Input cleared: blank both outputs.

    try:
        image_files = unzip_and_find_jpgs(uploaded_file.name)
        if not image_files:
            return "No JPG files found in the zip.", ""

        # Accumulate rows in a list and build the DataFrame once at the end:
        # pd.concat inside the loop is quadratic in the number of images.
        rows = []
        for image_path in image_files:
            extracted_text, translated_text = batch_process_documents(image_path, "image/jpeg")
            rows.append({
                "Filename": os.path.basename(image_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })
        results_df = pd.DataFrame(rows, columns=["Filename", "Extracted Text", "Translated Text"])
    except Exception as e:
        # Surface the failure in the UI instead of crashing the callback.
        return f"An error occurred: {str(e)}", ""

    html_output = results_df.to_html()
    # delete=False: Gradio serves this file after the callback returns, so it
    # must outlive the handler. NOTE(review): these CSVs are never cleaned
    # up — consider periodic cleanup for a long-running deployment.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    results_df.to_csv(temp_file.name, index=False)
    temp_file.close()
    return html_output, temp_file.name

# --- Gradio UI wiring (executes at import time) ---
with gr.Blocks() as interface:
    with gr.Row():
        gr.Markdown("# Document AI Translation")
        gr.Markdown("Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.")
    with gr.Row():
        file_input = gr.File(label="Upload ZIP File")
    with gr.Row():
        # Rendered results table (HTML) produced by process_images.
        html_output = gr.HTML()
    with gr.Row():
        # Download widget for the CSV file written by process_images.
        file_output = gr.File()

    # Re-run the pipeline whenever the upload changes; clearing the input
    # fires this too, passing None to process_images (which blanks outputs).
    file_input.change(process_images, inputs=file_input, outputs=[html_output, file_output])

if __name__ == "__main__":
    interface.launch(debug=True)