import os
import shutil
import tempfile
import zipfile

import gradio as gr
import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
from google.cloud import translate_v2 as translate

# Point the Google client libraries at the service-account credential JSON file.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "de954414712822b3"
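# (These IDs are specific to this deployment; substitute your own Google Cloud
# project, processor location, and processor ID when reusing this script.)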

def translate_text(text, target_language="en"):
    translate_client = translate.Client()
    result = translate_client.translate(text, target_language=target_language)
    return result["translatedText"]
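# Minimal usage sketch (the label text is hypothetical; requires valid
# Cloud Translation credentials):
#   english = translate_text("Gesammelt am Fusse des Berges")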

def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
    # Despite the name, this sends one document per call through the
    # synchronous process_document endpoint.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Read the file into a RawDocument for inline (non-GCS) processing.
    with open(file_path, "rb") as file_stream:
        raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)

    name = client.processor_path(project_id, location, processor_id)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    result = client.process_document(request=request)

    extracted_text = result.document.text
    translated_text = translate_text(extracted_text)
    return extracted_text, translated_text
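# Usage sketch ("sample.jpg" is a hypothetical file; assumes the processor
# configured above and valid credentials):
#   raw_text, english_text = batch_process_documents("sample.jpg", "image/jpeg")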

def unzip_and_find_jpgs(file_path):
    extract_path = "extracted_files"
    # Remove leftovers from a previous run so results do not accumulate.
    if os.path.exists(extract_path):
        shutil.rmtree(extract_path)
    os.makedirs(extract_path, exist_ok=True)

    jpg_files = []
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    for root, dirs, files in os.walk(extract_path):
        # Skip macOS resource-fork directories bundled into many zips.
        if '__MACOSX' in root:
            continue
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg')):
                jpg_files.append(os.path.join(root, file))
    return jpg_files
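# Usage sketch ("specimens.zip" is a hypothetical archive):
#   jpgs = unzip_and_find_jpgs("specimens.zip")
#   # e.g. ['extracted_files/IMG_0001.jpg', ...]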

def process_images(uploaded_file):
    # Collect one row per image and build the DataFrame once at the end,
    # which avoids repeated pd.concat calls inside the loop.
    rows = []
    file_path = uploaded_file.name  # Gradio exposes the uploaded file's path via .name

    try:
        image_files = unzip_and_find_jpgs(file_path)

        if not image_files:
            # Both Gradio outputs must be filled, so pair the message with an empty file slot.
            return "No JPG files found in the zip.", None

        for image_path in image_files:
            extracted_text, translated_text = batch_process_documents(image_path, "image/jpeg")
            rows.append({
                "Filename": os.path.basename(image_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text,
            })
    except Exception as e:
        return f"An error occurred: {e}", None

    results_df = pd.DataFrame(rows, columns=["Filename", "Extracted Text", "Translated Text"])
    html_output = results_df.to_html()

    # Save the results to a temporary CSV file so the interface can offer a download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
        csv_path = temp_file.name
    results_df.to_csv(csv_path, index=False)

    # Return the HTML table and the path to the downloadable CSV.
    return html_output, csv_path

with gr.Blocks() as interface:
    with gr.Row():
        gr.Markdown("# Document AI Translation")
        gr.Markdown("Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.")
    with gr.Row():
        file_input = gr.File(label="Upload ZIP File")
    with gr.Row():
        html_output = gr.HTML()
    with gr.Row():
        file_output = gr.File(label="Download Results (CSV)")

    file_input.change(process_images, inputs=file_input, outputs=[html_output, file_output])

if __name__ == "__main__":
    interface.launch(debug=True)