import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
from google.cloud import translate_v2 as translate
import zipfile
import os
import io
import gradio as gr
import json
from google.oauth2 import service_account
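
# Runtime dependencies (pip): pandas, gradio, google-cloud-documentai,
# google-cloud-translate, google-auth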

# Credentials are read from the "google_authentication" environment variable
# (a service account JSON key string) rather than from a key file on disk, e.g.
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

credentials_raw = os.environ.get("google_authentication")

# Parse the JSON credentials
credentials_json = json.loads(credentials_raw)

# Build service account credentials from the parsed JSON
credentials = service_account.Credentials.from_service_account_info(credentials_json)

# Global DataFrame declaration
results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

# Set your Google Cloud Document AI processor details here
project_id = "herbaria-ai"
location = "us"
processor_id = "4307b078717a399a"

def translate_text(text, target_language="en"):
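    """Translate text into the target language via the Cloud Translation API (v2)."""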
    translate_client = translate.Client(credentials=credentials)
    result = translate_client.translate(text, target_language=target_language)
    return result["translatedText"]

def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
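    """OCR a single file with Document AI and translate the extracted text.

    Despite the name, this processes one document per call and returns an
    (extracted_text, translated_text) tuple.
    """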
    # Initialize the Document AI client with the regional endpoint and the
    # service account credentials loaded above.
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(credentials=credentials, client_options=opts)

    with open(file_path, "rb") as file_stream:
        raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)

    name = client.processor_path(project_id, location, processor_id)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    result = client.process_document(request=request)

    extracted_text = result.document.text
    translated_text = translate_text(extracted_text)
    return extracted_text, translated_text

def unzip_and_find_jpgs(file_path):
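    """Extract a ZIP archive and return the paths of all .jpg files inside,
    skipping macOS metadata folders (__MACOSX).
    """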
    extract_path = "extracted_files"
    os.makedirs(extract_path, exist_ok=True)
    jpg_files = []
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
        for root, dirs, files in os.walk(extract_path):
            if '__MACOSX' in root:
                continue
            for file in files:
                if file.lower().endswith('.jpg'):
                    full_path = os.path.join(root, file)
                    jpg_files.append(full_path)
    return jpg_files

def process_images(uploaded_file):
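    """Gradio handler: OCR and translate every JPG in the uploaded ZIP and
    return the accumulated results as an HTML table.
    """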
    global results_df
    results_df = results_df.iloc[0:0]  # Clear any results from a previous run
    
    file_path = uploaded_file.name  # Gradio provides the file path through the .name attribute

    try:
        image_files = unzip_and_find_jpgs(file_path)
        
        if not image_files:
            return "No JPG files found in the zip."

        for file_path in image_files:
            extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
            new_row = pd.DataFrame([{
                "Filename": os.path.basename(file_path),
                "Extracted Text": extracted_text,
                "Translated Text": translated_text
            }])
            results_df = pd.concat([results_df, new_row], ignore_index=True)
    except Exception as e:
        return f"An error occurred: {str(e)}"

    return results_df.to_html()

interface = gr.Interface(
    fn=process_images,
    inputs="file",
    outputs="html",
    title="Document AI Translation",
    description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image."
)

if __name__ == "__main__":
    interface.launch(debug=True)
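
# Minimal command-line smoke test (a sketch, not wired into the app): assumes
# valid credentials and an illustrative local image named "sample_label.jpg".
#
# text, translation = batch_process_documents("sample_label.jpg", "image/jpeg")
# print(text)
# print(translation)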



