mkaramb committed on
Commit
b42d6ec
·
verified ·
1 Parent(s): 32aaeaa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -44
app.py CHANGED
@@ -1,60 +1,46 @@
1
- import pandas as pd
2
  import os
3
- import json
4
- import zipfile
5
- import io
6
- import gradio as gr
7
- import logging
8
- from google.oauth2 import service_account
9
  from google.api_core.client_options import ClientOptions
10
  from google.cloud import documentai_v1 as documentai
11
  from google.cloud.documentai_v1.types import RawDocument
12
  from google.cloud import translate_v2 as translate
13
- from gradio import Interface
14
-
15
- # Setup logging
16
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
-
18
- # Load credentials from environment variable
19
- credentials_raw = os.environ.get("google_authentication")
20
- if not credentials_raw:
21
- raise EnvironmentError("Google Cloud credentials not found in environment.")
22
- credentials_json = json.loads(credentials_raw)
23
- credentials = service_account.Credentials.from_service_account_info(credentials_json)
24
- logging.info("Loaded Google Cloud credentials successfully.")
25
 
26
  # Global DataFrame declaration
27
  results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
28
 
29
- # Google Cloud Document AI processor details
30
  project_id = "herbaria-ai"
31
  location = "us"
32
  processor_id = "4307b078717a399a"
33
 
34
  def translate_text(text, target_language="en"):
35
- translate_client = translate.Client(credentials=credentials)
36
  result = translate_client.translate(text, target_language=target_language)
37
  return result["translatedText"]
38
 
39
  def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
40
- logging.info(f"Processing document {file_path}.")
41
- opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com", credentials=credentials)
42
  client = documentai.DocumentProcessorServiceClient(client_options=opts)
43
-
44
  with open(file_path, "rb") as file_stream:
45
  raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
46
 
47
  name = client.processor_path(project_id, location, processor_id)
48
  request = documentai.ProcessRequest(name=name, raw_document=raw_document)
49
  result = client.process_document(request=request)
50
-
51
  extracted_text = result.document.text
52
  translated_text = translate_text(extracted_text)
53
- logging.info(f"Document processed and translated for {file_path}.")
54
  return extracted_text, translated_text
55
 
56
  def unzip_and_find_jpgs(file_path):
57
- logging.info(f"Unzipping file {file_path}.")
58
  extract_path = "extracted_files"
59
  os.makedirs(extract_path, exist_ok=True)
60
  jpg_files = []
@@ -67,25 +53,21 @@ def unzip_and_find_jpgs(file_path):
67
  if file.lower().endswith('.jpg'):
68
  full_path = os.path.join(root, file)
69
  jpg_files.append(full_path)
70
- logging.info(f"Found {len(jpg_files)} JPG files in {file_path}.")
71
  return jpg_files
72
 
73
  def process_images(uploaded_file):
74
- logging.info("Started processing the uploaded file.") # Check if the function is triggered
75
  global results_df
76
- results_df = results_df.iloc[0:0] # Clear the DataFrame
77
- file_path = uploaded_file.name # Gradio provides the file path
78
- logging.info(f"Received file {file_path} for processing.")
79
 
80
  try:
81
  image_files = unzip_and_find_jpgs(file_path)
82
-
83
  if not image_files:
84
- logging.warning("No JPG files found in the zip.")
85
  return "No JPG files found in the zip."
86
 
87
  for file_path in image_files:
88
- logging.info(f"Processing image file {file_path}.")
89
  extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
90
  new_row = pd.DataFrame([{
91
  "Filename": os.path.basename(file_path),
@@ -93,23 +75,18 @@ def process_images(uploaded_file):
93
  "Translated Text": translated_text
94
  }])
95
  results_df = pd.concat([results_df, new_row], ignore_index=True)
96
- logging.info(f"Data added for file {file_path}.")
97
  except Exception as e:
98
- logging.error(f"An error occurred: {str(e)}")
99
  return f"An error occurred: {str(e)}"
100
 
101
- logging.info("Processing complete. Generating HTML output.")
102
  return results_df.to_html()
103
 
104
- # Set up the interface
105
- interface = Interface(
106
  fn=process_images,
107
- inputs=gr.components.File(label="Upload ZIP File", type="file"),
108
  outputs="html",
109
  title="Document AI Translation",
110
- description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image.",
111
- debug=True
112
  )
113
 
114
  if __name__ == "__main__":
115
- interface.launch(debug=True)
 
 
1
  import os
2
+ # Upload credential json file from default compute service account
3
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
4
+
5
+ import pandas as pd
 
 
6
  from google.api_core.client_options import ClientOptions
7
  from google.cloud import documentai_v1 as documentai
8
  from google.cloud.documentai_v1.types import RawDocument
9
  from google.cloud import translate_v2 as translate
10
+ import zipfile
11
+ import os
12
+ import io
13
+ import gradio as gr
 
 
 
 
 
 
 
 
14
 
15
  # Global DataFrame declaration
16
  results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])
17
 
18
+ # Set your Google Cloud Document AI processor details here
19
  project_id = "herbaria-ai"
20
  location = "us"
21
  processor_id = "4307b078717a399a"
22
 
23
  def translate_text(text, target_language="en"):
24
+ translate_client = translate.Client()
25
  result = translate_client.translate(text, target_language=target_language)
26
  return result["translatedText"]
27
 
28
  def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
29
+ opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
 
30
  client = documentai.DocumentProcessorServiceClient(client_options=opts)
31
+
32
  with open(file_path, "rb") as file_stream:
33
  raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
34
 
35
  name = client.processor_path(project_id, location, processor_id)
36
  request = documentai.ProcessRequest(name=name, raw_document=raw_document)
37
  result = client.process_document(request=request)
38
+
39
  extracted_text = result.document.text
40
  translated_text = translate_text(extracted_text)
 
41
  return extracted_text, translated_text
42
 
43
  def unzip_and_find_jpgs(file_path):
 
44
  extract_path = "extracted_files"
45
  os.makedirs(extract_path, exist_ok=True)
46
  jpg_files = []
 
53
  if file.lower().endswith('.jpg'):
54
  full_path = os.path.join(root, file)
55
  jpg_files.append(full_path)
 
56
  return jpg_files
57
 
58
  def process_images(uploaded_file):
 
59
  global results_df
60
+ results_df = results_df.iloc[0:0] # Clear the DataFrame if re-running this cell
61
+
62
+ file_path = uploaded_file.name # Gradio provides the file path through the .name attribute
63
 
64
  try:
65
  image_files = unzip_and_find_jpgs(file_path)
66
+
67
  if not image_files:
 
68
  return "No JPG files found in the zip."
69
 
70
  for file_path in image_files:
 
71
  extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
72
  new_row = pd.DataFrame([{
73
  "Filename": os.path.basename(file_path),
 
75
  "Translated Text": translated_text
76
  }])
77
  results_df = pd.concat([results_df, new_row], ignore_index=True)
 
78
  except Exception as e:
 
79
  return f"An error occurred: {str(e)}"
80
 
 
81
  return results_df.to_html()
82
 
83
+ interface = gr.Interface(
 
84
  fn=process_images,
85
+ inputs="file",
86
  outputs="html",
87
  title="Document AI Translation",
88
+ description="Upload a ZIP file containing JPEG/JPG images, and the system will extract and translate text from each image."
 
89
  )
90
 
91
  if __name__ == "__main__":
92
+ interface.launch(debug=True)