mkaramb committed
Commit 3801b0c · verified · 1 Parent(s): db98c0d

Update app.py

Files changed (1):
  1. app.py +195 -23
app.py CHANGED
@@ -1,28 +1,175 @@
  import os
- # Upload credential json file from default compute service account
- os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
-
  import pandas as pd
  from google.api_core.client_options import ClientOptions
  from google.cloud import documentai_v1 as documentai
  from google.cloud.documentai_v1.types import RawDocument
- from google.cloud import translate_v2 as translate
  import zipfile
- import os
- import io
  import gradio as gr
  import tempfile
  # Set your Google Cloud Document AI processor details here
  project_id = "herbaria-ai"
  location = "us"
  processor_id = "de954414712822b3"

- def translate_text(text, target_language="en"):
-     translate_client = translate.Client()
-     result = translate_client.translate(text, target_language=target_language)
-     return result["translatedText"]
  def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
      opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
      client = documentai.DocumentProcessorServiceClient(client_options=opts)
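The diff elides the middle of this function. For orientation, the standard documentai_v1 single-image flow that such a body usually contains looks roughly like this (a sketch under that assumption, not the file's actual code):

    # sketch: typical Document AI raw-document processing
    name = client.processor_path(project_id, location, processor_id)
    with open(file_path, "rb") as f:
        raw_document = RawDocument(content=f.read(), mime_type=file_mime_type)
    result = client.process_document(
        request=documentai.ProcessRequest(name=name, raw_document=raw_document)
    )
    extracted_text = result.document.text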
@@ -38,6 +185,7 @@ def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
      translated_text = translate_text(extracted_text)
      return extracted_text, translated_text

  def unzip_and_find_jpgs(file_path):
      extract_path = "extracted_files"
      if os.path.exists(extract_path):
@@ -62,40 +210,64 @@ def unzip_and_find_jpgs(file_path):
          jpg_files.append(full_path)
      return jpg_files

  def process_images(uploaded_file):
-     # Reinitialize the DataFrame each time this function is called
-     results_df = pd.DataFrame(columns=["Filename", "Extracted Text", "Translated Text"])

-     file_path = uploaded_file.name  # Gradio provides the file path through the .name attribute

      try:
          image_files = unzip_and_find_jpgs(file_path)

          if not image_files:
              return "No JPG files found in the zip."

          for file_path in image_files:
-             extracted_text, translated_text = batch_process_documents(file_path, "image/jpeg")
              new_row = pd.DataFrame([{
-                 "Filename": os.path.basename(file_path),
-                 "Extracted Text": extracted_text,
-                 "Translated Text": translated_text
              }])
              results_df = pd.concat([results_df, new_row], ignore_index=True)
      except Exception as e:
-         return f"An error occurred: {str(e)}"

      html_output = results_df.to_html()

-     # Save DataFrame to a temporary CSV file for download
-     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")  # Create a temp file
-     results_df.to_csv(temp_file.name, index=False)  # Save DataFrame to CSV

-     temp_file.close()  # Close the file

-     # Return HTML and the path to the CSV file
      return html_output, temp_file.name

  with gr.Blocks() as interface:
      with gr.Row():
          gr.Markdown("# Document AI Translation")
 
app.py (updated file, additions marked with +):

  import os
  import pandas as pd
  from google.api_core.client_options import ClientOptions
  from google.cloud import documentai_v1 as documentai
  from google.cloud.documentai_v1.types import RawDocument
  import zipfile
  import gradio as gr
  import tempfile
+ import textwrap
+ import json
+ import google.generativeai as genai
+ from IPython.display import Markdown
+ import random
+ import re
+ from time import sleep
+
+ # —————————————
+ # SETUP
+ # —————————————
+
+ # CREDENTIALS FILE PATH
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
+
+ # GEMINI API KEY
+ genai.configure(api_key='AIzaSyB9iHlqAgz5TEF36Kg_fJLJvoIDCJkqwJI')
+
+ # GEMINI MODEL DECLARATION
+ model = genai.GenerativeModel('gemini-1.0-pro')
+
+ # DOCUMENT AI DETAILS
+ project_id = "herbaria-ai"
+ location = "us"
+ processor_id = "de954414712822b3"

  # Set your Google Cloud Document AI processor details here
  project_id = "herbaria-ai"
  location = "us"
  processor_id = "de954414712822b3"

+ # helper function for processing gemini responses (which are in markdown)
+ def to_markdown(text):
+     text = text.replace('•', ' *')
+     return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
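# Illustrative aside (not a line of app.py): to_markdown("hello").data == "> hello",
# i.e. every line of the reply gets a "> " prefix, which process_responses() below
# strips back off before parsing the JSON.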
+
+ # few-shot samples
+ shots = \
+ {
+     "Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 西藏自治区山南市洛扎县拉康镇卡久寺附近 28°5'37.15N, 91°7'24.74″E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6657 14 September 2017 M4 5 6 7 8 NCIL 中国数字植物标本馆 N° 2604988 西藏 TIBET 中国科学院 植物研究所 标本馆 PE CHINESE NATIONAL HERBARIUM (PE) 02334122 #PE6657 ASTERACEAE 菊科 Anaphalis contorta (D. Don) Hook. f. 鉴定人:张国进 Guo-Jin ZHANG 旋叶香青 17 May 2018"
+     :{"Collector":"Guo-Jin, Zhang",
+       "Location":"Xizang Autonomous Region, Shannan City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity, Slopes near roadsides",
+       "Taxon":"Asteraceae; Anaphalis contorta (D. Don) Hook. f.",
+       "Date":"14 September 2017",
+       "Confidence":".94"
+      },
+
+     "PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang Spiral Leaf Green 17 May 2018"
+     :{"Collector":"PE-Xizang Expedition #PE6673",
+       "Location":"Xizang Autonomous Region, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity, Slopes near roadsides",
+       "Taxon":"Spiral Leaf Green",
+       "Date":"17 May 2018",
+       "Confidence":".76"
+      },
+
+     "Honey Plants Research Institute of the Chinese Academy of Agricultural Sciences Collection No.: 13687. May 7, 1993 Habitat Roadside Altitude: 1600 * Characters Shrub No. Herbarium of the Institute of Botany, Chinese Academy of Sciences Collector 3687 Scientific Name Height: m (cm) Diameter at breast height m (cm) Flower: White Fruit: Notes Blooming period: from January to July Honey: Scientific Name: Rosa Sericea Lindl. Appendix: Collector: cm 1 2 3 4 25 CHINESE NATIONAL HERBARUM ( 01833954 No 1479566 * Herbarium of the Institute of Botany, Chinese Academy of Sciences Sichuan SZECHUAN DET. Rosa sercea Lindl. var. Various Zhi 2009-02-16"
+     :{"Collector":"UNKNOWN",
+       "Location":"Sichuan, China, Roadside, Altitude: 1600",
+       "Taxon":"Rosa sericea",
+       "Date":"7 May 1993",
+       "Confidence":".45"
+      },
+ }
+
+ # ————————————
+ # FUNC
+ # ————————————
+
+ # few-shot randomizer
+ def get_random_pairs_list(input_dict, num_pairs=3):
+     if len(input_dict) < num_pairs:
+         return "Not enough elements in the dictionary to select the requested number of pairs"
+     keys = random.sample(list(input_dict.keys()), num_pairs)
+     return [(key, input_dict[key]) for key in keys]
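# Illustrative aside (not a line of app.py): with the three-entry `shots` dict
# above, get_random_pairs_list(shots) returns all three (ocr_text, metadata)
# pairs in random order, so each prompt presents the examples differently, e.g.:
#   pairs = get_random_pairs_list(shots)
#   pairs[0][1]["Collector"]   # one of "Guo-Jin, Zhang",
#                              # "PE-Xizang Expedition #PE6673", or "UNKNOWN"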
+
+ # main gemini processor
+ def generate_metadata(results_df, shots):
+     responses = []
+     for input_text in results_df["extracted_text"]:
+
+         # FEW-SHOT RANDOMIZER
+         random_pairs = get_random_pairs_list(shots)
+
+         # PROMPT FORMATTING
+         prompt = \
+ """
+ Your goal is to translate (if necessary) and then extract four items from a
+ string of text: the name of the specimen collector, the location, the taxon
+ and/or any identifying information about the specimen, and the earliest date.
+ Your response should contain only JSON. Use the best information available
+ or insert 'UNKNOWN' if there is none. Provide a rough estimate of confidence
+ in your output ranging from 0-1 inside your JSON output.
+
+ Examples:
+
+ Input 1:
+ {shot1_input}
+ Output 1:
+ {{"Collector":"{shot1_output_collector}","Location":"{shot1_output_location}","Taxon":"{shot1_output_taxon}","Date":"{shot1_output_date}","Confidence":"{shot1_confidence}"}}
+
+ Input 2:
+ {shot2_input}
+ Output 2:
+ {{"Collector":"{shot2_output_collector}","Location":"{shot2_output_location}","Taxon":"{shot2_output_taxon}","Date":"{shot2_output_date}","Confidence":"{shot2_confidence}"}}
+
+ Input 3:
+ {shot3_input}
+ Output 3:
+ {{"Collector":"{shot3_output_collector}","Location":"{shot3_output_location}","Taxon":"{shot3_output_taxon}","Date":"{shot3_output_date}","Confidence":"{shot3_confidence}"}}
+
+ Your attempt:
+ Input:
+ {input_text}
+ Output:
+
+ """.format(
+             shot1_input = random_pairs[0][0],
+             shot1_output_collector = random_pairs[0][1]['Collector'],
+             shot1_output_location = random_pairs[0][1]['Location'],
+             shot1_output_taxon = random_pairs[0][1]['Taxon'],
+             shot1_output_date = random_pairs[0][1]['Date'],
+             shot1_confidence = random_pairs[0][1]['Confidence'],
+
+             shot2_input = random_pairs[1][0],
+             shot2_output_collector = random_pairs[1][1]['Collector'],
+             shot2_output_location = random_pairs[1][1]['Location'],
+             shot2_output_taxon = random_pairs[1][1]['Taxon'],
+             shot2_output_date = random_pairs[1][1]['Date'],
+             shot2_confidence = random_pairs[1][1]['Confidence'],
+
+             shot3_input = random_pairs[2][0],
+             shot3_output_collector = random_pairs[2][1]['Collector'],
+             shot3_output_location = random_pairs[2][1]['Location'],
+             shot3_output_taxon = random_pairs[2][1]['Taxon'],
+             shot3_output_date = random_pairs[2][1]['Date'],
+             shot3_confidence = random_pairs[2][1]['Confidence'],
+
+             input_text = input_text
+         )
+
+         response = model.generate_content(prompt)
+         responses.append(response)
+
+     return responses
+
+ # gemini response handler
+ def process_responses(responses):
+     text_responses = []
+     for response in responses:
+         extracted_text = to_markdown(response.text).data
+         text_responses.append(extracted_text.strip().replace('>', '')[1:])
+
+     json_responses = []
+     for text in text_responses:
+         try:
+             json_response = json.loads(re.search(r'{.*}', text, re.DOTALL).group())
+             json_responses.append(json_response)
+         except json.JSONDecodeError as e:
+             print("Failed on input", text_responses.index(text), "| Reason:", e)
+             continue
+
+     return json_responses
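# Illustrative aside (not a line of app.py): re.search(r'{.*}', text, re.DOTALL)
# pulls the outermost {...} span out of the reply, so extra prose around the JSON
# does not break parsing. For example:
#   text = 'Sure! {"Collector":"UNKNOWN","Confidence":".5"}'
#   json.loads(re.search(r'{.*}', text, re.DOTALL).group())
#   # -> {'Collector': 'UNKNOWN', 'Confidence': '.5'}
# Note that a reply with no braces at all would make re.search return None and
# raise AttributeError, which the except clause above does not catch.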
+
+ # main document AI processor
  def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
      opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
      client = documentai.DocumentProcessorServiceClient(client_options=opts)

@@ -38,6 +185,7 @@ def batch_process_documents(file_path: str, file_mime_type: str) -> tuple:
      translated_text = translate_text(extracted_text)
      return extracted_text, translated_text

+ # file upload
  def unzip_and_find_jpgs(file_path):
      extract_path = "extracted_files"
      if os.path.exists(extract_path):

@@ -62,40 +210,64 @@ def unzip_and_find_jpgs(file_path):
          jpg_files.append(full_path)
      return jpg_files

+ # ————————————
+ # MAIN
+ # ————————————
+
  def process_images(uploaded_file):
+     # make new dataframe each time this function is called
+     results_df = pd.DataFrame(columns=["filename", "collector", "location", "taxon", "date", "confidence", "extracted_text"])
+
+     # easy gradio filename storage
+     file_path = uploaded_file.name

      try:
          image_files = unzip_and_find_jpgs(file_path)

          if not image_files:
              return "No JPG files found in the zip."
+
+         print(image_files)

          for file_path in image_files:
+             # DOCUMENT AI PROCESSING IS HERE
+             extracted_text = batch_process_documents(file_path, "image/jpeg")
              new_row = pd.DataFrame([{
+                 "filename": os.path.basename(file_path),
+                 "extracted_text": extracted_text
              }])
              results_df = pd.concat([results_df, new_row], ignore_index=True)
+
+         # GEMINI PROCESSING IS HERE
+         responses = generate_metadata(results_df, shots)
+         processed_data = process_responses(responses)
+
+         # append extracted metadata
+         for idx, processed in enumerate(processed_data):
+             results_df.at[idx, "collector"] = processed.get("Collector", "")
+             results_df.at[idx, "location"] = processed.get("Location", "")
+             results_df.at[idx, "taxon"] = processed.get("Taxon", "")
+             results_df.at[idx, "date"] = processed.get("Date", "")
+             results_df.at[idx, "confidence"] = processed.get("Confidence", "")
+
      except Exception as e:
+         return f"An error occurred: {str(e)} on file {file_path}"

      html_output = results_df.to_html()

+     # CSV saving (with temp file)
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+     results_df.to_csv(temp_file.name, index=False)

      temp_file.close()

+     # return HTML and output CSV path
      return html_output, temp_file.name

+ # ———————————
+ # UI
+ # ———————————
+
  with gr.Blocks() as interface:
      with gr.Row():
          gr.Markdown("# Document AI Translation")