import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
import zipfile
import os
import shutil
import gradio as gr
import tempfile
import textwrap
import json
import google.generativeai as genai
from IPython.display import Markdown
import random
import re
from time import sleep
# ─────────────
# SETUP
# ─────────────
# CREDENTIALS FILE PATH
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"
# GEMINI API KEY
api_key = os.getenv("API_KEY")
genai.configure(api_key=api_key)
# GEMINI MODEL DECLARATION
model = genai.GenerativeModel('gemini-1.0-pro')
# DOCUMENT AI DETAILS
project_id = "herbaria-ai"
location = "us"
processor_id = "de954414712822b3"
# helper function for processing gemini responses (which are in markdown)
def to_markdown(text):
    text = text.replace('•', ' *')
return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
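# For example — assuming a plain-text Gemini reply — to_markdown("• a").data
# yields ">  * a": bullets become Markdown list items and every line gains a
# '> ' blockquote prefix.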
# few-shot samples
shots = \
{
"Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 西藏自治区山南市洛扎县拉康镇卡久寺附近 28°5'37.15″N, 91°7'24.74″E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6657 14 September 2017 M4 5 6 7 8 NCIL 中国数字植物标本馆 N° 2604988 西藏 TIBET 中国科学院 植物研究所 标本馆 PE CHINESE NATIONAL HERBARIUM (PE) 02334122 #PE6657 ASTERACEAE 菊科 Anaphalis contorta (D. Don) Hook. f. 鉴定人:张国进 Guo-Jin ZHANG 旋叶香青 17 May 2018"
:{"Collector":"Guo-Jin, Zhang",
"Location":"Xizang Autonomous Region, Shannan City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity, Slopes near roadsides",
"Taxon":"Asteraceae; Anaphalis contorta (D. Don) Hook. f.",
"Date":"14 September 2017",
"Confidence":".94"
},
"PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28°5'37.15\"N, 91°7'24.74\"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang Spiral Leaf Green 17 May 2018"
:{"Collector":"PE-Xizang Expedition #PE6673",
"Location":"Xizang Autonomous Region, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity, Slopes near roadsides",
"Taxon":"Spiral Leaf Green",
"Date":"17 May 2018",
"Confidence":".76"
},
"Honey Plants Research Institute of the Chinese Academy of Agricultural Sciences Collection No.: 13687. May 7, 1993 Habitat Roadside Altitude: 1600 * Characters Shrub No. Herbarium of the Institute of Botany, Chinese Academy of Sciences Collector 3687 Scientific Name Height: m (cm) Diameter at breast height m (cm) Flower: White Fruit: Notes Blooming period: from January to July Honey: Scientific Name: Rosa Sericea Lindl. Appendix: Collector: cm 1 2 3 4 25 CHINESE NATIONAL HERBARUM ( 01833954 No 1479566 * Herbarium of the Institute of Botany, Chinese Academy of Sciences Sichuan SZECHUAN DET. Rosa sercea Lindl. var. Various Zhi 2009-02-16"
:{"Collector":"UNKNOWN",
"Location":"Sichuan, China, Roadside, Altitude: 1600",
"Taxon":"Rosa sericea",
"Date":"7 May 1993",
"Confidence":".45"
},
}
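# Each shots key is the raw OCR text of one specimen label; each value is the
# target JSON the model should produce for it. For example, shots[key]["Taxon"]
# holds the expected taxon string for the matching input text.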
# ────────────
# FUNC
# ────────────
# few-shot randomizer
def get_random_pairs_list(input_dict, num_pairs=3):
    if len(input_dict) < num_pairs:
        # raising keeps callers from indexing into an error string as if it were a list
        raise ValueError("Not enough elements in the dictionary to select the requested number of pairs")
keys = random.sample(list(input_dict.keys()), num_pairs)
return [(key, input_dict[key]) for key in keys]
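# Example: get_random_pairs_list(shots) returns three (input_text, expected_json)
# tuples in random order, e.g. [(key1, {"Collector": ..., "Confidence": ...}), ...].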
# main gemini processor
def generate_metadata(results_df, shots):
responses = []
for input_text in results_df["extracted_text"]:
# FEW-SHOT RANDOMIZER
random_pairs = get_random_pairs_list(shots)
# PROMPT FORMATTING
prompt = \
"""
Your goal is to translate (if necessary) and then extract four items from a
string of text: the name of the specimen collector, the location, the taxon
and/or any identifying information about the specimen, and the earliest date.
Your response should contain only JSON. Use the best information available
or insert 'UNKNOWN' if there is none. Provide a rough estimate of confidence
in your output ranging from 0-1 inside your JSON output.
Examples:
Input 1:
{shot1_input}
Output 1:
{{"Collector":"{shot1_output_collector}","Location":"{shot1_output_location}","Taxon":"{shot1_output_taxon}","Date":"{shot1_output_date}","Confidence":"{shot1_confidence}"}}
Input 2:
{shot2_input}
Output 2:
{{"Collector":"{shot2_output_collector}","Location":"{shot2_output_location}","Taxon":"{shot2_output_taxon}","Date":"{shot2_output_date}","Confidence":"{shot2_confidence}"}}
Input 3:
{shot3_input}
Output 3:
{{"Collector":"{shot3_output_collector}","Location":"{shot3_output_location}","Taxon":"{shot3_output_taxon}","Date":"{shot3_output_date}","Confidence":"{shot3_confidence}"}}
Your attempt:
Input:
{input_text}
Output:
""".format(
shot1_input = random_pairs[0][0],
shot1_output_collector = random_pairs[0][1]['Collector'],
shot1_output_location = random_pairs[0][1]['Location'],
shot1_output_taxon = random_pairs[0][1]['Taxon'],
shot1_output_date = random_pairs[0][1]['Date'],
shot1_confidence = random_pairs[0][1]['Confidence'],
shot2_input = random_pairs[1][0],
shot2_output_collector = random_pairs[1][1]['Collector'],
shot2_output_location = random_pairs[1][1]['Location'],
shot2_output_taxon = random_pairs[1][1]['Taxon'],
shot2_output_date = random_pairs[1][1]['Date'],
shot2_confidence = random_pairs[1][1]['Confidence'],
shot3_input = random_pairs[2][0],
shot3_output_collector = random_pairs[2][1]['Collector'],
shot3_output_location = random_pairs[2][1]['Location'],
shot3_output_taxon = random_pairs[2][1]['Taxon'],
shot3_output_date = random_pairs[2][1]['Date'],
shot3_confidence = random_pairs[2][1]['Confidence'],
input_text = input_text
)
response = model.generate_content(prompt)
responses.append(response)
return responses
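# `sleep` is imported above but never used; a minimal retry sketch (an
# assumption, not wired into generate_metadata) shows how transient Gemini API
# errors could be retried with a linear backoff:
def generate_with_retry(prompt, retries=3, delay=5):
    # call model.generate_content up to `retries` times, waiting longer each attempt
    for attempt in range(retries):
        try:
            return model.generate_content(prompt)
        except Exception:
            if attempt == retries - 1:
                raise
            sleep(delay * (attempt + 1))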
# gemini response handler
def process_responses(responses):
text_responses = []
for response in responses:
        # to_markdown wraps each line in '> ' blockquote markers; strip them back
        # out (the [1:] drops the leading space left behind)
        extracted_text = to_markdown(response.text).data
        text_responses.append(extracted_text.strip().replace('>', '')[1:])
json_responses = []
    for i, text in enumerate(text_responses):
        try:
            json_response = json.loads(re.search(r'{.*}', text, re.DOTALL).group())
            json_responses.append(json_response)
        except (AttributeError, json.JSONDecodeError) as e:
            # AttributeError covers the case where no {...} block is found at all;
            # append an empty dict so indices stay aligned with results_df rows
            print("Failed on input", i, "| Reason:", e)
            json_responses.append({})
return json_responses
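# A successful parse yields one dict per image, shaped like the few-shot
# outputs above, e.g. {"Collector": "...", "Location": "...", "Taxon": "...",
# "Date": "...", "Confidence": ".94"}; failed parses become empty dicts.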
# main document AI processor
def batch_process_documents(file_path: str, file_mime_type: str) -> str:
opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)
with open(file_path, "rb") as file_stream:
raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)
name = client.processor_path(project_id, location, processor_id)
request = documentai.ProcessRequest(name=name, raw_document=raw_document)
result = client.process_document(request=request)
extracted_text = result.document.text.replace('\n', ' ')
return extracted_text
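# Example (the filename is hypothetical):
#   text = batch_process_documents("extracted_files/label01.jpg", "image/jpeg")
# Despite the name, each call sends a single document synchronously and returns
# its OCR text with newlines flattened to spaces.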
# file upload
def unzip_and_find_jpgs(file_path):
extract_path = "extracted_files"
    if os.path.exists(extract_path):
        # clear leftovers from any previous run
        shutil.rmtree(extract_path)
os.makedirs(extract_path, exist_ok=True)
jpg_files = []
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(extract_path)
        for root, dirs, files in os.walk(extract_path):
            if '__MACOSX' in root:
                continue
            for file in files:
                # the UI promises JPEG/JPG support, so accept both extensions
                if file.lower().endswith(('.jpg', '.jpeg')):
                    full_path = os.path.join(root, file)
                    jpg_files.append(full_path)
return jpg_files
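# Example: unzip_and_find_jpgs("specimens.zip") (a hypothetical upload) extracts
# the archive into extracted_files/ and returns paths like
# ["extracted_files/label01.jpg", ...], skipping macOS resource-fork folders.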
# ────────────
# MAIN
# ────────────
def process_images(uploaded_file):
# make new dataframe each time this function is called
results_df = pd.DataFrame(columns=["filename", "collector", "location", "taxon", "date", "confidence", "extracted_text"])
    # Gradio exposes the uploaded file's temp path via .name
    file_path = uploaded_file.name
try:
image_files = unzip_and_find_jpgs(file_path)
        if not image_files:
            # two outputs are wired up in the UI, so fill the file slot with None
            return "No JPG files found in the zip.", None
print(image_files)
for file_path in image_files:
# DOCUMENT AI PROCESSING IS HERE
extracted_text = batch_process_documents(file_path, "image/jpeg")
new_row = pd.DataFrame([{
"filename": os.path.basename(file_path),
"extracted_text": extracted_text
}])
results_df = pd.concat([results_df, new_row], ignore_index=True)
# GEMINI PROCESSING IS HERE
responses = generate_metadata(results_df, shots)
processed_data = process_responses(responses)
# append extracted metadata
for idx, processed in enumerate(processed_data):
results_df.at[idx, "collector"] = processed.get("Collector", "")
results_df.at[idx, "location"] = processed.get("Location", "")
results_df.at[idx, "taxon"] = processed.get("Taxon", "")
results_df.at[idx, "date"] = processed.get("Date", "")
results_df.at[idx, "confidence"] = processed.get("Confidence", "")
    except Exception as e:
        # match the two-output signature expected by the Gradio wiring below
        return f"An error occurred: {str(e)} on file {file_path}", None
html_output = results_df.to_html()
# CSV saving (with temp file)
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
results_df.to_csv(temp_file.name, index=False)
temp_file.close()
# return HTML and output CSV path
return html_output, temp_file.name
# ───────────
# UI
# ───────────
with gr.Blocks() as interface:
with gr.Row():
gr.Markdown("# Herbaria Batch Metadata Extraction")
gr.Markdown("Upload a ZIP file containing JPEG/JPG images, and the system will translate and extract the text from each image.")
with gr.Row():
file_input = gr.File(label="Upload ZIP File")
with gr.Row():
html_output = gr.HTML(label="Extracted Text From Your Herbaria Images")
with gr.Row():
file_output = gr.File(label="Download this file to receive the extracted labels from the images.")
file_input.change(process_images, inputs=file_input, outputs=[html_output, file_output])
if __name__ == "__main__":
    interface.launch(debug=True)