import pandas as pd
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
from google.cloud.documentai_v1.types import RawDocument
import zipfile
import os
import shutil
import gradio as gr
import tempfile
import textwrap
import json
import google.generativeai as genai
from IPython.display import Markdown
import random
import re
from time import sleep

# -------------
#     SETUP
# -------------

# CREDENTIALS FILE PATH
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "herbaria-ai-3c860bcb0f44.json"

# GEMINI API KEY
api_key = os.getenv("API_KEY")
genai.configure(api_key=api_key)
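# (assumes the key was exported beforehand, e.g. `export API_KEY="..."` in the shell)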

# GEMINI MODEL DECLARATION
model = genai.GenerativeModel('gemini-1.0-pro')

# DOCUMENT AI DETAILS
project_id = "herbaria-ai"
location = "us"
processor_id = "de954414712822b3"

# helper function for processing gemini responses (which are in markdown)
def to_markdown(text):
    text = text.replace('•', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
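
# For example, to_markdown("• item").data yields ">   * item": bullets become
# markdown list markers and every line is prefixed as a blockquote.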

# few-shot samples
shots = \
{
    "Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity ่ฅฟ่—่‡ชๆฒปๅŒบๅฑฑๅ—ๅธ‚ๆด›ๆ‰ŽๅŽฟๆ‹‰ๅบท้•‡ๅกไน…ๅฏบ้™„่ฟ‘ 28ยฐ5'37.15N, 91ยฐ7'24.74โ€ณE; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6657 14 September 2017 M4 5 6 7 8 NCIL ไธญๅ›ฝๆ•ฐๅญ—ๆค็‰ฉๆ ‡ๆœฌ้ฆ† Nยฐ 2604988 ่ฅฟ่— TIBET ไธญๅ›ฝ็ง‘ๅญฆ้™ข ๆค็‰ฉ็ ”็ฉถๆ‰€ ๆ ‡ๆœฌ้ฆ† PE CHINESE NATIONAL HERBARIUM (PE) 02334122 #PE6657 ASTERACEAE ่Š็ง‘ Anaphalis contorta (D. Don) Hook. f. ้‰ดๅฎšไบบ:ๅผ ๅ›ฝ่ฟ› Guo-Jin ZHANG ๆ—‹ๅถ้ฆ™้’ 17 May 2018"
    :{"Collector":"Guo-Jin, Zhang",
      "Location":"Xizang Autonomous Region, Shannan City, Lhozhag County, Lhakang Town, Kharchhu Gompa vincinity, Slopes near roadsides",
      "Taxon":"Asteraceae; Anaphalis contorta (D. Don) Hook. f.",
      "Date":"14 September 2017",
      "Confidence":".94"
    },

    "PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28ยฐ5'37.15"N, 91ยฐ7'24.74"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang CHINA, Xizang, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity 28ยฐ5'37.15"N, 91ยฐ7'24.74"E; 3934 m Herbs. Slopes near roadsides. PE-Xizang Expedition #PE6673 9 NSIT Chinese National Herbarium (PE) Plants of Xizang Spiral Leaf Green 17 May 2018"
    :{"Collector":"PE-Xizang Expedition #PE6673",
      "Location":"Xizang Autonomous Region, Lhoka City, Lhozhag County, Lhakang Town, Kharchhu Gompa vicinity, Slopes near roadsides",
      "Taxon":"Spiral Leaf Green",
      "Date":"17 May 2018",
      "Confidence":".76"
    },

    "Honey Plants Research Institute of the Chinese Academy of Agricultural Sciences Collection No.: 13687. May 7, 1993 Habitat Roadside Altitude: 1600 * Characters Shrub No. Herbarium of the Institute of Botany, Chinese Academy of Sciences Collector 3687 Scientific Name Height: m (cm) Diameter at breast height m (cm) Flower: White Fruit: Notes Blooming period: from January to July Honey: Scientific Name: Rosa Sericea Lindl. Appendix: Collector: cm 1 2 3 4 25 CHINESE NATIONAL HERBARUM ( 01833954 No 1479566 * Herbarium of the Institute of Botany, Chinese Academy of Sciences Sichuan SZECHUAN DET. Rosa sercea Lindl. var. Various Zhi 2009-02-16"
    :{"Collector":"UNKNOWN",
      "Location":"Sichuan, China, Roadside, Altitude: 1600",
      "Taxon":"Rosa sericea",
      "Date":"7 May 1993",
      "Confidence":".45"
    },
}

# ------------
#     FUNC
# ------------

# few-shot randomizer
def get_random_pairs_list(input_dict, num_pairs=3):
    if len(input_dict) < num_pairs:
        raise ValueError("Not enough elements in the dictionary to select the requested number of pairs")
    keys = random.sample(list(input_dict.keys()), num_pairs)
    return [(key, input_dict[key]) for key in keys]
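
# e.g. get_random_pairs_list(shots) returns something like
# [(ocr_text_a, {"Collector": ..., "Location": ..., ...}),
#  (ocr_text_b, {...}),
#  (ocr_text_c, {...})]
# with the three shots drawn in a fresh random order on every call.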

# main gemini processor
def generate_metadata(results_df, shots):
    responses = []
    for input_text in results_df["extracted_text"]:

        # FEW-SHOT RANDOMIZER
        random_pairs = get_random_pairs_list(shots)

        # PROMPT FORMATTING
        prompt = \
        """
        Your goal is to translate (if necessary) and then extract four items from a
        string of text: the name of the specimen collector, the location, the taxon
        and/or any identifying information about the specimen, and the earliest date.
        Your response should contain only JSON. Use the best information available
        or insert 'UNKNOWN' if there is none. Provide a rough estimate of confidence
        in your output ranging from 0-1 inside your JSON output.

        Examples:

        Input 1:
        {shot1_input}
        Output 1:
        {{"Collector":"{shot1_output_collector}","Location":"{shot1_output_location}","Taxon":"{shot1_output_taxon}","Date":"{shot1_output_date}","Confidence":"{shot1_confidence}"}}

        Input 2:
        {shot2_input}
        Output 2:
        {{"Collector":"{shot2_output_collector}","Location":"{shot2_output_location}","Taxon":"{shot2_output_taxon}","Date":"{shot2_output_date},"Confidence":"{shot2_confidence}"}}

        Input 3:
        {shot3_input}
        Output 3:
        {{"Collector":"{shot3_output_collector}","Location":"{shot3_output_location}","Taxon":"{shot3_output_taxon}","Date":"{shot3_output_date},"Confidence":"{shot3_confidence}"}}

        Your attempt:
        Input:
        {input_text}
        Output:

        """.format(
        shot1_input = random_pairs[0][0],
        shot1_output_collector = random_pairs[0][1]['Collector'],
        shot1_output_location = random_pairs[0][1]['Location'],
        shot1_output_taxon = random_pairs[0][1]['Taxon'],
        shot1_output_date = random_pairs[0][1]['Date'],
        shot1_confidence = random_pairs[0][1]['Confidence'],

        shot2_input = random_pairs[1][0],
        shot2_output_collector = random_pairs[1][1]['Collector'],
        shot2_output_location = random_pairs[1][1]['Location'],
        shot2_output_taxon = random_pairs[1][1]['Taxon'],
        shot2_output_date = random_pairs[1][1]['Date'],
        shot2_confidence = random_pairs[1][1]['Confidence'],

        shot3_input = random_pairs[2][0],
        shot3_output_collector = random_pairs[2][1]['Collector'],
        shot3_output_location = random_pairs[2][1]['Location'],
        shot3_output_taxon = random_pairs[2][1]['Taxon'],
        shot3_output_date = random_pairs[2][1]['Date'],
        shot3_confidence = random_pairs[2][1]['Confidence'],

        input_text = input_text
        )

        response = model.generate_content(prompt)
        responses.append(response)

    return responses
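
# Gemini calls can fail transiently (rate limits, timeouts). A minimal retry
# sketch, assuming transient failures surface as exceptions; it could stand in
# for the bare model.generate_content(prompt) call above:
def generate_with_retry(prompt, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return model.generate_content(prompt)
        except Exception:
            if attempt == retries - 1:
                raise
            sleep(delay * (attempt + 1))  # simple linear backoff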

# gemini response handler
def process_responses(responses):
    text_responses = []
    for response in responses:
        extracted_text = to_markdown(response.text).data
        text_responses.append(extracted_text.strip().replace('>', '')[1:])

    json_responses = []
    for i, text in enumerate(text_responses):
        try:
            match = re.search(r'{.*}', text, re.DOTALL)
            if match is None:
                raise ValueError("no JSON object found in response")
            json_responses.append(json.loads(match.group()))
        except (json.JSONDecodeError, ValueError) as e:
            print("Failed on input", i, "| Reason:", e)
            # append a placeholder so row indices stay aligned with results_df
            json_responses.append({})

    return json_responses
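
# Note: the r'{.*}' search is greedy and DOTALL, so even a reply wrapped in
# markdown code fences still yields everything between the outermost braces.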

# main document AI processor
def batch_process_documents(file_path: str, file_mime_type: str) -> str:
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    with open(file_path, "rb") as file_stream:
        raw_document = RawDocument(content=file_stream.read(), mime_type=file_mime_type)

    name = client.processor_path(project_id, location, processor_id)
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)
    result = client.process_document(request=request)

    extracted_text = result.document.text.replace('\n', ' ')
    return extracted_text
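
# Despite its name, batch_process_documents issues one synchronous
# process_document call per file. Example (hypothetical path):
#   batch_process_documents("extracted_files/IMG_0001.jpg", "image/jpeg")
# returns the OCR'd label text as a single line (newlines become spaces).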

# file upload
def unzip_and_find_jpgs(file_path):
    extract_path = "extracted_files"
    if os.path.exists(extract_path):
        shutil.rmtree(extract_path)  # clear leftovers from any previous run

    os.makedirs(extract_path, exist_ok=True)
    jpg_files = []
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
        for root, dirs, files in os.walk(extract_path):
            if '__MACOSX' in root:
                continue
            for file in files:
                if file.lower().endswith('.jpg'):
                    full_path = os.path.join(root, file)
                    jpg_files.append(full_path)
    return jpg_files
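
# e.g. unzip_and_find_jpgs("upload.zip") might return
# ["extracted_files/photos/IMG_0001.jpg", ...] (hypothetical names);
# macOS resource-fork folders (__MACOSX) are skipped.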

# ------------
#     MAIN
# ------------

def process_images(uploaded_file):
    # make new dataframe each time this function is called
    results_df = pd.DataFrame(columns=["filename", "collector", "location", "taxon", "date", "confidence", "extracted_text"]) 

    # easy gradio filename storage
    file_path = uploaded_file.name

    try:
        image_files = unzip_and_find_jpgs(file_path)

        if not image_files:
            return "No JPG files found in the zip."
        
        print(image_files)

        for file_path in image_files:
            # DOCUMENT AI PROCESSING IS HERE
            extracted_text = batch_process_documents(file_path, "image/jpeg")
            new_row = pd.DataFrame([{
                "filename": os.path.basename(file_path),
                "extracted_text": extracted_text
            }])
            results_df = pd.concat([results_df, new_row], ignore_index=True)

        # GEMINI PROCESSING IS HERE
        responses = generate_metadata(results_df, shots)
        processed_data = process_responses(responses)

        # append extracted metadata
        for idx, processed in enumerate(processed_data):
            results_df.at[idx, "collector"] = processed.get("Collector", "")
            results_df.at[idx, "location"] = processed.get("Location", "")
            results_df.at[idx, "taxon"] = processed.get("Taxon", "")
            results_df.at[idx, "date"] = processed.get("Date", "")
            results_df.at[idx, "confidence"] = processed.get("Confidence", "")

    except Exception as e:
        return f"An error occurred: {str(e)} on file {file_path}"

    html_output = results_df.to_html()

    # CSV saving (with temp file)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    results_df.to_csv(temp_file.name, index=False)

    temp_file.close()

    # return HTML and output CSV path
    return html_output, temp_file.name

# -----------
#     UI
# -----------

with gr.Blocks() as interface:
    with gr.Row():
        gr.Markdown("# Herbaria Batch Metadata Extraction")
        gr.Markdown("Upload a ZIP file containing JPEG/JPG images, and the system will translate and extract the text from each image.")
    with gr.Row():
        file_input = gr.File(label="Upload ZIP File")
    with gr.Row():
        html_output = gr.HTML(label="Extracted Text From Your Herbaria Images")
    with gr.Row():
        file_output = gr.File(label="Download the extracted label metadata as a CSV file.")

    file_input.change(process_images, inputs=file_input, outputs=[html_output, file_output])

if __name__ == "__main__":
    interface.launch(debug=True)