File size: 5,031 Bytes
10364bf
c2ba438
36703af
c2ba438
 
32389f0
c2ba438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10364bf
c2ba438
 
 
 
 
 
 
10364bf
c2ba438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5d3a749
c2ba438
 
 
 
 
 
 
 
 
 
 
 
 
 
f77ed0a
c2ba438
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ce75747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import json
import os

import gradio as gr
import pandas as pd
from PIL import Image
from google import genai

# Client and prompt setup
# The API key is read from the GOOGLE_API_KEY environment variable.
client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))
# Other models can be used, but their responses may be structured differently
# from what the parsing helpers in this file expect.
model_name = "gemini-2.0-flash-exp"
safety_settings = [
    genai.types.SafetySetting(
        category="HARM_CATEGORY_DANGEROUS_CONTENT",
        threshold="BLOCK_ONLY_HIGH",
    ),
]
# The key names and the box layout are spelled out because parse_info() looks
# entries up by exactly these keys ("box_2d", "label", "co2_grams",
# "explanation"), reads box_2d as [y1, x1, y2, x2] on a 0-1000 scale, and
# drops any entry that lacks one of the keys.
# NOTE(review): this instruction limits the reply to 5 objects while `prompt`
# below allows up to 10 -- confirm which limit is intended.
bounding_box_system_instructions = """Return bounding boxes as a JSON array. Each element must be an object with exactly these keys: "box_2d" ([ymin, xmin, ymax, xmax], normalized to 0-1000), "label", "co2_grams" (a number, in grams) and "explanation". Never return masks or code fencing. Limit to 5 objects."""
prompt = """Provide an estimation of how much CO2 is involved in all activities in this picture. Give CO2 in grams.

As examples, think of transport, smoking, meat, and other similar emission activities.
Do not provide actions that don't have CO2 emissions.

Be comprehensive, but don't list more than 10 objects. Detect the 2D bounding boxes of these activities,
including the label, the CO2 gram quantity, and a short explanation explaining the estimation
for each activity.
"""

def parse_json(json_output):
    """Parse the model's text reply as JSON, tolerating a Markdown code fence.

    If the reply contains a fence line, only the text between that line and
    the next "```" is parsed; otherwise the whole reply is parsed. An explicit
    "```json" fence is preferred; a bare "```" fence is used as a fallback.
    Fence lines are matched ignoring surrounding whitespace and letter case.

    Args:
        json_output: Raw text returned by the model.

    Returns:
        The decoded JSON value (whatever ``json.loads`` produces).

    Raises:
        json.JSONDecodeError: If the selected text is not valid JSON.
    """
    # Based on https://github.com/google-gemini/cookbook/blob/main/gemini-2/spatial_understanding.ipynb
    lines = json_output.splitlines()
    markers = [line.strip().lower() for line in lines]

    for opener in ("```json", "```"):
        if opener in markers:
            start = markers.index(opener) + 1
            fenced = "\n".join(lines[start:])  # Drop everything up to the opening fence
            json_output = fenced.split("```")[0]  # Drop everything after the closing fence
            break

    return json.loads(json_output)

def parse_info(image, json_data):
    """Turn the model's detections into pixel boxes and a summary table.

    Each detection is expected to be a dict with the keys "box_2d", "label",
    "co2_grams" and "explanation". "box_2d" is read as [y1, x1, y2, x2] on a
    0-1000 scale and converted to pixel coordinates of ``image``.

    Detections are skipped (never raised on) when they are not dicts, when any
    of the four fields is missing or falsy (this includes a CO2 value of 0),
    or when "box_2d" does not hold at least four usable numbers.

    Args:
        image: Object exposing ``size`` as ``(width, height)``.
        json_data: Decoded model output; a list of detections, or a single
            detection dict.

    Returns:
        A tuple ``(boxes_with_labels, table)`` where ``boxes_with_labels`` is a
        list of ``[(x1, y1, x2, y2), label]`` entries and ``table`` is a
        DataFrame with the columns "label", "co2" and "explanation" (present
        even when there are no rows).
    """
    columns = ["label", "co2", "explanation"]
    width, height = image.size
    df_data = []
    boxes_with_labels = []

    # A lone object is treated as a one-element list of detections.
    if isinstance(json_data, dict):
        json_data = [json_data]

    # Iterate over each detected action
    for action in json_data or []:
        if not isinstance(action, dict):
            continue

        box_2d = action.get("box_2d")
        label = action.get("label")
        co2_grams = action.get("co2_grams")
        explanation = action.get("explanation")

        if not all([box_2d, label, co2_grams, explanation]):
            continue
        if not isinstance(box_2d, (list, tuple)) or len(box_2d) < 4:
            continue

        # Convert normalized coordinates to absolute coordinates
        try:
            abs_y1 = int(box_2d[0] / 1000 * height)
            abs_x1 = int(box_2d[1] / 1000 * width)
            abs_y2 = int(box_2d[2] / 1000 * height)
            abs_x2 = int(box_2d[3] / 1000 * width)
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, NaN or infinite coordinates: drop this detection.
            continue

        # Keep the box inside the image; negative values in particular would
        # otherwise be passed on as pixel coordinates.
        abs_x1 = min(max(abs_x1, 0), width)
        abs_x2 = min(max(abs_x2, 0), width)
        abs_y1 = min(max(abs_y1, 0), height)
        abs_y2 = min(max(abs_y2, 0), height)

        # Normalize corner order so (x1, y1) is top-left and (x2, y2) bottom-right.
        abs_x1, abs_x2 = min(abs_x1, abs_x2), max(abs_x1, abs_x2)
        abs_y1, abs_y2 = min(abs_y1, abs_y2), max(abs_y1, abs_y2)

        boxes_with_labels.append([(abs_x1, abs_y1, abs_x2, abs_y2), label])

        df_data.append({
            "label": label,
            "co2": co2_grams,
            "explanation": explanation
        })

    # Passing the columns explicitly keeps the table schema stable when empty.
    return boxes_with_labels, pd.DataFrame(df_data, columns=columns)

def estimate_co2(image):
    """Estimate CO2 for the activities in ``image`` using the Gemini model.

    The image is resized to a width of 1024 pixels (height scaled to keep the
    aspect ratio), sent to the model together with the module-level prompt,
    and the reply is parsed into bounding boxes and a table.

    Args:
        image: A PIL image, or None when no image is provided (for example
            after the input component has been cleared).

    Returns:
        A tuple ``(annotated, table)``. ``annotated`` is
        ``[resized_image, boxes_with_labels]`` (or None when there is no
        image) and ``table`` is a DataFrame with the columns "label", "co2"
        and "explanation".

    Raises:
        gr.Error: If the model returns no text or text that is not valid JSON.
    """
    if image is None:
        # Nothing to analyze: clear the annotated image and show an empty table.
        return None, pd.DataFrame(columns=["label", "co2", "explanation"])

    src_width, src_height = image.size
    target_width = 1024
    # Guard against a computed height of 0 for extremely wide images.
    target_height = max(1, int(target_width * src_height / src_width))
    resized_image = image.resize(
        (target_width, target_height),
        Image.Resampling.LANCZOS
    )

    # Get results from model
    response = client.models.generate_content(
        model=model_name,
        contents=[prompt, resized_image],
        config=genai.types.GenerateContentConfig(
            system_instruction=bounding_box_system_instructions,
            temperature=0.4,
            safety_settings=safety_settings
        )
    )

    # NOTE(review): response.text is assumed to be empty/None when the reply
    # has no text part (e.g. blocked content) -- confirm against the SDK.
    reply_text = response.text
    if not reply_text:
        raise gr.Error("The model returned no content for this image.")

    try:
        json_data = parse_json(reply_text)
    except json.JSONDecodeError as err:
        raise gr.Error("The model reply could not be parsed as JSON.") from err

    boxes_with_labels, data = parse_info(resized_image, json_data)
    return [resized_image, boxes_with_labels], data

# Single-function Interface around estimate_co2: one image in, an annotated
# image and a table out.
iface = gr.Interface(
    fn=estimate_co2,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.AnnotatedImage(),
        gr.Dataframe(
            label="CO2 Estimation Data",
            interactive=False,
            # Same names and order as the columns of the DataFrame that
            # estimate_co2 returns, so the empty table matches the filled one.
            headers=["label", "co2", "explanation"]
        )
    ],
    title="CO2 Estimation from Images",
    description="Upload an image and get an estimation of the CO2 involved in the activities depicted.",
    article="This is a very rough estimate, and can be misleading or factually inaccurate. Take this as a demo project and not as scientific/exact results."
    #examples=[
    #    ["example.jpeg"]  # Add an example image if you have one
    #],
)

# Intro text rendered at the top of the Blocks page.
markdown = """# CO2 Estimation

Upload an image and get an **estimation** of the CO2 involved in the activities depicted. This is a very rough estimate, and can be misleading or factually inaccurate. Take this as a demo project and not as scientific/exact results.

Powered by [the Gemini API](https://ai.google.dev/gemini-api/docs) and [AI Studio](https://aistudio.google.com/).
"""

# Blocks layout: intro text, then input/output images side by side, then the
# results table.
# NOTE(review): no launch() call is visible in this section -- confirm the app
# is started elsewhere.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(markdown)
    with gr.Row():
        input_image = gr.Image(type="pil", label="Input Image")
        output_image = gr.AnnotatedImage(label="Output Image")
    with gr.Row():
        output_dataframe = gr.Dataframe(
            label="CO2 Estimated Data",
            interactive=False,
            # Same names and order as the columns of the DataFrame that
            # estimate_co2 returns, so the empty table matches the filled one.
            headers=["label", "co2", "explanation"]
        )

    # Re-run the estimation whenever the input image value changes.
    input_image.change(
        fn=estimate_co2,
        inputs=input_image,
        outputs=[output_image, output_dataframe]
    )