import streamlit as st
import pandas as pd
import os, csv
from huggingface_hub import hf_hub_download, HfApi
import math
HF_TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')
CACHED_FILE_PATH = hf_hub_download(repo_id="sasha/co2_submissions", filename="co2_emissions.csv", repo_type="dataset")
api = HfApi()
def write_to_csv(hardware, gpu_tdp, num_gpus, training_time, provider, carbon_intensity, dynamic_emissions, experimentation_time, experimental_emissions, pue, pue_emissions, embodied_type, embodied_emissions, model_info):
    st.session_state["is_shared"] = True
    # Append the submission as a new row of the cached copy of the dataset CSV, then push it back to the Hub.
    with open(CACHED_FILE_PATH, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([hardware, gpu_tdp, num_gpus, training_time, provider, carbon_intensity, dynamic_emissions, experimentation_time, experimental_emissions, pue, pue_emissions, embodied_type, embodied_emissions, model_info])
    api.upload_file(
        path_or_fileobj=CACHED_FILE_PATH,
        path_in_repo="co2_emissions.csv",
        repo_id="sasha/co2_submissions",
        repo_type="dataset",
        token=HF_TOKEN,  # pass the token read above explicitly (it was otherwise unused)
    )
st.set_page_config(
    page_title="AI Carbon Calculator",
    layout="wide",
)
tdp_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/gpus.csv"
compute_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/impact.csv"
electricity_url = "https://raw.githubusercontent.com/mlco2/impact/master/data/2021-10-27yearly_averages.csv"
server_sheet_id = "1DqYgQnEDLQVQm5acMAhLgHLD8xXCG9BIrk-_Nv6jF3k"
server_sheet_name = "Server%20Carbon%20Footprint"
server_url = f"https://docs.google.com/spreadsheets/d/{server_sheet_id}/gviz/tq?tqx=out:csv&sheet={server_sheet_name}"
embodied_gpu_sheet_name = "Scope%203%20Ratios"
embodied_gpu_url = f"https://docs.google.com/spreadsheets/d/{server_sheet_id}/gviz/tq?tqx=out:csv&sheet={embodied_gpu_sheet_name}"
TDP = pd.read_csv(tdp_url)
instances = pd.read_csv(compute_url)
providers = [p.upper() for p in instances['provider'].unique().tolist()]
providers.append('Local/Private Infrastructure')
### Default values
hardware = "N/A"
gpu_tdp = 0
num_gpus = 0
training_time = 0.0
provider = "N/A"
carbon_intensity = 0.0
dynamic_emissions = 0.0
experimentation_time = 0.0
experimental_emissions = 0.0
pue = 1.0
pue_emissions = 0.0
embodied_type = 0.0
embodied_emissions = 0.0
model_info = "N/A"
### Conversion factors
kg_per_mile = 0.348  # kg of CO2eq emitted per mile driven by an average US car (energy.gov figure cited below)
embodied_conversion_factor = 0.0289  # linear hourly amortization factor for manufacturing emissions (see methodology spreadsheet)
miles_km_conversion_factor = 0.62137119  # miles per kilometer
if "is_shared" not in st.session_state:
st.session_state["is_shared"] = False
electricity = pd.read_csv(electricity_url)
servers = pd.read_csv(server_url)
embodied_gpu = pd.read_csv(embodied_gpu_url)
#st.image('images/MIT_carbon_image_narrow.png', use_column_width=True, caption = 'Image credit: ')
st.title("AI Carbon Calculator")
st.markdown('## Estimate your AI model\'s CO2 carbon footprint! 🌎🖥️🌍')
st.markdown('### Calculating the carbon footprint of AI models can be hard... this tool is here to help!')
st.markdown('##### Use the calculators below to calculate different aspects of your model\'s carbon footprint ' \
'and don\'t forget to share your data to help the community better understand the carbon emissions of AI!')
st.markdown('### Dynamic Emissions')
st.markdown('##### These are the emissions produced by generating the electricity necessary for powering model training.')
with st.expander("Calculate the dynamic emissions of your model"):
col1, col2, col3, col4, col5 = st.columns(5)
with col1:
hardware = st.selectbox('Hardware used', TDP['name'].tolist())
gpu_tdp = TDP['tdp_watts'][TDP['name'] == hardware].tolist()[0]
st.markdown("Different hardware has different efficiencies, which impacts how much energy you use.")
with col2:
num_gpus = st.text_input('Number of GPUs/CPUs/TPUs used', value = 16)
st.markdown('If you can\'t find your hardware in the list, select the closest similar model.')
with col3:
training_time = st.number_input('Total training time (in hours)', value = 0.0)
st.markdown('You can find this number in your training logs or TensorBoards')
with col4:
provider = st.selectbox('Provider used', providers)
st.markdown('If you can\'t find your provider here, select "Local/Private Infrastructure".')
with col5:
if provider != 'Local/Private Infastructure':
provider_instances = instances['region'][instances['provider'] == provider.lower()].unique().tolist()
region = st.selectbox('Region used', provider_instances)
carbon_intensity = instances['impact'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()[0]
else:
carbon_intensity = st.number_input('Carbon intensity of your energy grid, in grams of CO2 per kWh')
st.markdown('You can consult a resource like the [IEA](https://www.iea.org/countries) or '
' [Electricity Map](https://app.electricitymaps.com/) to get this information.')
dynamic_emissions = round(gpu_tdp * float(num_gpus)*training_time * carbon_intensity/1000000)
st.metric(label="Dynamic emissions", value=str(dynamic_emissions)+' kilograms of CO2eq')
st.info('This is roughly equivalent to '+ str(round(dynamic_emissions/kg_per_mile)) + ' miles ('+ str(round(dynamic_emissions/kg_per_mile/miles_km_conversion_factor)) + ' km) driven in an average US car'
'. [(Source: energy.gov)](https://www.energy.gov/eere/vehicles/articles/fotw-1223-january-31-2022-average-carbon-dioxide-emissions-2021-model-year)')
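# Unit sketch for the dynamic-emissions formula above: TDP [W] x number of accelerators x training time [h]
# gives Wh; dividing by 1,000 gives kWh, multiplying by carbon intensity [gCO2eq/kWh] gives grams, and
# dividing by another 1,000 gives kilograms -- hence the single division by 1,000,000.
# Hypothetical example (assumed numbers, not app defaults): 8 GPUs at 300 W for 100 h on a
# 400 gCO2eq/kWh grid -> 300 * 8 * 100 * 400 / 1,000,000 = 96 kg CO2eq.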
st.markdown('### Experimental Emissions 👩‍🔬')
st.markdown('##### These are the emissions produced by generating the electricity necessary for powering the experiments and tests needed to pick your final model architecture '
'and parameters.')
with st.expander("Calculate the experimental emissions of your model"):
#st.info('Consult your training logs to figure out how many ablations, baselines and experiments were run before converging on the final model.')
experimentation_time = st.number_input(label='Number of hours of experimentation (including ablations, baselines and evaluation)', value=training_time)
st.info('As a baseline, language models such as [OPT](https://arxiv.org/pdf/2205.01068.pdf) and [BLOOM](https://arxiv.org/abs/2211.02001)'
' found that experimentation roughly doubles the amount of compute used by training the model itself.')
experimental_emissions = round(gpu_tdp * (experimentation_time) * carbon_intensity/1000000)
st.metric(label="Experimental emissions", value=str(experimental_emissions)+' kilograms of CO2eq')
st.markdown('### Datacenter (Overhead) Emissions')
st.markdown('##### These are the emissions produced by generating the electricity needed to power the rest of the infrastructure '
            'used for model training -- the datacenter, network, heating/cooling, storage, etc.')
with st.expander("Calculate the datacenter emissions of your model"):
st.info('A proxy often used to reflect idle emissions is PUE (Power Usage Effectiveness), which represents '
' the ratio of energy used for computing overheads like cooling, which varies depending on the data center.')
pue = float(instances['PUE'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()[0])
source = instances['PUE source'][(instances['provider'] == provider.lower()) & (instances['region'] == region)].tolist()[0]
if math.isnan(pue) == True:
if provider != 'Local/Private Infastructure':
st.markdown('##### The exact information isn\'t available for this datacenter! We will use your provider\'s average instead, which is:')
if provider == 'AWS':
pue = 1.135
st.markdown('#### ' + str(pue)+ " [(source)](https://www.cloudcarbonfootprint.org/docs/methodology/)")
elif provider == 'GCP':
pue = 1.1
st.markdown('#### ' + str(pue) + " [(source)](https://www.google.ca/about/datacenters/efficiency/)")
elif provider == 'AZURE':
pue = 1.185
st.markdown('#### ' + str(pue) + " [(source)](https://www.cloudcarbonfootprint.org/docs/methodology/)")
elif provider == 'OVH':
pue = 1.28
st.markdown('#### ' + str(pue) + " [(source)](https://corporate.ovhcloud.com/en-ca/sustainability/environment/)")
elif provider == 'SCALEWAY':
pue = 1.35
st.markdown('#### ' +str(pue) + " [(source)](https://pue.dc3.scaleway.com/en/)")
else:
st.markdown('##### Try to find the PUE of your local infrastructure. Otherwise, you can use the industry average, 1.58:')
pue = st.slider('Total number of GPU hours', value = 1.58)
else:
st.markdown('##### The PUE of the datacenter you used is: '+ str(pue) + ' [(source)]('+source+')')
pue_emissions = round((experimental_emissions+ dynamic_emissions)*pue)
st.metric(label="Your emissions, considering PUE", value=str(pue_emissions)+' kilograms of CO2eq')
st.markdown('### Embodied Emissions 🖥️🔨')
st.markdown('##### These are the emissions associated with the materials and processes involved in producing'
' the computing equipment needed for AI models.')
with st.expander("Calculate the embodied emissions of your model"):
st.markdown('##### Based on the number of GPUs and training time you indicated above, we can estimate that your model\'s embodied emissions are approximately: ')
hardware_type = TDP['type'][TDP['name'] == hardware].tolist()[0]
if hardware_type == 'cpu':
embodied_type = embodied_gpu['Value'][embodied_gpu['Ratio']=='Manufacturing emissions per additional CPU (kgCOโeq)'].tolist()[0]
elif hardware_type == 'gpu' or hardware_type == 'tpu':
embodied_type = embodied_gpu['Value'][embodied_gpu['Ratio']=='Manufacturing emissions per additionnal GPU Card (kgCOโeq)'].tolist()[0]
embodied_emissions = round(int(embodied_type)*embodied_conversion_factor*float(num_gpus)*training_time/1000,1)
st.metric(label="Embodied emissions", value=str(embodied_emissions)+' kilograms of CO2eq')
st.info('These are the trickiest emissions to track down since a lot of the information needed is missing. ๐ต '
'We are providing an estimate based on an hourly manufacturing emissions conversion factor [(source)](https://docs.google.com/spreadsheets/d/1DqYgQnEDLQVQm5acMAhLgHLD8xXCG9BIrk-_Nv6jF3k/).')
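# Sketch of what the embodied formula above computes: per-unit manufacturing emissions [kgCO2eq]
# x the hourly amortization factor (0.0289) x number of units x training hours, divided by 1,000.
# The exact units and derivation of the 0.0289 factor come from the linked spreadsheet.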
st.markdown('### Model Information ℹ️')
st.markdown('##### If you want to share the link to your model code or paper, please do so below! Otherwise, your submission will be anonymous.')
model_info = st.text_input(label= "Enter a link to your model (optional)")
m = st.markdown("""
<style>
div.stButton > button:first-child {
background-color: rgb(80, 200, 120);
background-image: none;
font-size: 25px;
height: 3em;
width: 15em;
}
</style>""", unsafe_allow_html=True)
buttoncol1, buttoncol2, buttoncol3 = st.columns(3)
with buttoncol2:
    if not st.session_state["is_shared"]:
        submitted = st.button(label="Share my CO2 data!", on_click=lambda *args: write_to_csv(hardware, gpu_tdp, num_gpus, training_time, provider, carbon_intensity, dynamic_emissions, experimentation_time, experimental_emissions, pue, pue_emissions, embodied_type, embodied_emissions, model_info))
    else:
        st.info('Thank you! Your data has been shared in https://huggingface.co/datasets/sasha/co2_submissions.')
st.markdown('### Methodology')
with st.expander("More information about our Methodology"):
st.markdown('While there is no universally-accepted approach for assessing the environmental impacts of ML models,'
' we strive towards adopting the widely-used Life Cycle Assessment (LCA) methodology, which aims to cover all stages '
'of the life cycle of a product or process. We focus on the steps for which we do have sufficient information, which range from manufacturing the equipment used '
' for training the model to energy consumption (in green in the figure below).')
st.image('images/LCA_CO2.png', caption='The LCA methodology applied to AI models.')
st.info('We are using all of the available sources of information that we can, and will update the tool as new sources'
' of information are published.')
st.markdown('#### Dynamic and Experimental Emissions')
st.markdown('These are calculated by multiplying the number of GPU hours used by the thermal design power (TDP) of those GPUs and the carbon intensity of the energy grid used to power the hardware.')
st.markdown('#### Datacenter Emissions')
st.markdown('These are calculated by multiplying the dynamic emissions by the PUE (Power Usage Effectiveness) of the datacenter used for running the code.')
st.markdown('#### Embodied Emissions')
st.markdown('These are calculated by dividing the amount of emissions produced during the manufacturing process by the time of use based on a linearly ammortised emissions conversion factor (in our case, 0,0289 [(source)](https://docs.google.com/spreadsheets/d/1DqYgQnEDLQVQm5acMAhLgHLD8xXCG9BIrk-_Nv6jF3k/)).')
st.markdown('#### Example of application')
st.markdown('For an example of this methodology, check out the [BLOOM ๐ธ model CO2 paper](https://arxiv.org/abs/2211.02001)'')')
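# End-to-end sketch with hypothetical numbers (assumed for illustration, not taken from the BLOOM paper):
# 64 GPUs at 400 W trained for 1,000 h on a 250 gCO2eq/kWh grid give 64 * 400 * 1000 * 250 / 1e6 = 6,400 kg
# of dynamic emissions; an equal amount of experimentation doubles that to 12,800 kg; a PUE of 1.1 brings the
# electricity-related total to ~14,080 kg, to which the amortised embodied emissions of the hardware are added.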