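# NOTE: the hosting page showed the status "Runtime error" for this Space when
# this listing was captured; that banner text is not part of the program.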
import json | |
import streamlit as st | |
from datasets import load_dataset | |
from streamlit_folium import folium_static | |
from catalogue import make_choro_map, region_tree | |
################## | |
## streamlit | |
################## | |
# Page-level configuration for the Streamlit app.
st.set_page_config(
    page_title="BigScience Language Resource Catalogue Input Form",
    page_icon="https://avatars.githubusercontent.com/u/82455566",
    layout="wide",
    initial_sidebar_state="auto",
)

# Snapshot of the URL query parameters as a dict mapping each key to a list of
# values. `st.experimental_get_query_params` is deprecated in newer Streamlit
# releases in favour of `st.query_params`, so prefer the new API when it is
# available and rebuild the same dict-of-lists shape from it; older releases
# fall back to the original call.
if hasattr(st, "query_params"):
    query_params = {
        key: st.query_params.get_all(key) for key in st.query_params.keys()
    }
else:
    query_params = st.experimental_get_query_params()
def main():
    """Entry point: ensure the session has a `save_state` dict, then draw the page."""
    session = st.session_state
    # Only create the dict once per session so reruns keep whatever it holds.
    if "save_state" not in session:
        session["save_state"] = {}
    viz_page()
################## | |
## SECTION: Explore the current catalogue | |
################## | |
def _load_json_resource(path):
    """Parse one UTF-8 encoded JSON file and close its handle afterwards."""
    with open(path, encoding="utf-8") as resource_file:
        return json.load(resource_file)


# Option lists and lookup tables used to build the filter widgets. The JSON
# backed values are read once, when the module is loaded, from paths relative
# to the working directory.
app_categories = {
    # Maps the stored entry type to the label shown to the user.
    "entry_types": {
        "primary": "Primary source",
        "processed": "Processed language dataset",
        "organization": "Language organization or advocate",
    },
    "language_lists": _load_json_resource("resources/language_lists.json"),
    "programming_languages": list(
        _load_json_resource("resources/programming_languages.json")[
            "itemListElement"
        ]
    ),
    # Keep only the subtags whose type is "language".
    "languages_bcp47": [
        subtag
        for subtag in _load_json_resource("resources/bcp47.json")["subtags"]
        if subtag["type"] == "language"
    ],
    "custodian_types": [
        "A private individual",
        "A commercial entity",
        "A library, museum, or archival institute",
        "A university or research institution",
        "A nonprofit/NGO (other)",
        "A government organization",
    ],
    "pii_categories": _load_json_resource("resources/pii_categories.json"),
    "licenses": _load_json_resource("resources/licenses.json"),
    "primary_taxonomy": _load_json_resource(
        "resources/primary_source_taxonomy.json"
    ),
    "file_formats": _load_json_resource("resources/file_formats.json"),
}
def filter_entry(entry, filter_dct):
    """Tell whether `entry` satisfies every constraint in `filter_dct`.

    `filter_dct` mirrors the nesting of `entry`. Only keys present in both are
    checked; keys of `entry` that the filter does not mention are ignored, as
    are filter keys the entry does not have.

    For a shared key:
      * a dict value is checked recursively against the nested filter;
      * an empty filter value accepts anything;
      * a list value passes if at least one of its items is in the filter;
      * any other value passes if it is itself in the filter.

    Args:
        entry: mapping describing one catalogue entry (possibly nested).
        filter_dct: mapping of accepted values, nested like `entry`.

    Returns:
        True if all checked keys pass, False otherwise.
    """
    for key, value in entry.items():
        if key not in filter_dct:
            continue
        accepted = filter_dct[key]
        if isinstance(value, dict):
            if not filter_entry(value, accepted):
                return False
        elif not accepted:
            # An empty selection means "no restriction on this field".
            continue
        elif isinstance(value, list):
            if not any(item in accepted for item in value):
                return False
        elif value not in accepted:
            return False
    return True
def filter_catalogue_visualization(catalogue, options):
    """Draw the filter widgets and return the catalogue entries they select.

    The user first picks which properties to filter on; one or more
    multiselect widgets are then shown per chosen property. The selections are
    gathered into a nested dict that is handed to `filter_entry` for each
    entry. Entries whose "uid" is the empty string are always dropped.

    Args:
        catalogue: iterable of catalogue entries (mappings with a "uid" key).
        options: option lists for the widgets; reads "entry_types",
            "language_lists", "custodian_types" and "primary_taxonomy".

    Returns:
        The list of entries that pass the selected filters.
    """
    st.markdown("### Select entries to visualize")
    st.markdown(
        "##### Select entries by category, language, type of custodian or media"
    )
    st.markdown(
        "You can select specific parts of the catalogue to visualize in this window."
        + " Leave a field empty to select all values, or select specific options to only select entries that have one of the chosen values."
    )
    active_filters = st.multiselect(
        key="viz_filter_by",
        label="You can filter the catalogue to only visualize entries that have certain properties, such as:",
        options=[
            "resource type",
            "language names",
            "custodian type",
            "available for download",
            "license type",
            "source type",
            "media type",
        ],
    )

    # Nested selection criteria, shaped like a catalogue entry.
    criteria = {}

    if "resource type" in active_filters:
        entry_type_labels = options["entry_types"]
        criteria["type"] = st.multiselect(
            key="viz_filter_type",
            label="I want to only see entries that are of the following category:",
            options=entry_type_labels,
            format_func=lambda type_key: options["entry_types"][type_key],
        )

    if "language names" in active_filters:
        language_lists = options["language_lists"]
        language_choices = (
            list(language_lists["language_groups"].keys())
            + language_lists["niger_congo_languages"]
            + language_lists["indic_languages"]
        )
        criteria["languages"] = {
            "language_names": st.multiselect(
                key="viz_filter_languages_language_names",
                label="I want to only see entries that have one of the following languages:",
                options=language_choices,
            )
        }

    if "custodian type" in active_filters:
        criteria["custodian"] = {
            "type": st.multiselect(
                key="viz_filter_custodian_type",
                label="I want to only see entries that corresponds to organizations or to data that id owned/managed by organizations of the following types:",
                options=options["custodian_types"],
            )
        }

    if "available for download" in active_filters:
        # "availability" is shared with the license filter below, so reuse it
        # if it already exists instead of overwriting it.
        availability = criteria.setdefault("availability", {})
        availability["procurement"] = {
            "for_download": st.multiselect(
                key="viz_availability_procurement_for_download",
                label="Select based on whether the data can be obtained online:",
                options=[
                    "No - but the current owners/custodians have contact information for data queries",
                    "No - we would need to spontaneously reach out to the current owners/custodians",
                    "Yes - it has a direct download link or links",
                    "Yes - after signing a user agreement",
                ],
            )
        }

    if "license type" in active_filters:
        availability = criteria.setdefault("availability", {})
        availability["licensing"] = {
            "license_properties": st.multiselect(
                key="viz_availability_licensing_license_properties",
                label="Select primary entries that have the following license types",
                options=[
                    "public domain",
                    "multiple licenses",
                    "copyright - all rights reserved",
                    "open license",
                    "research use",
                    "non-commercial use",
                    "do not distribute",
                ],
            )
        }
        # "processed_from_primary" is shared with the source type filter.
        from_primary = criteria.setdefault("processed_from_primary", {})
        from_primary["primary_license"] = st.multiselect(
            key="viz_processed_from_primary_primary_license",
            label="For datasets, selected based on: Is the license or commercial status of the source material compatible with the license of the dataset?",
            options=[
                "Unclear / I don't know",
                "Yes - the source material has an open license that allows re-use",
                "Yes - the dataset has the same license as the source material",
                "Yes - the dataset curators have obtained consent from the source material owners",
                "No - the license of the source material actually prohibits re-use in this manner",
            ],
        )

    if "source type" in active_filters:
        taxonomy = options["primary_taxonomy"]
        # Dict literals evaluate in order, so the three widgets are drawn in
        # the order written here.
        criteria["source_category"] = {
            "category_type": st.multiselect(
                key="viz_source_category_category_type",
                label="Select primary sources that correspond to:",
                options=["collection", "website"],
            ),
            "category_web": st.multiselect(
                key="viz_source_category_category_web",
                label="Select web-based primary sources that contain:",
                options=taxonomy["website"],
            ),
            "category_media": st.multiselect(
                key="viz_source_category_category_media",
                label="Select primary sources that are collections of:",
                options=taxonomy["collection"],
            ),
        }
        from_primary = criteria.setdefault("processed_from_primary", {})
        from_primary["primary_types"] = st.multiselect(
            key="viz_processed_from_primary_primary_types",
            label="Select processed datasets whose primary sources contain:",
            options=[f"web | {w}" for w in taxonomy["website"]]
            + taxonomy["collection"],
        )

    if "media type" in active_filters:
        criteria["media"] = {
            "category": st.multiselect(
                key="viz_media_category",
                label="Select language data resources that contain:",
                options=["text", "audiovisual", "image"],
                help="Media data provided with transcription should go into **text**, then select the *transcribed* option. PDFs that have pre-extracted text information should go into **text**, PDFs that need OCR should go into **images**, select the latter if you're unsure",
            )
        }

    filtered_catalogue = []
    for entry in catalogue:
        if filter_entry(entry, criteria) and entry["uid"] != "":
            filtered_catalogue.append(entry)

    match_count = len(filtered_catalogue)
    st.markdown(
        f"##### Your query matched **{match_count}** entries in the current catalogue."
    )
    return filtered_catalogue
def _entry_locations(entry, show_by_org):
    """Return the locations recorded for one catalogue entry.

    With `show_by_org` set this is the custodian's single location wrapped in
    a list; otherwise it is the entry's language locations.
    """
    if show_by_org:
        return [entry["custodian"]["location"]]
    return entry["languages"]["language_locations"]


def viz_page():
    """Render the catalogue exploration page.

    Loads the catalogue dataset, shows the filter widgets in the sidebar, draws
    a choropleth map of entry counts per location, and lets the user narrow
    the selection by location and inspect a single entry.
    """
    st.title("🌸 - BigScience Catalog of Language Resources")
    st.markdown("---\n")
    catalogue = load_dataset("bigscience/collaborative_catalog")["train"]
    with st.sidebar:
        filtered_catalogue = filter_catalogue_visualization(catalogue, app_categories)
        entry_location_type = st.radio(
            label="I want to visualize",
            options=[
                "Where the organizations or data custodians are located",
                "Where the language data creators are located",
            ],
            key="viz_show_location_type",
        )
        show_by_org = (
            entry_location_type
            == "Where the organizations or data custodians are located"
        )
    with st.expander("Map of entries", expanded=True):
        filtered_counts = {}
        for entry in filtered_catalogue:
            locations = _entry_locations(entry, show_by_org)
            # Be as specific as possible: drop a location when
            # `region_tree` lists another of this entry's locations under it
            # (presumably a sub-region), so only the narrower one is counted.
            locations = [
                loc
                for loc in locations
                if not any(other in region_tree.get(loc, []) for other in locations)
            ]
            for loc in locations:
                filtered_counts[loc] = filtered_counts.get(loc, 0) + 1
        world_map = make_choro_map(filtered_counts)
        folium_static(world_map, width=900, height=600)
    with st.expander("View selected resources", expanded=False):
        st.write("You can further select locations to select entries from here:")
        filter_region_choices = sorted(
            {
                loc
                for entry in filtered_catalogue
                for loc in _entry_locations(entry, show_by_org)
            }
        )
        filter_locs = st.multiselect(
            "View entries from the following locations:",
            options=filter_region_choices,
            key="viz_select_location",
        )
        filter_loc_dict = (
            {"custodian": {"location": filter_locs}}
            if show_by_org
            else {"languages": {"language_locations": filter_locs}}
        )
        filtered_catalogue_by_loc = [
            entry
            for entry in filtered_catalogue
            if filter_entry(entry, filter_loc_dict)
        ]
        view_entry = st.selectbox(
            label="Select an entry to see more detail:",
            options=filtered_catalogue_by_loc,
            format_func=lambda entry: f"{entry['uid']} | {entry['description']['name']} -- {entry['description']['description']}",
            key="viz_select_entry",
        )
        if view_entry is None:
            # The select box yields None when there is nothing to choose from;
            # subscripting it below would raise a TypeError.
            st.info("No entries match the current selection.")
        else:
            st.markdown(
                f"##### *Type:* {view_entry['type']} *UID:* {view_entry['uid']} - *Name:* {view_entry['description']['name']}\n\n{view_entry['description']['description']}"
            )
            st.write(view_entry)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()