import logging
from os import mkdir
from os.path import isdir
from os.path import join as pjoin
from pathlib import Path

import streamlit as st

from data_measurements_clusters import Clustering

title = "Dataset Exploration"
description = "Comparison of hate speech detection datasets"
date = "2022-01-26"
thumbnail = "images/books.png"
__COLLECT = """
In order to turn observations of the world into data, choices must be made
about what counts as data, where to collect data, and how to collect it.
When collecting language data, this often means selecting websites from which
samples of text can easily be gathered; hate speech data is frequently
collected from social media platforms like Twitter or from discussion spaces
like Wikipedia talk pages. Each of these decisions results in a specific
sample of all the possible observations.
"""
__ANNOTATE = """
Once the data is collected, further decisions must be made about how to
label it if it is to be used to train a classification system, as is common
in hate speech detection. These labels must be clearly defined so that the
dataset can be labeled consistently, which helps the classification model
produce more consistent output. This labeling process, called *annotation*,
can be done by the data collectors, by a set of trained annotators with
relevant expert knowledge, or by online crowdworkers. Who does the
annotating has a significant effect on the resulting set of labels
([Sap et al., 2019](https://aclanthology.org/P19-1163.pdf)).
"""
__STANDARDIZE = """
Because hate speech detection is a relatively new task in NLP, the
definitions used across different projects vary. Some projects target just
hate speech, but others may label their data for ‘toxic’, ‘offensive’, or
‘abusive’ language. Still others may address related problems such as
bullying and harassment. This variation makes it difficult to make
comparisons across datasets and their respective models. As these modeling
paradigms become more established, definitions grounded in relevant
sociological research will need to be agreed upon in order for datasets and
models in automatic content moderation (ACM) to appropriately capture the
problems in the world that they set out to address. For more on this
discussion, see
[Madukwe et al., 2020](https://aclanthology.org/2020.alw-1.18.pdf) and
[Fortuna et al., 2020](https://aclanthology.org/2020.lrec-1.838.pdf).
"""
__HOW_TO = """
To use the tool, select a dataset. The tool will then show clusters of
examples in the dataset that have been automatically determined to be similar
to one another. Below that, you can see specific examples within the cluster,
the labels for those examples, and the distribution of labels within the
cluster. Note that cluster 0 will always be the full dataset.
"""
DSET_OPTIONS = {
    "classla/FRENK-hate-en": {
        "binary": {
            "train": {
                ("text",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "classla/FRENK-hate-en",
                                    "config_name": "binary",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "tweets_hate_speech_detection": {
        "default": {
            "train": {
                ("tweet",): {
                    "label": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "tweets_hate_speech_detection",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("tweet",),
                                    "label_name": "label",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
    "ucberkeley-dlab/measuring-hate-speech": {
        "default": {
            "train": {
                ("text",): {
                    "hatespeech": {
                        100000: {
                            "sentence-transformers/all-mpnet-base-v2": {
                                "tree": {
                                    "dataset_name": "ucberkeley-dlab/measuring-hate-speech",
                                    "config_name": "default",
                                    "split_name": "train",
                                    "input_field_path": ("text",),
                                    "label_name": "hatespeech",
                                    "num_rows": 100000,
                                    "model_name": "sentence-transformers/all-mpnet-base-v2",
                                    "file_name": "tree",
                                }
                            }
                        }
                    }
                }
            }
        }
    },
}
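

# Build (or load) the clustering tree for the selected dataset configuration.
# The leaf dictionary from DSET_OPTIONS is unpacked into Clustering's keyword
# arguments.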
def download_tree(args):
    clusters = Clustering(**args)
    return clusters


def run_article():
    # Article text: how hate speech datasets are collected, annotated, and
    # standardized.
    st.markdown("# Making a Hate Speech Dataset")
    st.markdown("## Collecting observations of the world")
    with st.expander("Collection"):
        st.markdown(__COLLECT, unsafe_allow_html=True)
    st.markdown("## Annotating observations with task labels")
    with st.expander("Annotation"):
        st.markdown(__ANNOTATE, unsafe_allow_html=True)
    st.markdown("## Standardizing the task")
    with st.expander("Standardization"):
        st.markdown(__STANDARDIZE, unsafe_allow_html=True)
    st.markdown("# Exploring datasets")
    with st.expander("How to use the tool"):
        st.markdown(__HOW_TO, unsafe_allow_html=True)

    # Interactive exploration: pick a dataset, then drill into its clusters.
    choose_dset = st.selectbox(
        "Select dataset to visualize",
        DSET_OPTIONS,
    )
    # Walk down the nested options until we reach the leaf argument dictionary.
    args = DSET_OPTIONS[choose_dset]
    while "dataset_name" not in args:
        args = list(args.values())[0]
    clustering = download_tree(args)

    st.markdown("---\n")
    full_tree_fig = clustering.get_full_tree()
    st.plotly_chart(full_tree_fig, use_container_width=True)
    st.markdown("---\n")
    show_node = st.selectbox(
        "Visualize cluster node:",
        range(len(clustering.node_list)),
    )
    st.markdown(
        f"Node {show_node} has {clustering.node_list[show_node]['weight']} examples."
    )
    st.markdown(
        f"Node {show_node} was merged at {clustering.node_list[show_node]['merged_at']:.2f}."
    )
    examplars = clustering.get_node_examplars(show_node)
    st.markdown("---\n")
    label_fig = clustering.get_node_label_chart(show_node)
    examplars_col, labels_col = st.columns([2, 1])
    examplars_col.markdown("#### Node cluster examplars")
    examplars_col.table(examplars)
    labels_col.markdown("#### Node cluster labels")
    labels_col.plotly_chart(label_fig, use_container_width=True)
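

# Hypothetical entry point: in the Space, run_article() is presumably called
# from a top-level app script (not shown here); this guard simply also lets
# the module be run directly with `streamlit run`.
if __name__ == "__main__":
    run_article()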