Spaces:
Runtime error
Runtime error
import pandas as pd | |
from datasets import load_dataset | |
import streamlit as st | |
from clarin_datasets.dataset_to_show import DatasetToShow | |
class KpwrNerDataset(DatasetToShow): | |
def __init__(self): | |
DatasetToShow.__init__(self) | |
self.data_dict_named = None | |
self.dataset_name = "clarin-pl/kpwr-ner" | |
self.description = """ | |
KPWR-NER is a part the Polish Corpus of Wrocław University of Technology (Korpus Języka | |
Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained categories | |
of entities. It is the ‘n82’ version of the KPWr, which means that number of classes is restricted to 82 ( | |
originally 120). During corpus creation, texts were annotated by humans from various sources, covering many | |
domains and genres. | |
Tasks (input, output and metrics) | |
Named entity recognition (NER) - tagging entities in text with their corresponding type. | |
Input ('tokens' column): sequence of tokens | |
Output ('ner' column): sequence of predicted tokens’ classes in BIO notation (82 possible classes, described | |
in detail in the annotation guidelines) | |
example: | |
[‘Roboty’, ‘mają’, ‘kilkanaście’, ‘lat’, ‘i’, ‘pochodzą’, ‘z’, ‘USA’, ‘,’, ‘Wysokie’, ‘napięcie’, ‘jest’, | |
‘dużo’, ‘młodsze’, ‘,’, ‘powstało’, ‘w’, ‘Niemczech’, ‘.’] → [‘B-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, | |
‘O’, ‘B-nam_loc_gpe_country’, ‘O’, ‘B-nam_pro_title’, ‘I-nam_pro_title’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, ‘O’, | |
‘B-nam_loc_gpe_country’, ‘O’] | |
""" | |
def load_data(self): | |
raw_dataset = load_dataset(self.dataset_name) | |
self.data_dict = { | |
subset: raw_dataset[subset].to_pandas() for subset in self.subsets | |
} | |
self.data_dict_named = {} | |
for subset in self.subsets: | |
references = raw_dataset[subset]["ner"] | |
references_named = [ | |
[ | |
raw_dataset[subset].features["ner"].feature.names[label] | |
for label in labels | |
] | |
for labels in references | |
] | |
self.data_dict_named[subset] = pd.DataFrame( | |
{ | |
"tokens": self.data_dict[subset]["tokens"], | |
"ner": references_named, | |
} | |
) | |
def show_dataset(self): | |
header = st.container() | |
description = st.container() | |
dataframe_head = st.container() | |
class_distribution = st.container() | |
most_common_tokens = st.container() | |
with header: | |
st.title(self.dataset_name) | |
with description: | |
st.header("Dataset description") | |
st.write(self.description) | |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows") | |
tokens_all = full_dataframe["tokens"].tolist() | |
tokens_all = [x for subarray in tokens_all for x in subarray] | |
labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[ | |
"ner" | |
].tolist() | |
labels_all = [x for subarray in labels_all for x in subarray] | |
with dataframe_head: | |
df_to_show = full_dataframe.head(10) | |
st.header("First 10 observations of the dataset") | |
st.dataframe(df_to_show) | |
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex()) | |
class_distribution_dict = {} | |
for subset in self.subsets: | |
all_labels_from_subset = self.data_dict_named[subset]["ner"].tolist() | |
all_labels_from_subset = [ | |
x | |
for subarray in all_labels_from_subset | |
for x in subarray | |
if x != "O" and not x.startswith("I-") | |
] | |
all_labels_from_subset = pd.Series(all_labels_from_subset) | |
class_distribution_dict[subset] = ( | |
all_labels_from_subset.value_counts(normalize=True) | |
.sort_index() | |
.reset_index() | |
.rename({"index": "class", 0: subset}, axis="columns") | |
) | |
class_distribution_df = pd.merge( | |
class_distribution_dict["train"], | |
class_distribution_dict["test"], | |
on="class", | |
) | |
with class_distribution: | |
st.header("Class distribution in each subset (without 'O' and 'I-*')") | |
st.dataframe(class_distribution_df) | |
st.text_area( | |
label="LaTeX code", value=class_distribution_df.style.to_latex() | |
) | |
# Most common tokens from selected class (without 0) | |
full_df_unzipped = pd.DataFrame( | |
{ | |
"token": tokens_all, | |
"ner": labels_all, | |
} | |
) | |
full_df_unzipped = full_df_unzipped.loc[ | |
(full_df_unzipped["ner"] != "O") | |
& (full_df_unzipped["ner"].str.starstwith("I-")) | |
] | |
possible_options = sorted(full_df_unzipped["ner"].unique()) | |
with most_common_tokens: | |
st.header("10 most common tokens from selected class (without 'O')") | |
selected_class = st.selectbox( | |
label="Select class to show", options=possible_options | |
) | |
df_to_show = ( | |
full_df_unzipped.loc[full_df_unzipped["ner"] == selected_class] | |
.groupby(["token"]) | |
.count() | |
.reset_index() | |
.rename({"ner": "no_of_occurrences"}, axis=1) | |
.sort_values(by="no_of_occurrences", ascending=False) | |
.reset_index(drop=True) | |
.head(10) | |
) | |
st.dataframe(df_to_show) | |
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex()) | |