# clarin_datasets/kpwr_ner_datasets.py
import pandas as pd
from datasets import load_dataset
import streamlit as st
from clarin_datasets.dataset_to_show import DatasetToShow
class KpwrNerDataset(DatasetToShow):
def __init__(self):
DatasetToShow.__init__(self)
self.data_dict_named = None
self.dataset_name = "clarin-pl/kpwr-ner"
        self.description = """
        KPWR-NER is a part of the Polish Corpus of Wrocław University of Technology (Korpus Języka
        Polskiego Politechniki Wrocławskiej). Its objective is named entity recognition for fine-grained
        categories of entities. It is the 'n82' version of the KPWr, which means that the number of classes
        is restricted to 82 (originally 120). During corpus creation, texts from various sources, covering
        many domains and genres, were annotated by humans.

        Tasks (input, output and metrics)

        Named entity recognition (NER) - tagging entities in text with their corresponding type.

        Input ('tokens' column): sequence of tokens

        Output ('ner' column): sequence of token classes in BIO notation (82 possible classes, described
        in detail in the annotation guidelines)

        Example:
        ['Roboty', 'mają', 'kilkanaście', 'lat', 'i', 'pochodzą', 'z', 'USA', ',', 'Wysokie', 'napięcie', 'jest',
        'dużo', 'młodsze', ',', 'powstało', 'w', 'Niemczech', '.'] → ['B-nam_pro_title', 'O', 'O', 'O', 'O', 'O',
        'O', 'B-nam_loc_gpe_country', 'O', 'B-nam_pro_title', 'I-nam_pro_title', 'O', 'O', 'O', 'O', 'O', 'O',
        'B-nam_loc_gpe_country', 'O']
        """

def load_data(self):
raw_dataset = load_dataset(self.dataset_name)
self.data_dict = {
subset: raw_dataset[subset].to_pandas() for subset in self.subsets
}
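        # The raw "ner" column stores integer class ids; map each id to its string
        # name via the split's ClassLabel feature so the frames below are human-readable.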
self.data_dict_named = {}
for subset in self.subsets:
references = raw_dataset[subset]["ner"]
references_named = [
[
raw_dataset[subset].features["ner"].feature.names[label]
for label in labels
]
for labels in references
]
self.data_dict_named[subset] = pd.DataFrame(
{
"tokens": self.data_dict[subset]["tokens"],
"ner": references_named,
}
)
def show_dataset(self):
header = st.container()
description = st.container()
dataframe_head = st.container()
class_distribution = st.container()
most_common_tokens = st.container()
with header:
st.title(self.dataset_name)
with description:
st.header("Dataset description")
st.write(self.description)
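        # Flatten tokens and named labels across all subsets; these flat lists feed
        # the per-class statistics rendered below.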
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
tokens_all = full_dataframe["tokens"].tolist()
tokens_all = [x for subarray in tokens_all for x in subarray]
labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[
"ner"
].tolist()
labels_all = [x for subarray in labels_all for x in subarray]
with dataframe_head:
df_to_show = full_dataframe.head(10)
st.header("First 10 observations of the dataset")
st.dataframe(df_to_show)
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
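        # Per-subset label distribution, restricted to "B-*" tags so each entity is
        # counted once regardless of how many tokens it spans.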
class_distribution_dict = {}
for subset in self.subsets:
all_labels_from_subset = self.data_dict_named[subset]["ner"].tolist()
all_labels_from_subset = [
x
for subarray in all_labels_from_subset
for x in subarray
if x != "O" and not x.startswith("I-")
]
all_labels_from_subset = pd.Series(all_labels_from_subset)
class_distribution_dict[subset] = (
all_labels_from_subset.value_counts(normalize=True)
.sort_index()
.reset_index()
.rename({"index": "class", 0: subset}, axis="columns")
)
class_distribution_df = pd.merge(
class_distribution_dict["train"],
class_distribution_dict["test"],
on="class",
)
with class_distribution:
st.header("Class distribution in each subset (without 'O' and 'I-*')")
st.dataframe(class_distribution_df)
st.text_area(
label="LaTeX code", value=class_distribution_df.style.to_latex()
)
        # Most common tokens from the selected class (excluding "O" and "I-*" labels)
full_df_unzipped = pd.DataFrame(
{
"token": tokens_all,
"ner": labels_all,
}
)
        full_df_unzipped = full_df_unzipped.loc[
            (full_df_unzipped["ner"] != "O")
            & (~full_df_unzipped["ner"].str.startswith("I-"))
        ]
possible_options = sorted(full_df_unzipped["ner"].unique())
with most_common_tokens:
st.header("10 most common tokens from selected class (without 'O')")
selected_class = st.selectbox(
label="Select class to show", options=possible_options
)
df_to_show = (
full_df_unzipped.loc[full_df_unzipped["ner"] == selected_class]
.groupby(["token"])
.count()
.reset_index()
.rename({"ner": "no_of_occurrences"}, axis=1)
.sort_values(by="no_of_occurrences", ascending=False)
.reset_index(drop=True)
.head(10)
)
st.dataframe(df_to_show)
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
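

# A minimal usage sketch, assuming the DatasetToShow base class provides
# `self.subsets` covering "train" and "test" and that this module is rendered
# by a Streamlit app (e.g. launched with `streamlit run <app>.py`):
if __name__ == "__main__":
    dataset = KpwrNerDataset()
    dataset.load_data()  # downloads the splits and builds the named-label frames
    dataset.show_dataset()  # renders the description, sample rows, and label statistics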