datasets-explorer / clarin_datasets /cst_wikinews_dataset.py
Mariusz Kossakowski
Add datasets links
2b9022f
raw
history blame
1.91 kB
import pandas as pd
from datasets import load_dataset
import streamlit as st
from clarin_datasets.dataset_to_show import DatasetToShow
class CSTWikinewsDataset(DatasetToShow):
    """Streamlit view of the ``clarin-pl/cst-wikinews`` Hugging Face dataset.

    ``load_data`` converts the dataset's subsets to pandas DataFrames and
    ``show_dataset`` renders a preview of a chosen subset together with the
    relative label frequencies of the "train" and "test" subsets.
    """

    def __init__(self):
        """Set the dataset identifier and the description shown to the user."""
        super().__init__()
        self.dataset_name = "clarin-pl/cst-wikinews"
        self.description = f"""
Dataset link: https://huggingface.co/datasets/{self.dataset_name}
"""

    def load_data(self):
        """Load the dataset and store every subset as a pandas DataFrame.

        Populates ``self.data_dict`` with one ``{subset name: DataFrame}``
        entry per name listed in ``self.subsets``.
        """
        # NOTE(review): ``self.subsets`` is not assigned in this class;
        # presumably DatasetToShow provides it -- confirm against the base.
        raw_dataset = load_dataset(self.dataset_name)
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    def _label_shares(self, subset):
        """Return the relative frequency of each label in *subset*.

        The result has two columns: ``class`` (the label value) and a column
        named after *subset* holding that label's share of the rows.
        """
        shares = self.data_dict[subset]["label"].value_counts(normalize=True)
        # Name the index and the value column explicitly instead of relying
        # on the default names produced by ``reset_index()``: those defaults
        # changed in pandas 2.0 ("index"/"label" became "label"/"proportion"),
        # which made the former rename-then-merge-on-"class" approach fail.
        return shares.rename_axis("class").reset_index(name=subset)

    def show_dataset(self):
        """Render the dataset title, a subset preview and label distribution.

        The preview shows the first 10 rows of the subset picked in a select
        box, plus the same rows exported as LaTeX. The distribution table has
        one row per label present in both "train" and "test" (inner join).
        """
        # Containers are created up front so the page layout order is fixed
        # regardless of the order in which they are filled below.
        header = st.container()
        dataframe_head = st.container()
        class_distribution = st.container()

        with header:
            st.title(self.dataset_name)

        with dataframe_head:
            st.header("First 10 observations of the chosen subset")
            subset_to_show = st.selectbox(
                label="Select subset to see", options=self.subsets
            )
            df_to_show = self.data_dict[subset_to_show].head(10)
            st.dataframe(df_to_show)
            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

        # Columns come out as: "class", "train", "test".
        class_distribution_df = pd.merge(
            self._label_shares("train"),
            self._label_shares("test"),
            on="class",
        )

        with class_distribution:
            st.dataframe(class_distribution_df)