File size: 3,291 Bytes
77405f7
 
9f7f573
 
77405f7
9f7f573
 
 
77405f7
9f7f573
 
 
 
 
 
2b9022f
 
9f7f573
 
 
90966f7
 
 
 
9f7f573
 
08bbbaf
 
997a159
77405f7
08bbbaf
 
 
 
 
997a159
 
 
08bbbaf
 
 
997a159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77405f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset
from sklearn.manifold import TSNE
import streamlit as st

from clarin_datasets.dataset_to_show import DatasetToShow
from clarin_datasets.utils import embed_sentence, PLOT_COLOR_PALETTE


class CSTWikinewsDataset(DatasetToShow):
    """Streamlit page for the ``clarin-pl/cst-wikinews`` Hugging Face dataset.

    The page shows the head of a chosen subset (plus its LaTeX rendering),
    the relative label frequencies of the ``train`` and ``test`` subsets,
    and a 2-D t-SNE projection of averaged sentence-pair embeddings.
    """

    # Fixed seed so the t-SNE layout does not change between Streamlit reruns.
    _TSNE_RANDOM_STATE = 0

    def __init__(self):
        super().__init__()
        self.dataset_name = "clarin-pl/cst-wikinews"
        self.description = f"""
        Dataset link: https://huggingface.co/datasets/{self.dataset_name}
        """

    def load_data(self):
        """Download the dataset and store every subset as a pandas DataFrame.

        Populates ``self.data_dict`` with one DataFrame per entry of
        ``self.subsets``.
        """
        raw_dataset = load_dataset(self.dataset_name)
        # NOTE(review): this class never assigns ``self.subsets``; it is
        # presumably provided by ``DatasetToShow`` -- confirm. When it is
        # missing or empty, fall back to the split names of the loaded dataset
        # so the comprehension below does not fail.
        if not getattr(self, "subsets", None):
            self.subsets = list(raw_dataset.keys())
        self.data_dict = {
            subset: raw_dataset[subset].to_pandas() for subset in self.subsets
        }

    @staticmethod
    def _label_shares(frame, column_name):
        """Return the relative frequency of each value of ``frame["label"]``.

        The result is a DataFrame with two columns: ``class`` (the label
        value) and *column_name* (its share of the rows).

        The index and the series are named explicitly because the default
        names produced by ``value_counts().reset_index()`` differ between
        pandas 1.x (``index`` / ``label``) and pandas 2.x
        (``label`` / ``proportion``).
        """
        return (
            frame["label"]
            .value_counts(normalize=True)
            .rename_axis("class")
            .rename(column_name)
            .reset_index()
        )

    def show_dataset(self):
        """Render the dataset page; requires ``load_data`` to have been called."""
        header = st.container()
        dataframe_head = st.container()
        class_distribution = st.container()
        tsne_projection = st.container()
        with header:
            st.title(self.dataset_name)

        with dataframe_head:
            st.header("First 10 observations of the chosen subset")
            subset_to_show = st.selectbox(
                label="Select subset to see", options=self.subsets
            )
            df_to_show = self.data_dict[subset_to_show].head(10)
            st.dataframe(df_to_show)
            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())

        # Inner join on the label value: a class appears in the table only if
        # it occurs in both the train and the test subset.
        class_distribution_df = pd.merge(
            self._label_shares(self.data_dict["train"], "train"),
            self._label_shares(self.data_dict["test"], "test"),
            on="class",
        )

        with class_distribution:
            st.dataframe(class_distribution_df)

        with tsne_projection:
            st.header("t-SNE projection of the dataset")
            subset_to_project = st.selectbox(
                label="Select subset to project", options=self.subsets
            )
            subset_df = self.data_dict[subset_to_project]
            first_sentences = subset_df["sentence_1"].values
            second_sentences = subset_df["sentence_2"].values
            labels = subset_df["label"].values
            # NOTE(review): ``embed_sentence`` is an opaque project helper; the
            # element-wise mean below assumes it returns equally shaped numeric
            # vectors for every sentence -- confirm.
            first_sentences_embedded = np.array(
                [embed_sentence(x) for x in first_sentences]
            )
            second_sentences_embedded = np.array(
                [embed_sentence(x) for x in second_sentences]
            )
            # Each sentence pair is represented by the mean of its two
            # sentence embeddings.
            mean_embeddings = (
                first_sentences_embedded + second_sentences_embedded
            ) / 2
            reducer = TSNE(
                n_components=2,
                random_state=self._TSNE_RANDOM_STATE,
            )
            transformed_embeddings = reducer.fit_transform(mean_embeddings)
            fig, ax = plt.subplots()
            ax.scatter(
                x=transformed_embeddings[:, 0],
                y=transformed_embeddings[:, 1],
                # One colour per point, looked up by its label value.
                c=[PLOT_COLOR_PALETTE[i] for i in labels],
            )
            st.pyplot(fig)
            # Streamlit re-executes the script on every interaction; close the
            # figure so pyplot's global registry does not keep growing.
            plt.close(fig)