Spaces:

clarin-pl
/

datasets-explorer

Runtime error

App Files Files Community

Mariusz Kossakowski committed on Aug 29, 2022

Commit

494bd12

1 Parent(s): 1358c21

Add tokens count to aspectemo

Browse files

Files changed (1) hide show

clarin_datasets/aspectemo_dataset.py +45 -8

clarin_datasets/aspectemo_dataset.py CHANGED Viewed

@@ -49,6 +49,7 @@ class AspectEmoDataset(DatasetToShow):
         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
         with header:
             st.title(self.dataset_name)
@@ -58,6 +59,11 @@ class AspectEmoDataset(DatasetToShow):
             st.write(self.description)
         full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
         with dataframe_head:
             df_to_show = full_dataframe.head(10)
             st.header("First 10 observations of the dataset")
@@ -66,14 +72,16 @@ class AspectEmoDataset(DatasetToShow):
         class_distribution_dict = {}
         for subset in self.subsets:
-            all_labels = self.data_dict[subset]["labels"].tolist()
-            all_labels = [x for subarray in all_labels for x in subarray if x != 0]
-            all_labels = pd.Series(all_labels)
             class_distribution_dict[subset] = (
-                all_labels.value_counts(normalize=True)
-                .sort_index()
-                .reset_index()
-                .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(
@@ -84,4 +92,33 @@ class AspectEmoDataset(DatasetToShow):
         with class_distribution:
             st.header("Class distribution in each subset (without '0')")
             st.dataframe(class_distribution_df)
-            st.text_area(label="LaTeX code", value=class_distribution_df.style.to_latex())

         description = st.container()
         dataframe_head = st.container()
         class_distribution = st.container()
+        most_common_tokens = st.container()
         with header:
             st.title(self.dataset_name)
             st.write(self.description)
         full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
+        tokens_all = full_dataframe["tokens"].tolist()
+        tokens_all = [x for subarray in tokens_all for x in subarray]
+        labels_all = full_dataframe["labels"].tolist()
+        labels_all = [x for subarray in labels_all for x in subarray]
         with dataframe_head:
             df_to_show = full_dataframe.head(10)
             st.header("First 10 observations of the dataset")
         class_distribution_dict = {}
         for subset in self.subsets:
+            all_labels_from_subset = self.data_dict[subset]["labels"].tolist()
+            all_labels_from_subset = [
+                x for subarray in all_labels_from_subset for x in subarray if x != 0
+            ]
+            all_labels_from_subset = pd.Series(all_labels_from_subset)
             class_distribution_dict[subset] = (
+                all_labels_from_subset.value_counts(normalize=True)
+                    .sort_index()
+                    .reset_index()
+                    .rename({"index": "class", 0: subset}, axis="columns")
             )
         class_distribution_df = pd.merge(
         with class_distribution:
             st.header("Class distribution in each subset (without '0')")
             st.dataframe(class_distribution_df)
+            st.text_area(
+                label="LaTeX code", value=class_distribution_df.style.to_latex()
+            )
+        # Most common tokens from selected class (without 0)
+        full_df_unzipped = pd.DataFrame(
+            {
+                "token": tokens_all,
+                "label": labels_all,
+            }
+        )
+        full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["label"] != 0]
+        possible_options = sorted(full_df_unzipped["label"].unique())
+        with most_common_tokens:
+            st.header("10 most common tokens from selected class (without '0')")
+            selected_class = st.selectbox(
+                label="Select class to show", options=possible_options
+            )
+            df_to_show = (
+                full_df_unzipped.loc[full_df_unzipped["label"] == selected_class]
+                .groupby(["token"])
+                .count()
+                .reset_index()
+                .rename({"label": "no_of_occurrences"}, axis=1)
+                .sort_values(by="no_of_occurrences", ascending=False)
+                .reset_index(drop=True)
+                .head(10)
+            )
+            st.dataframe(df_to_show)
+            st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())