Mariusz Kossakowski committed on
Commit
494bd12
·
1 Parent(s): 1358c21

Add tokens count to aspectemo

Browse files
clarin_datasets/aspectemo_dataset.py CHANGED
@@ -49,6 +49,7 @@ class AspectEmoDataset(DatasetToShow):
49
  description = st.container()
50
  dataframe_head = st.container()
51
  class_distribution = st.container()
 
52
 
53
  with header:
54
  st.title(self.dataset_name)
@@ -58,6 +59,11 @@ class AspectEmoDataset(DatasetToShow):
58
  st.write(self.description)
59
 
60
  full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
 
 
 
 
 
61
  with dataframe_head:
62
  df_to_show = full_dataframe.head(10)
63
  st.header("First 10 observations of the dataset")
@@ -66,14 +72,16 @@ class AspectEmoDataset(DatasetToShow):
66
 
67
  class_distribution_dict = {}
68
  for subset in self.subsets:
69
- all_labels = self.data_dict[subset]["labels"].tolist()
70
- all_labels = [x for subarray in all_labels for x in subarray if x != 0]
71
- all_labels = pd.Series(all_labels)
 
 
72
  class_distribution_dict[subset] = (
73
- all_labels.value_counts(normalize=True)
74
- .sort_index()
75
- .reset_index()
76
- .rename({"index": "class", 0: subset}, axis="columns")
77
  )
78
 
79
  class_distribution_df = pd.merge(
@@ -84,4 +92,33 @@ class AspectEmoDataset(DatasetToShow):
84
  with class_distribution:
85
  st.header("Class distribution in each subset (without '0')")
86
  st.dataframe(class_distribution_df)
87
- st.text_area(label="LaTeX code", value=class_distribution_df.style.to_latex())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  description = st.container()
50
  dataframe_head = st.container()
51
  class_distribution = st.container()
52
+ most_common_tokens = st.container()
53
 
54
  with header:
55
  st.title(self.dataset_name)
 
59
  st.write(self.description)
60
 
61
  full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
62
+ tokens_all = full_dataframe["tokens"].tolist()
63
+ tokens_all = [x for subarray in tokens_all for x in subarray]
64
+ labels_all = full_dataframe["labels"].tolist()
65
+ labels_all = [x for subarray in labels_all for x in subarray]
66
+
67
  with dataframe_head:
68
  df_to_show = full_dataframe.head(10)
69
  st.header("First 10 observations of the dataset")
 
72
 
73
  class_distribution_dict = {}
74
  for subset in self.subsets:
75
+ all_labels_from_subset = self.data_dict[subset]["labels"].tolist()
76
+ all_labels_from_subset = [
77
+ x for subarray in all_labels_from_subset for x in subarray if x != 0
78
+ ]
79
+ all_labels_from_subset = pd.Series(all_labels_from_subset)
80
  class_distribution_dict[subset] = (
81
+ all_labels_from_subset.value_counts(normalize=True)
82
+ .sort_index()
83
+ .reset_index()
84
+ .rename({"index": "class", 0: subset}, axis="columns")
85
  )
86
 
87
  class_distribution_df = pd.merge(
 
92
  with class_distribution:
93
  st.header("Class distribution in each subset (without '0')")
94
  st.dataframe(class_distribution_df)
95
+ st.text_area(
96
+ label="LaTeX code", value=class_distribution_df.style.to_latex()
97
+ )
98
+
99
+ # Most common tokens from selected class (without 0)
100
+ full_df_unzipped = pd.DataFrame(
101
+ {
102
+ "token": tokens_all,
103
+ "label": labels_all,
104
+ }
105
+ )
106
+ full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["label"] != 0]
107
+ possible_options = sorted(full_df_unzipped["label"].unique())
108
+ with most_common_tokens:
109
+ st.header("10 most common tokens from selected class (without '0')")
110
+ selected_class = st.selectbox(
111
+ label="Select class to show", options=possible_options
112
+ )
113
+ df_to_show = (
114
+ full_df_unzipped.loc[full_df_unzipped["label"] == selected_class]
115
+ .groupby(["token"])
116
+ .count()
117
+ .reset_index()
118
+ .rename({"label": "no_of_occurrences"}, axis=1)
119
+ .sort_values(by="no_of_occurrences", ascending=False)
120
+ .reset_index(drop=True)
121
+ .head(10)
122
+ )
123
+ st.dataframe(df_to_show)
124
+ st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())