Spaces:
Runtime error
Runtime error
Mariusz Kossakowski
committed on
Commit
·
494bd12
1
Parent(s):
1358c21
Add tokens count to aspectemo
Browse files
clarin_datasets/aspectemo_dataset.py
CHANGED
@@ -49,6 +49,7 @@ class AspectEmoDataset(DatasetToShow):
|
|
49 |
description = st.container()
|
50 |
dataframe_head = st.container()
|
51 |
class_distribution = st.container()
|
|
|
52 |
|
53 |
with header:
|
54 |
st.title(self.dataset_name)
|
@@ -58,6 +59,11 @@ class AspectEmoDataset(DatasetToShow):
|
|
58 |
st.write(self.description)
|
59 |
|
60 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
|
|
|
|
|
|
|
|
|
|
61 |
with dataframe_head:
|
62 |
df_to_show = full_dataframe.head(10)
|
63 |
st.header("First 10 observations of the dataset")
|
@@ -66,14 +72,16 @@ class AspectEmoDataset(DatasetToShow):
|
|
66 |
|
67 |
class_distribution_dict = {}
|
68 |
for subset in self.subsets:
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
72 |
class_distribution_dict[subset] = (
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
)
|
78 |
|
79 |
class_distribution_df = pd.merge(
|
@@ -84,4 +92,33 @@ class AspectEmoDataset(DatasetToShow):
|
|
84 |
with class_distribution:
|
85 |
st.header("Class distribution in each subset (without '0')")
|
86 |
st.dataframe(class_distribution_df)
|
87 |
-
st.text_area(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
description = st.container()
|
50 |
dataframe_head = st.container()
|
51 |
class_distribution = st.container()
|
52 |
+
most_common_tokens = st.container()
|
53 |
|
54 |
with header:
|
55 |
st.title(self.dataset_name)
|
|
|
59 |
st.write(self.description)
|
60 |
|
61 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
62 |
+
tokens_all = full_dataframe["tokens"].tolist()
|
63 |
+
tokens_all = [x for subarray in tokens_all for x in subarray]
|
64 |
+
labels_all = full_dataframe["labels"].tolist()
|
65 |
+
labels_all = [x for subarray in labels_all for x in subarray]
|
66 |
+
|
67 |
with dataframe_head:
|
68 |
df_to_show = full_dataframe.head(10)
|
69 |
st.header("First 10 observations of the dataset")
|
|
|
72 |
|
73 |
class_distribution_dict = {}
|
74 |
for subset in self.subsets:
|
75 |
+
all_labels_from_subset = self.data_dict[subset]["labels"].tolist()
|
76 |
+
all_labels_from_subset = [
|
77 |
+
x for subarray in all_labels_from_subset for x in subarray if x != 0
|
78 |
+
]
|
79 |
+
all_labels_from_subset = pd.Series(all_labels_from_subset)
|
80 |
class_distribution_dict[subset] = (
|
81 |
+
all_labels_from_subset.value_counts(normalize=True)
|
82 |
+
.sort_index()
|
83 |
+
.reset_index()
|
84 |
+
.rename({"index": "class", 0: subset}, axis="columns")
|
85 |
)
|
86 |
|
87 |
class_distribution_df = pd.merge(
|
|
|
92 |
with class_distribution:
|
93 |
st.header("Class distribution in each subset (without '0')")
|
94 |
st.dataframe(class_distribution_df)
|
95 |
+
st.text_area(
|
96 |
+
label="LaTeX code", value=class_distribution_df.style.to_latex()
|
97 |
+
)
|
98 |
+
|
99 |
+
# Most common tokens from selected class (without 0)
|
100 |
+
full_df_unzipped = pd.DataFrame(
|
101 |
+
{
|
102 |
+
"token": tokens_all,
|
103 |
+
"label": labels_all,
|
104 |
+
}
|
105 |
+
)
|
106 |
+
full_df_unzipped = full_df_unzipped.loc[full_df_unzipped["label"] != 0]
|
107 |
+
possible_options = sorted(full_df_unzipped["label"].unique())
|
108 |
+
with most_common_tokens:
|
109 |
+
st.header("10 most common tokens from selected class (without '0')")
|
110 |
+
selected_class = st.selectbox(
|
111 |
+
label="Select class to show", options=possible_options
|
112 |
+
)
|
113 |
+
df_to_show = (
|
114 |
+
full_df_unzipped.loc[full_df_unzipped["label"] == selected_class]
|
115 |
+
.groupby(["token"])
|
116 |
+
.count()
|
117 |
+
.reset_index()
|
118 |
+
.rename({"label": "no_of_occurrences"}, axis=1)
|
119 |
+
.sort_values(by="no_of_occurrences", ascending=False)
|
120 |
+
.reset_index(drop=True)
|
121 |
+
.head(10)
|
122 |
+
)
|
123 |
+
st.dataframe(df_to_show)
|
124 |
+
st.text_area(label="LaTeX code", value=df_to_show.style.to_latex())
|