Spaces:
Runtime error
Runtime error
Init
Browse files- .gitattributes +2 -27
- .gitignore +1 -0
- README.md +4 -4
- app.py +39 -0
- data/benefit_vectors.json +3 -0
- data/condition_vectors.json +3 -0
- data/health_aspects.json +3 -0
- data/img/Jellymation.gif +3 -0
- data/products.json +3 -0
- requirements.txt +8 -0
- style.css +58 -0
- support_functions.py +296 -0
- visualize_dataset.py +128 -0
- visualize_pipeline.py +128 -0
.gitattributes
CHANGED
@@ -1,27 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
20 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
26 |
-
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
data/*.json filter=lfs diff=lfs merge=lfs -text
|
2 |
+
data/img/*.gif filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
__pycache__
|
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
-
title: Healthsea
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
|
|
1 |
---
|
2 |
+
title: Healthsea
|
3 |
+
emoji: 🪐
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: pink
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
app.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from visualize_dataset import visualize_dataset
|
3 |
+
from visualize_pipeline import visualize_pipeline
|
4 |
+
|
5 |
+
# Header
# Inject the project stylesheet so the custom CSS classes used by the two
# views are available to the HTML snippets they render.
with open("style.css", encoding="utf-8") as f:
    st.markdown("<style>" + f.read() + "</style>", unsafe_allow_html=True)

st.title("Welcome to Healthsea 🪐")

# Two-column intro: text on the left, animated image on the right.
intro, jellyfish = st.columns(2)
jellyfish.markdown("\n")

data_load_state = intro.subheader("Create easier access to health✨")

jellyfish.image("data/img/Jellymation.gif")
intro.markdown(
    "Healthsea is a spaCy v3 pipeline that analyzes user reviews to supplement products by extracting their effects on health."
)
intro.markdown(
    """With this app, you're able to explore the results of healthsea on up to 1 million reviews.
You can search for any health aspect, whether it is a disease (e.g. joint pain) or a desired health effect (e.g. energy),
the app returns a list of the best products and substances. You can also explore the capabilities of the pipeline itself by writing custom reviews and
seeing every processing step of the pipeline.
"""
)
# TODO: the markdown link below has an empty target; fill in the blog post URL.
intro.markdown(
    """If you want to learn more about healthsea, you can read more in our [blog post]().
"""
)

st.markdown("""---""")

# Dispatch to one of the two sub-apps; each renders its own page section.
app_type = st.selectbox("Choose app", ["Visualize dataset", "Visualize pipeline"])

if app_type == "Visualize dataset":
    visualize_dataset()
else:
    visualize_pipeline()
|
data/benefit_vectors.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c77f19346af726d403cb571589e9d5802385c665dfb358a86591ebdd5c43e084
|
3 |
+
size 53173260
|
data/condition_vectors.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d8700f555d2fb6c643bead407f97ee14ebaa8e1d491a16af92026c719a3d91b
|
3 |
+
size 192093565
|
data/health_aspects.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09840d8b5e503a8f62bd4bcc6455348453f111321cc108be1f115a550a34757a
|
3 |
+
size 23936080
|
data/img/Jellymation.gif
ADDED
Git LFS Details
|
data/products.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19606c9ad43abb4e9b7b679e9229b2c2101b5a748de4b5ba2c3baec4fde2f73f
|
3 |
+
size 56608006
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit>=1.2.0
|
2 |
+
plotly>=5.4.0
|
3 |
+
scikit-learn>=1.0.1
|
4 |
+
spacy-streamlit>=1.0.2
|
5 |
+
spacy>=3.1.4
|
6 |
+
benepar>=0.2.0
|
7 |
+
|
8 |
+
https://huggingface.co/edichief/en_healthsea/resolve/main/en_healthsea-any-py3-none-any.whl
|
style.css
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* KPI card: centred text inside a thin rounded border with a solid drop shadow. */
.kpi{
    text-align: center;
    border-style: solid;
    border-width: 1px;
    border-radius: 5px;
    border-color: #3b3b4d;
    box-shadow: 0px 5px #3b3b4d;
}

/* Enlarge the KPI card slightly while the pointer is over it. */
.kpi:hover {
    transform: scale(1.1);
}

/* Centred section heading.
   NOTE(review): `top` only affects positioned elements and no `position`
   is declared in this rule - confirm whether it is needed. */
.central_text{
    text-align: center;
    top: 50%;
}

/* Clause / effect box. The green colours below are defaults; the HTML built
   in visualize_pipeline.py sets background-color, box-shadow and
   border-color inline per prediction label. */
.clause{
    text-align: center;
    border-style: solid;
    border-width: 1px;
    border-radius: 5px;
    border-color: #1B7735;
    box-shadow: 0px 5px #1B7735;
    color: white;
    margin-left: 10%;
    margin-right: 10%;
    padding-top: 2%;
    padding-bottom: 2%;
    background-color: #3C9E58;
    /* Higher z-index than .clause_meta (3), so this box stacks on top. */
    z-index: 5;
    display: block;
    position: relative;
}

/* Enlarge the clause box slightly while the pointer is over it. */
.clause:hover {
    transform: scale(1.1);
}

/* Text inside a clause box. */
.clause_text{
    font-weight: bold;
}

/* Metadata box rendered below a clause box; same margins and padding,
   darker border, no background colour. */
.clause_meta{
    text-align: center;
    border-style: solid;
    border-width: 1px;
    border-radius: 5px;
    border-color: #0c0c0e;
    margin-left: 10%;
    margin-right: 10%;
    padding-top: 2%;
    padding-bottom: 2%;
    z-index: 3;
    display: block;
    position: relative;
}
|
support_functions.py
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import difflib
|
3 |
+
from spacy.tokens import Doc
|
4 |
+
|
5 |
+
import plotly
|
6 |
+
import plotly.graph_objs as go
|
7 |
+
from sklearn.manifold import TSNE
|
8 |
+
import numpy as np
|
9 |
+
|
10 |
+
|
11 |
+
class HealthseaSearch:
    """Lookup helper over the Healthsea result tables.

    The instance keeps four dict-like tables:

    * ``health_aspects`` - keyed by aspect name; each entry is read for its
      ``"products"``, ``"substance"`` and ``"alias"`` fields.
    * ``products`` - keyed by product id; entries are read for ``"name"``,
      ``"rating"`` and ``"review_count"``.
    * ``conditions`` / ``benefits`` - keyed by aspect name; returned as-is
      by :meth:`get_aspect_meta`.

    Queries are normalised by replacing spaces with underscores. A query
    that is not an exact key falls back to the closest key found by
    ``difflib.get_close_matches``.
    """

    def __init__(self, _health_aspects, _products, _conditions, _benefits):
        self.health_aspects = _health_aspects
        self.products = _products
        self.conditions = _conditions
        self.benefits = _benefits

    def __call__(self, query):
        """Return ``query`` unchanged."""
        return query

    @staticmethod
    def _closest_key(query, table):
        """Return the key of ``table`` that is most similar to ``query``.

        Raises:
            IndexError: if no key is similar enough (difflib's default
                cutoff). The exception type matches what indexing the empty
                match list used to raise.
        """
        matches = difflib.get_close_matches(query, table.keys())
        if not matches:
            raise IndexError(f"no entry is close enough to {query!r}")
        return matches[0]

    def _resolve_aspect(self, _aspect):
        """Return ``(key, entry)`` of the health aspect matching ``_aspect``."""
        key = _aspect.replace(" ", "_")
        if key not in self.health_aspects:
            # Fuzzy fallback on the *query itself* (not a fixed string).
            key = self._closest_key(key, self.health_aspects)
        return key, self.health_aspects[key]

    def _collect(self, _aspect, n, field, payload, excluded=()):
        """Gather scored entries of ``field`` for an aspect and its aliases.

        Entries are taken from the matched aspect first, then from each of
        its aliases in order. Each source contributes at most its first
        ``n`` entries (all of them when ``n == 0``). An identifier that was
        already collected, or that is listed in ``excluded``, is skipped.

        Returns:
            A list of ``(score, payload(identifier), source_key)`` tuples,
            cut to the first ``n`` collected entries (when ``n != 0``) and
            then sorted by score, highest first.
        """
        key, aspect = self._resolve_aspect(_aspect)
        collected = []
        seen = set()

        for source in [key, *aspect["alias"]]:
            scoring = self.health_aspects[source][field]
            if n != 0:
                scoring = scoring[:n]
            for entry in scoring:
                identifier = entry[1]
                if identifier in excluded or identifier in seen:
                    continue
                collected.append((entry[0], payload(identifier), source))
                seen.add(identifier)

        # The cut happens before sorting, so entries of the main aspect take
        # precedence over alias entries regardless of score.
        if n != 0 and len(collected) > n:
            collected = collected[:n]
        return sorted(collected, key=lambda item: item[0], reverse=True)

    # Load product meta
    def get_products(self, _aspect, n):
        """Return ``(score, product_meta, aspect_key)`` tuples for an aspect.

        Args:
            _aspect: Health aspect query; spaces are treated as underscores.
            n: Maximum number of results, ``0`` for no limit.

        Raises:
            IndexError: if the query matches no health aspect.
            KeyError: if a scored product id is missing from ``products``.
        """
        return self._collect(
            _aspect, n, "products", lambda product_id: self.products[product_id]
        )

    # Load product meta and return as DataFrame
    def get_products_df(self, _aspect, n):
        """Return the result of :meth:`get_products` as a typed DataFrame.

        Columns: ``product``, ``score``, ``health_aspect``, ``rating``,
        ``reviews``.
        """
        product_data = {
            "product": [],
            "score": [],
            "health_aspect": [],
            "rating": [],
            "reviews": [],
        }
        for score, meta, aspect_key in self.get_products(_aspect, n):
            product_data["score"].append(score)
            product_data["product"].append(meta["name"])
            product_data["health_aspect"].append(aspect_key)
            product_data["rating"].append(meta["rating"])
            product_data["reviews"].append(meta["review_count"])

        datatypes = {
            "product": str,
            "score": int,
            "health_aspect": str,
            "rating": str,
            "reviews": int,
        }
        return pd.DataFrame(data=product_data).astype(datatypes)

    # Get health aspect
    def get_aspect(self, _aspect):
        """Return the health-aspect entry for ``_aspect`` (exact or closest key).

        Raises:
            IndexError: if the query matches no health aspect.
        """
        return self._resolve_aspect(_aspect)[1]

    # Get health aspect meta
    def get_aspect_meta(self, _aspect):
        """Return the condition or benefit entry for ``_aspect``.

        Exact keys are looked up in ``conditions`` first, then ``benefits``.
        The fuzzy fallback searches ``conditions`` only.

        Raises:
            IndexError: if there is no exact key and no close condition key.
        """
        key = _aspect.replace(" ", "_")
        if key in self.conditions:
            return self.conditions[key]
        if key in self.benefits:
            return self.benefits[key]
        return self.conditions[self._closest_key(key, self.conditions)]

    # Plotting vectors (2D/3D)
    def tsne_plot(self, dataset):
        """Create a t-SNE model and plot it.

        Args:
            dataset: Sized sequence of ``(label, vector)`` pairs.

        Returns:
            A plotly ``Figure``: a 3D scatter when there are more than two
            points, otherwise a 2D scatter.
        """
        labels = [item[0] for item in dataset]
        tokens = [np.array(item[1]) for item in dataset]

        dimensions = 3 if len(dataset) > 2 else 2
        # scikit-learn requires perplexity to be smaller than the number of
        # samples; keep the original 40 whenever the data set is big enough.
        perplexity = min(40, max(1, len(tokens) - 1))

        tsne_model = TSNE(
            perplexity=perplexity,
            n_components=dimensions,
            init="pca",
            n_iter=2500,
            random_state=23,
        )
        new_values = tsne_model.fit_transform(tokens)

        # One coordinate list per axis, in x, y(, z) order.
        axes = {
            axis: [value[index] for value in new_values]
            for index, axis in enumerate("xyz"[:dimensions])
        }
        scatter = go.Scatter3d if dimensions == 3 else go.Scatter
        trace = scatter(
            text=labels,
            textposition="top right",
            mode="lines+markers+text",
            marker={
                "size": 10,
                "opacity": 0.8,
            },
            **axes,
        )

        # Configure the layout.
        layout = go.Layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
        )

        return go.Figure(data=[trace], layout=layout)

    # Load substance meta
    def get_substances(self, _aspect, n):
        """Return ``(score, substance, aspect_key)`` tuples for an aspect.

        The substances ``sodium``, ``sugar`` and ``sugar_alcohol`` are
        always left out.

        Args:
            _aspect: Health aspect query; spaces are treated as underscores.
            n: Maximum number of results, ``0`` for no limit.

        Raises:
            IndexError: if the query matches no health aspect.
        """
        return self._collect(
            _aspect,
            n,
            "substance",
            lambda substance: substance,
            excluded=("sodium", "sugar", "sugar_alcohol"),
        )

    # Load substance meta and return as DataFrame
    def get_substances_df(self, _aspect, n):
        """Return the result of :meth:`get_substances` as a typed DataFrame.

        Columns: ``substance``, ``score``, ``health_aspect``.
        """
        substance_data = {"substance": [], "score": [], "health_aspect": []}
        for score, substance, aspect_key in self.get_substances(_aspect, n):
            substance_data["score"].append(score)
            substance_data["substance"].append(substance)
            substance_data["health_aspect"].append(aspect_key)

        datatypes = {"substance": str, "score": int, "health_aspect": str}
        return pd.DataFrame(data=substance_data).astype(datatypes)
|
264 |
+
|
265 |
+
|
266 |
+
class HealthseaPipe:
    """Helpers for presenting the output of the Healthsea spaCy pipeline."""

    # Get Clauses and their predictions
    def get_clauses(self, doc):
        """Rebuild every clause of ``doc`` as its own ``Doc``.

        For each item in ``doc._.clauses`` the tokens between its
        ``split_indices`` are copied. When the clause has an entity, the
        first entity token is replaced by the clause's ``blinder`` text with
        ``<`` and ``>`` removed, and the remaining entity tokens are dropped.

        Returns:
            A list with one ``Doc`` per clause, in clause order.
        """
        rebuilt = []
        for clause in doc._.clauses:
            bounds = clause["split_indices"]
            segment = doc[bounds[0] : bounds[1]]

            if not clause["has_ent"]:
                # Nothing to blind: copy the tokens as they are.
                words = [token.text for token in segment]
                spaces = [token.whitespace_ for token in segment]
                rebuilt.append(Doc(doc.vocab, words=words, spaces=spaces))
                continue

            words = []
            spaces = []
            for token in segment:
                ent_start = clause["ent_indices"][0]
                if token.i == ent_start:
                    # The placeholder stands in for the whole entity.
                    placeholder = clause["blinder"].replace(">", "").replace("<", "")
                    words.append(placeholder)
                    spaces.append(True)
                    continue
                ent_end = clause["ent_indices"][1]
                if token.i in range(ent_start, ent_end):
                    # Remaining entity tokens are covered by the placeholder.
                    continue
                words.append(token.text)
                spaces.append(token.whitespace_)
            rebuilt.append(Doc(doc.vocab, words=words, spaces=spaces))

        return rebuilt
|
visualize_dataset.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from pathlib import Path
|
3 |
+
import json
|
4 |
+
from support_functions import HealthseaSearch
|
5 |
+
|
6 |
+
def visualize_dataset():
    """Render the dataset explorer page.

    Shows global KPIs, a search box with a result-count slider, the product
    ranking for the searched health aspect, a t-SNE plot of the aspect and
    its aliases (when it has any), and the substance ranking.
    """
    # Configuration
    health_aspect_path = Path("data/health_aspects.json")
    product_path = Path("data/products.json")
    condition_path = Path("data/condition_vectors.json")
    benefit_path = Path("data/benefit_vectors.json")

    # Load data
    @st.cache(allow_output_mutation=True)
    def load_data(
        _health_aspect_path: Path,
        _product_path: Path,
        _condition_path: Path,
        _benefit_path: Path,
    ):
        """Parse the four JSON tables (wrapped in ``st.cache``)."""

        def read_json(path):
            # JSON is read as UTF-8 regardless of the platform default.
            with open(path, encoding="utf-8") as reader:
                return json.load(reader)

        health_aspects = read_json(_health_aspect_path)
        products = read_json(_product_path)
        conditions = read_json(_condition_path)
        benefits = read_json(_benefit_path)
        return health_aspects, products, conditions, benefits

    # Functions
    def kpi(n, text):
        """Return the HTML of a KPI card showing ``n`` above ``text``."""
        html = f"""
        <div class='kpi'>
            <h1 class='kpi_header'>{n}</h1>
            <span>{text}</span>
        </div>
        """
        return html

    def central_text(text):
        """Return the HTML of a centred section heading."""
        html = f"""<h2 class='central_text'>{text}</h2>"""
        return html

    # Loading data
    health_aspects, products, conditions, benefits = load_data(
        health_aspect_path, product_path, condition_path, benefit_path
    )
    search_engine = HealthseaSearch(health_aspects, products, conditions, benefits)

    # KPI

    st.markdown("""---""")

    st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)

    kpi_products, kpi_reviews, kpi_condition, kpi_benefit = st.columns(4)

    kpi_products.markdown(kpi(len(products), "Products"), unsafe_allow_html=True)
    # Passed as text: the float literal 933.240 would be displayed as 933.24.
    kpi_reviews.markdown(kpi("933.240", "Reviews"), unsafe_allow_html=True)
    kpi_condition.markdown(kpi(len(conditions), "Conditions"), unsafe_allow_html=True)
    kpi_benefit.markdown(kpi(len(benefits), "Benefits"), unsafe_allow_html=True)

    st.markdown("""---""")

    # Search
    search = st.text_input(label="Search for an health aspect", value="joint pain")
    n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)

    st.markdown("""---""")
    st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)

    # DataFrame
    st.write(search_engine.get_products_df(search, n))

    # KPI & Alias
    # Resolve the searched aspect once and reuse it for every KPI below.
    aspect = search_engine.get_aspect(search)
    aspect_alias = aspect["alias"]
    has_alias = len(aspect_alias) > 0

    kpi_columns = st.columns(3 if has_alias else 2)
    main_aspect = search_engine.get_aspect_meta(search)
    kpi_columns[0].markdown(
        kpi(main_aspect["frequency"], "Mentions"),
        unsafe_allow_html=True,
    )
    kpi_columns[1].markdown(
        kpi(len(aspect["products"]), "Products"),
        unsafe_allow_html=True,
    )

    if has_alias:
        kpi_columns[2].markdown(
            kpi(len(aspect_alias), "Similar health aspects"),
            unsafe_allow_html=True,
        )

        # Plot the searched aspect together with all of its aliases.
        vectors = [(main_aspect["name"], main_aspect["vector"])]
        for alias in aspect_alias:
            current_aspect = search_engine.get_aspect_meta(alias)
            vectors.append((current_aspect["name"], current_aspect["vector"]))
        st.markdown("\n")
        st.write(search_engine.tsne_plot(vectors))

    st.markdown("""---""")

    # Substances
    st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)

    # DataFrame
    st.write(search_engine.get_substances_df(search, n))
    # The first column stays empty; the KPI card goes in the second one.
    kpi_tmp, kpi_substances = st.columns(2)
    kpi_substances.markdown(
        kpi(len(aspect["substance"]), "Substances"),
        unsafe_allow_html=True,
    )
|
visualize_pipeline.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import spacy
|
3 |
+
from spacy_streamlit import visualize_ner
|
4 |
+
from support_functions import HealthseaPipe
|
5 |
+
import operator
|
6 |
+
|
7 |
+
def visualize_pipeline():
    """Render the pipeline explorer page.

    Runs the ``en_healthsea`` spaCy pipeline on a review (typed by the user
    or picked from predefined examples) and shows the named entities, the
    classified clauses, the aggregated health effects and the raw custom
    attributes ``doc._.clauses`` and ``doc._.health_effects``.
    """
    # Imported under its own name because the helpers below use ``html`` as
    # a local variable.
    from html import escape

    healthsea_pipe = HealthseaPipe()

    # (background colour, border/shadow colour) per predicted label.
    color_code = {
        "POSITIVE": ("#3C9E58", "#1B7735"),
        "NEGATIVE": ("#FF166A", "#C0094B"),
        "NEUTRAL": ("#7E7E7E", "#4E4747"),
        "ANAMNESIS": ("#E49A55", "#AD6B2D"),
    }

    example_reviews = [
        "This is great for joint pain.",
        "This help joint pain but causes rashes",
        "I'm diagnosed with gastritis. This product helped!",
        "Made my insomnia worse",
        "Didn't help my energy levels",
    ]

    # Functions
    def central_text(text):
        """Return the HTML of a centred section heading."""
        html = f"""<h2 class='central_text'>{text}</h2>"""
        return html

    def format_clause(text, meta, pred):
        """Return the HTML of a clause box coloured by ``pred`` with a meta box.

        ``text`` and ``meta`` derive from user input and are HTML-escaped.
        """
        html = f"""
        <div>
            <div class="clause" style="background-color:{color_code[pred][0]} ; box-shadow: 0px 5px {color_code[pred][1]}; border-color:{color_code[pred][1]};">
                <div class="clause_text">{escape(text)}</div>
            </div>
            <div class="clause_meta">
                <div>{escape(meta)}</div>
            </div>
        </div>"""
        return html

    def format_effect(text, pred):
        """Return the HTML of an effect box coloured by ``pred``.

        ``text`` derives from user input and is HTML-escaped.
        """
        html = f"""
        <div>
            <div class="clause" style="background-color:{color_code[pred][0]} ; box-shadow: 0px 5px {color_code[pred][1]}; border-color:{color_code[pred][1]};">
                <div class="clause_text">{escape(text)}</div>
            </div>
        </div>"""
        return html

    # Load model
    @st.cache(allow_output_mutation=True)
    def load_model(name):
        """Load a spaCy pipeline; cached so reruns do not load it again."""
        return spacy.load(name)

    nlp = load_model("en_healthsea")

    # Pipeline
    st.markdown("""---""")

    st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)

    check = st.checkbox("Use predefined examples")

    if not check:
        text = st.text_input(label="Write a review", value="This is great for joint pain!")
    else:
        text = st.selectbox("Predefined example reviews", example_reviews)
    doc = nlp(text)

    # NER
    visualize_ner(
        doc,
        labels=nlp.get_pipe("ner").labels,
        show_table=False,
        title="✨ Named Entity Recognition",
        colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
    )

    st.markdown("""---""")

    # Segmentation, Blinding, Classification
    st.markdown("## 🔮 Segmentation, Blinding, Classification")

    clauses = healthsea_pipe.get_clauses(doc)
    for doc_clause, clause in zip(clauses, doc._.clauses):
        # The label with the highest score is the clause's classification.
        classification = max(clause["cats"].items(), key=operator.itemgetter(1))[0]
        percentage = round(float(clause["cats"][classification]) * 100, 2)
        meta = f"{clause['ent_name']} ({classification} {percentage}%)"

        st.markdown(
            format_clause(doc_clause.text, meta, classification), unsafe_allow_html=True
        )
        st.markdown("\n")

    st.markdown("""---""")

    # Aggregation
    st.markdown("## 🔗 Aggregation")

    for effect in doc._.health_effects:
        st.markdown(
            format_effect(
                f"{doc._.health_effects[effect]['effect']} effect on {effect}",
                doc._.health_effects[effect]["effect"],
            ),
            unsafe_allow_html=True,
        )
        st.markdown("\n")

    st.markdown("""---""")
    # Indepth
    st.markdown("## 🔧 Pipeline attributes")
    clauses_col, effect_col = st.columns(2)

    clauses_col.markdown("### doc._.clauses")
    for clause in doc._.clauses:
        clauses_col.json(clause)
    effect_col.markdown("### doc._.health_effects")
    effect_col.json(doc._.health_effects)
|