Spaces:
Runtime error
Runtime error
Add more information and improve performance
Browse files- support_functions.py +58 -0
- visualize_dataset.py +17 -1
- visualize_pipeline.py +14 -1
support_functions.py
CHANGED
@@ -262,6 +262,64 @@ class HealthseaSearch:
|
|
262 |
|
263 |
return df
|
264 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
|
266 |
class HealthseaPipe:
|
267 |
|
|
|
262 |
|
263 |
return df
|
264 |
|
265 |
+
# Rank all health aspects (conditions and benefits) by how often they are mentioned
|
266 |
+
def get_all_conditions(self):
    """Return every known condition with its frequency, most frequent first.

    Returns:
        list[tuple]: ``(frequency, condition_key)`` tuples, one per key of
        ``self.conditions``, sorted by frequency in descending order.
        Entries with the same frequency keep the iteration order of
        ``self.conditions``.
    """
    ranked = [
        (self.conditions[condition_key]["frequency"], condition_key)
        for condition_key in self.conditions
    ]
    # Sort on the frequency only: the sort is stable, so ties keep their
    # original order instead of being broken by comparing the keys.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked
|
273 |
+
|
274 |
+
def get_all_conditions_df(self, limit=1000):
    """Build a DataFrame of the most frequently mentioned conditions.

    Args:
        limit: Maximum number of rows to include. Defaults to 1000, the
            cut-off that used to be hard-coded here.

    Returns:
        pandas.DataFrame: Two columns, ``"Condition"`` (cast to ``str``)
        and ``"Frequency"`` (cast to ``int``), with rows in the order
        returned by ``get_all_conditions``.
    """
    top_conditions = self.get_all_conditions()[:limit]

    # Each entry is a (frequency, condition_key) tuple.
    df = pd.DataFrame(
        data={
            "Condition": [entry[1] for entry in top_conditions],
            "Frequency": [entry[0] for entry in top_conditions],
        }
    )

    # Cast explicitly instead of relying on pandas' dtype inference.
    return df.astype({"Frequency": int, "Condition": str})
|
293 |
+
|
294 |
+
|
295 |
+
def get_all_benefits(self):
    """Return every known benefit with its frequency, most frequent first.

    Returns:
        list[tuple]: ``(frequency, benefit_key)`` tuples, one per key of
        ``self.benefits``, sorted by frequency in descending order.
        Entries with the same frequency keep the iteration order of
        ``self.benefits``.
    """
    ranked = [
        (self.benefits[benefit_key]["frequency"], benefit_key)
        for benefit_key in self.benefits
    ]
    # Sort on the frequency only: the sort is stable, so ties keep their
    # original order instead of being broken by comparing the keys.
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked
|
302 |
+
|
303 |
+
def get_all_benefits_df(self, limit=1000):
    """Build a DataFrame of the most frequently mentioned benefits.

    Args:
        limit: Maximum number of rows to include. Defaults to 1000, the
            cut-off that used to be hard-coded here.

    Returns:
        pandas.DataFrame: Two columns, ``"Benefit"`` (cast to ``str``)
        and ``"Frequency"`` (cast to ``int``), with rows in the order
        returned by ``get_all_benefits``.
    """
    top_benefits = self.get_all_benefits()[:limit]

    # Each entry is a (frequency, benefit_key) tuple.
    df = pd.DataFrame(
        data={
            "Benefit": [entry[1] for entry in top_benefits],
            "Frequency": [entry[0] for entry in top_benefits],
        }
    )

    # Cast explicitly instead of relying on pandas' dtype inference.
    return df.astype({"Frequency": int, "Benefit": str})
|
322 |
+
|
323 |
|
324 |
class HealthseaPipe:
|
325 |
|
visualize_dataset.py
CHANGED
@@ -10,7 +10,6 @@ def visualize_dataset():
|
|
10 |
condition_path = Path("data/condition_vectors.json")
|
11 |
benefit_path = Path("data/benefit_vectors.json")
|
12 |
|
13 |
-
|
14 |
# Load data
|
15 |
@st.cache(allow_output_mutation=True)
|
16 |
def load_data(
|
@@ -53,6 +52,8 @@ def visualize_dataset():
|
|
53 |
|
54 |
# KPI
|
55 |
|
|
|
|
|
56 |
st.markdown("""---""")
|
57 |
|
58 |
st.markdown(central_text("π Dataset"), unsafe_allow_html=True)
|
@@ -66,6 +67,17 @@ def visualize_dataset():
|
|
66 |
|
67 |
st.markdown("""---""")
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
# Search
|
70 |
search = st.text_input(label="Search for an health aspect", value="joint pain")
|
71 |
n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
|
@@ -73,6 +85,8 @@ def visualize_dataset():
|
|
73 |
st.markdown("""---""")
|
74 |
st.markdown(central_text("π§ Products"), unsafe_allow_html=True)
|
75 |
|
|
|
|
|
76 |
# DataFrame
|
77 |
st.write(search_engine.get_products_df(search, n))
|
78 |
|
@@ -101,6 +115,7 @@ def visualize_dataset():
|
|
101 |
current_aspect = search_engine.get_aspect_meta(aspect)
|
102 |
vectors.append((current_aspect["name"], current_aspect["vector"]))
|
103 |
st.markdown("\n")
|
|
|
104 |
st.write(search_engine.tsne_plot(vectors))
|
105 |
|
106 |
else:
|
@@ -118,6 +133,7 @@ def visualize_dataset():
|
|
118 |
|
119 |
# Substances
|
120 |
st.markdown(central_text("π― Substances"), unsafe_allow_html=True)
|
|
|
121 |
|
122 |
# DataFrame
|
123 |
st.write(search_engine.get_substances_df(search, n))
|
|
|
10 |
condition_path = Path("data/condition_vectors.json")
|
11 |
benefit_path = Path("data/benefit_vectors.json")
|
12 |
|
|
|
13 |
# Load data
|
14 |
@st.cache(allow_output_mutation=True)
|
15 |
def load_data(
|
|
|
52 |
|
53 |
# KPI
|
54 |
|
55 |
+
st.markdown("""This app presents the analyzed dataset of up to one million reviews. You can search for the best products and substances to any health aspect based on what reviewers wrote in their reviews.""")
|
56 |
+
|
57 |
st.markdown("""---""")
|
58 |
|
59 |
st.markdown(central_text("π Dataset"), unsafe_allow_html=True)
|
|
|
67 |
|
68 |
st.markdown("""---""")
|
69 |
|
70 |
+
# Expander
|
71 |
+
show_conditions, show_benefits = st.columns(2)
|
72 |
+
|
73 |
+
with show_conditions.expander("Top 1000 mentioned Conditions"):
|
74 |
+
st.write(search_engine.get_all_conditions_df())
|
75 |
+
|
76 |
+
with show_benefits.expander("Top 1000 mentioned Benefits"):
|
77 |
+
st.write(search_engine.get_all_benefits_df())
|
78 |
+
|
79 |
+
st.markdown("""---""")
|
80 |
+
|
81 |
# Search
|
82 |
search = st.text_input(label="Search for an health aspect", value="joint pain")
|
83 |
n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
|
|
|
85 |
st.markdown("""---""")
|
86 |
st.markdown(central_text("π§ Products"), unsafe_allow_html=True)
|
87 |
|
88 |
+
st.markdown("""The products are scored based on what reviewers say. Additional variables in the scoring function are product rating, helpful count and whether the review is considered 'fake'. """)
|
89 |
+
|
90 |
# DataFrame
|
91 |
st.write(search_engine.get_products_df(search, n))
|
92 |
|
|
|
115 |
current_aspect = search_engine.get_aspect_meta(aspect)
|
116 |
vectors.append((current_aspect["name"], current_aspect["vector"]))
|
117 |
st.markdown("\n")
|
118 |
+
st.markdown("""To improve the search, the table also shows results of other health aspects with a high similarity""")
|
119 |
st.write(search_engine.tsne_plot(vectors))
|
120 |
|
121 |
else:
|
|
|
133 |
|
134 |
# Substances
|
135 |
st.markdown(central_text("π― Substances"), unsafe_allow_html=True)
|
136 |
+
st.markdown("""The scores of the substances are based on the products""")
|
137 |
|
138 |
# DataFrame
|
139 |
st.write(search_engine.get_substances_df(search, n))
|
visualize_pipeline.py
CHANGED
@@ -64,7 +64,9 @@ def visualize_pipeline():
|
|
64 |
# Load model
|
65 |
try:
|
66 |
load_state.markdown ("#### Loading model...")
|
67 |
-
|
|
|
|
|
68 |
|
69 |
# Download model
|
70 |
except LookupError:
|
@@ -75,6 +77,8 @@ def visualize_pipeline():
|
|
75 |
load_state.markdown ("#### Loading done!")
|
76 |
|
77 |
# Pipeline
|
|
|
|
|
78 |
st.markdown("""---""")
|
79 |
|
80 |
st.markdown(central_text("βοΈ Pipeline"), unsafe_allow_html=True)
|
@@ -85,6 +89,8 @@ def visualize_pipeline():
|
|
85 |
text = st.text_input(label="Write a review", value="This is great for joint pain!")
|
86 |
else:
|
87 |
text = st.selectbox("Predefined example reviews", example_reviews)
|
|
|
|
|
88 |
doc = nlp(text)
|
89 |
|
90 |
# NER
|
@@ -96,6 +102,8 @@ def visualize_pipeline():
|
|
96 |
colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
|
97 |
)
|
98 |
|
|
|
|
|
99 |
st.markdown("""---""")
|
100 |
|
101 |
# Segmentation, Blinding, Classification
|
@@ -112,6 +120,9 @@ def visualize_pipeline():
|
|
112 |
)
|
113 |
st.markdown("\n")
|
114 |
|
|
|
|
|
|
|
115 |
st.markdown("""---""")
|
116 |
|
117 |
# Aggregation
|
@@ -127,6 +138,8 @@ def visualize_pipeline():
|
|
127 |
)
|
128 |
st.markdown("\n")
|
129 |
|
|
|
|
|
130 |
st.markdown("""---""")
|
131 |
# Indepth
|
132 |
st.markdown("## π§ Pipeline attributes")
|
|
|
64 |
# Load model
|
65 |
try:
|
66 |
load_state.markdown ("#### Loading model...")
|
67 |
+
if "model" not in st.session_state:
|
68 |
+
nlp = spacy.load("en_healthsea")
|
69 |
+
st.session_state["model"] = nlp
|
70 |
|
71 |
# Download model
|
72 |
except LookupError:
|
|
|
77 |
load_state.markdown ("#### Loading done!")
|
78 |
|
79 |
# Pipeline
|
80 |
+
st.markdown("""This app visualizes the processing steps of the Healthsea pipeline. You can test it by writing an example review.""")
|
81 |
+
|
82 |
st.markdown("""---""")
|
83 |
|
84 |
st.markdown(central_text("βοΈ Pipeline"), unsafe_allow_html=True)
|
|
|
89 |
text = st.text_input(label="Write a review", value="This is great for joint pain!")
|
90 |
else:
|
91 |
text = st.selectbox("Predefined example reviews", example_reviews)
|
92 |
+
|
93 |
+
nlp = st.session_state["model"]
|
94 |
doc = nlp(text)
|
95 |
|
96 |
# NER
|
|
|
102 |
colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
|
103 |
)
|
104 |
|
105 |
+
st.markdown("""The first processing step is to identify Conditions or Benefits with Named Entity Recognition. Conditions are diseases, symptoms and general health problems (e.g. joint pain), while Benefits are positive desired health aspects (e.g. energy)""")
|
106 |
+
|
107 |
st.markdown("""---""")
|
108 |
|
109 |
# Segmentation, Blinding, Classification
|
|
|
120 |
)
|
121 |
st.markdown("\n")
|
122 |
|
123 |
+
st.markdown("""The review is segmented into sub-clauses and then classified by a Text Classification model. We additionally blind the found entities to improve generalization and also to inform the model about our current target entity of which we want to get the prediction of.
|
124 |
+
The Text Classification predicts four exclusive classes: 'Positive', 'Negative', 'Neutral', 'Anamnesis', they represent the health effect.""")
|
125 |
+
|
126 |
st.markdown("""---""")
|
127 |
|
128 |
# Aggregation
|
|
|
138 |
)
|
139 |
st.markdown("\n")
|
140 |
|
141 |
+
st.markdown("""Multiple classification are aggregated into one final classification.""")
|
142 |
+
|
143 |
st.markdown("""---""")
|
144 |
# Indepth
|
145 |
st.markdown("## π§ Pipeline attributes")
|