Spaces:

spacy
/

healthsea-demo

Runtime error

App Files Files Community

edichief committed on Nov 26, 2021

Commit

69abbc0

1 Parent(s): 1550afa

Init

Browse files

Files changed (14) hide show

.gitattributes +2 -27
.gitignore +1 -0
README.md +4 -4
app.py +39 -0
data/benefit_vectors.json +3 -0
data/condition_vectors.json +3 -0
data/health_aspects.json +3 -0
data/img/Jellymation.gif +3 -0
data/products.json +3 -0
requirements.txt +8 -0
style.css +58 -0
support_functions.py +296 -0
visualize_dataset.py +128 -0
visualize_pipeline.py +128 -0

.gitattributes CHANGED Viewed

@@ -1,27 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bin.* filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zstandard filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text


1	+ data/*.json filter=lfs diff=lfs merge=lfs -text
2	+ data/img/*.gif filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
-title: Healthsea Demo
-emoji: 👀
-colorFrom: green
-colorTo: gray
 sdk: streamlit
 app_file: app.py
 pinned: false

 ---
+title: Healthsea
+emoji: 🪐
+colorFrom: yellow
+colorTo: pink
 sdk: streamlit
 app_file: app.py
 pinned: false

app.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import streamlit as st
+from visualize_dataset import visualize_dataset
+from visualize_pipeline import visualize_pipeline
+# Header
+# Entry script of the Streamlit app: renders the landing header and then hands
+# over to one of the two sub-pages chosen in the select box at the bottom.
+# Inject the stylesheet into the page; the path is relative to the working directory.
+with open("style.css") as f:
+    st.markdown("<style>" + f.read() + "</style>", unsafe_allow_html=True)
+st.title("Welcome to Healthsea 🪐")
+# Two-column layout: introduction text on the left, animation on the right.
+intro, jellyfish = st.columns(2)
+# Emits a markdown element containing only a newline before the image
+# (presumably used as vertical spacing -- confirm visually).
+jellyfish.markdown("\n")
+# NOTE(review): data_load_state is assigned but never read in this script.
+data_load_state = intro.subheader("Create easier access to health✨")
+jellyfish.image("data/img/Jellymation.gif")
+intro.markdown(
+    "Healthsea is a spaCy v3 pipeline that analyzes user reviews to supplement products by extracting their effects on health."
+)
+intro.markdown(
+    """With this app, you're able to explore the results of healthsea on up to 1 million reviews.
+    You can search for any health aspect, whether it is an disease (e.g. joint pain) or a desired health effect such as (e.g. energy),
+    the app returns a list of the best products and substances. You can also explore the capabilities of the pipeline itself, by writing custom reviews and
+    see every processing step of the pipeline.
+    """
+)
+# NOTE(review): the markdown link below has an empty target "()", so it points nowhere.
+intro.markdown(
+    """If you want to learn more about healthsea, you can read more in our [blog post]().
+    """
+)
+st.markdown("""---""")
+# Page switch: the selected label decides which sub-page function renders below the header.
+app_type = st.selectbox("Choose app", ["Visualize dataset", "Visualize pipeline"])
+if app_type == "Visualize dataset":
+    visualize_dataset()
+else:
+    visualize_pipeline()

data/benefit_vectors.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c77f19346af726d403cb571589e9d5802385c665dfb358a86591ebdd5c43e084
+size 53173260

data/condition_vectors.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d8700f555d2fb6c643bead407f97ee14ebaa8e1d491a16af92026c719a3d91b
+size 192093565

data/health_aspects.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09840d8b5e503a8f62bd4bcc6455348453f111321cc108be1f115a550a34757a
+size 23936080

data/img/Jellymation.gif ADDED Viewed

Git LFS Details

SHA256: c796dd42c6b93dbf75ca3045f44ad9471db737f1452fbcdd488c7b531aae79b1
Pointer size: 133 Bytes
Size of remote file: 25.4 MB

data/products.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19606c9ad43abb4e9b7b679e9229b2c2101b5a748de4b5ba2c3baec4fde2f73f
+size 56608006

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit>=1.2.0
+plotly>=5.4.0
+scikit-learn>=1.0.1
+spacy-streamlit>=1.0.2
+spacy>=3.1.4
+benepar>=0.2.0
+https://huggingface.co/edichief/en_healthsea/resolve/main/en_healthsea-any-py3-none-any.whl

style.css ADDED Viewed

	@@ -0,0 +1,58 @@

+/* Bordered, centered box used for the numeric KPI tiles (class set by the kpi() HTML helpers). */
+.kpi{
+    text-align: center;
+    border-style: solid;
+    border-width:  1px;
+    border-radius: 5px;
+    border-color: #3b3b4d;
+    box-shadow: 0px 5px #3b3b4d;
+}
+/* Slightly enlarge a KPI tile while hovered. */
+.kpi:hover {
+    transform: scale(1.1);
+  }
+/* Centered section headline (class set by the central_text() HTML helpers). */
+/* NOTE(review): `top` only takes effect on positioned elements; no `position` is set here. */
+.central_text{
+    text-align: center;
+    top: 50%;
+}
+/* Clause box: default colors are green; visualize_pipeline.py overrides background, */
+/* shadow and border color per prediction through an inline style attribute. */
+.clause{
+    text-align: center;
+    border-style: solid;
+    border-width:  1px;
+    border-radius: 5px;
+    border-color: #1B7735;
+    box-shadow: 0px 5px #1B7735;
+    color: white;
+    margin-left: 10%;
+    margin-right: 10%;
+    padding-top: 2%;
+    padding-bottom: 2%;
+    background-color: #3C9E58;
+    z-index: 5;
+    display: block;
+    position: relative;
+}
+/* Slightly enlarge a clause box while hovered. */
+.clause:hover {
+    transform: scale(1.1);
+  }
+/* Text inside a clause box. */
+.clause_text{
+    font-weight: bold;
+}
+/* Meta box rendered directly after a clause box; lower z-index than .clause (3 vs 5). */
+.clause_meta{
+    text-align: center;
+    border-style: solid;
+    border-width:  1px;
+    border-radius: 5px;
+    border-color: #0c0c0e;
+    margin-left: 10%;
+    margin-right: 10%;
+    padding-top: 2%;
+    padding-bottom: 2%;
+    z-index: 3;
+    display: block;
+    position: relative;
+}

support_functions.py ADDED Viewed

	@@ -0,0 +1,296 @@

+import pandas as pd
+import difflib
+from spacy.tokens import Doc
+import plotly
+import plotly.graph_objs as go
+from sklearn.manifold import TSNE
+import numpy as np
+class HealthseaSearch:
+    """Query helper over the precomputed Healthsea lookups.
+    Free-text queries are normalised (spaces become underscores) and, when no
+    exact key exists, resolved to the closest key with difflib.
+    """
+    def __init__(self, _health_aspects, _products, _conditions, _benefits):
+        """Store the four lookups.
+        Args:
+            _health_aspects: mapping of aspect key -> record; the methods read
+                its "products", "substance" and "alias" entries.
+            _products: mapping of product id -> record; "name", "rating" and
+                "review_count" are read.
+            _conditions: mapping of condition key -> meta record.
+            _benefits: mapping of benefit key -> meta record.
+        """
+        self.health_aspects = _health_aspects
+        self.products = _products
+        self.conditions = _conditions
+        self.benefits = _benefits
+    def __call__(self, query):
+        """Return the query unchanged."""
+        return query
+    @staticmethod
+    def _resolve_key(_aspect, *lookups):
+        """Return the key in `lookups` matching the free-text `_aspect`.
+        An exact key (after replacing spaces with underscores) wins; otherwise
+        the closest key according to difflib is returned.
+        Raises:
+            KeyError: if no key is similar enough to the query.
+        """
+        key = _aspect.replace(" ", "_")
+        if any(key in lookup for lookup in lookups):
+            return key
+        candidates = [name for lookup in lookups for name in lookup]
+        # Match against the actual query; previously the literal string
+        # "_aspect" was passed, so the fallback never found the typed term.
+        matches = difflib.get_close_matches(key, candidates, n=1)
+        if not matches:
+            raise KeyError(f"No health aspect matches {_aspect!r}")
+        return matches[0]
+    def _collect_scored(self, _aspect, n, field, exclude=(), lookup=None):
+        """Gather (score, item, aspect_key) tuples for an aspect and its aliases.
+        Args:
+            _aspect: free-text aspect name.
+            n: maximum number of results; 0 disables the limit.
+            field: record entry to read ("products" or "substance"); each entry
+                is indexed as entry[0] = score, entry[1] = identifier.
+            exclude: identifiers to drop.
+            lookup: optional callable mapping an identifier to the returned item.
+        Returns:
+            List of tuples sorted by score, highest first.
+        """
+        key = self._resolve_key(_aspect, self.health_aspects)
+        aspect = self.health_aspects[key]
+        collected = []
+        seen = set()
+        # The queried aspect comes first, so its entries win over alias
+        # entries both for de-duplication and for the limit below.
+        for name in [key, *aspect["alias"]]:
+            scoring = self.health_aspects[name][field]
+            if n != 0:
+                scoring = scoring[:n]
+            for entry in scoring:
+                score, ident = entry[0], entry[1]
+                if ident in exclude or ident in seen:
+                    continue
+                seen.add(ident)
+                item = lookup(ident) if lookup is not None else ident
+                collected.append((score, item, name))
+        # NOTE(review): the limit is applied in insertion order before sorting
+        # (kept from the original), so alias entries only fill remaining slots.
+        if n != 0:
+            collected = collected[:n]
+        return sorted(collected, key=lambda item: item[0], reverse=True)
+    # Load product meta
+    def get_products(self, _aspect, n):
+        """Return up to `n` (score, product_record, aspect_key) tuples (n=0: all)."""
+        return self._collect_scored(
+            _aspect, n, "products", lookup=lambda product_id: self.products[product_id]
+        )
+    # Load product meta and return as DataFrame
+    def get_products_df(self, _aspect, n):
+        """Return the result of get_products as a typed pandas DataFrame."""
+        rows = self.get_products(_aspect, n)
+        product_data = {
+            "product": [meta["name"] for _, meta, _ in rows],
+            "score": [score for score, _, _ in rows],
+            "health_aspect": [aspect_key for _, _, aspect_key in rows],
+            "rating": [meta["rating"] for _, meta, _ in rows],
+            "reviews": [meta["review_count"] for _, meta, _ in rows],
+        }
+        datatypes = {
+            "product": str,
+            "score": int,
+            "health_aspect": str,
+            "rating": str,
+            "reviews": int,
+        }
+        return pd.DataFrame(data=product_data).astype(datatypes)
+    # Get health aspect
+    def get_aspect(self, _aspect):
+        """Return the health-aspect record for `_aspect` (exact or closest key)."""
+        return self.health_aspects[self._resolve_key(_aspect, self.health_aspects)]
+    # Get health aspect meta
+    def get_aspect_meta(self, _aspect):
+        """Return the condition or benefit meta record for `_aspect`.
+        Conditions take precedence over benefits for the same key. The fuzzy
+        fallback considers both lookups (it used to consider conditions only).
+        """
+        key = self._resolve_key(_aspect, self.conditions, self.benefits)
+        if key in self.conditions:
+            return self.conditions[key]
+        return self.benefits[key]
+    # Plotting vectors (2D/3D)
+    def tsne_plot(self, dataset):
+        """Create a t-SNE model of labelled vectors and return it as a plotly figure.
+        Args:
+            dataset: sequence of (label, vector) pairs; at least two are needed.
+        Returns:
+            A 3D scatter figure for more than two points, otherwise a 2D one.
+        Raises:
+            ValueError: if fewer than two points are given.
+        """
+        import inspect
+        if len(dataset) < 2:
+            raise ValueError("tsne_plot needs at least two (label, vector) pairs")
+        labels = [item[0] for item in dataset]
+        vectors = np.array([item[1] for item in dataset])
+        n_components = 3 if len(dataset) > 2 else 2
+        tsne_kwargs = {
+            # scikit-learn requires perplexity < n_samples; 40 stays the upper bound.
+            "perplexity": min(40, len(dataset) - 1),
+            "n_components": n_components,
+            "init": "pca",
+            "random_state": 23,
+        }
+        # Newer scikit-learn releases renamed `n_iter` to `max_iter`.
+        if "max_iter" in inspect.signature(TSNE).parameters:
+            tsne_kwargs["max_iter"] = 2500
+        else:
+            tsne_kwargs["n_iter"] = 2500
+        coords = TSNE(**tsne_kwargs).fit_transform(vectors)
+        trace_kwargs = {
+            "x": coords[:, 0],
+            "y": coords[:, 1],
+            "text": labels,
+            "textposition": "top right",
+            "mode": "lines+markers+text",
+            "marker": {
+                "size": 10,
+                "opacity": 0.8,
+            },
+        }
+        if n_components == 3:
+            trace = go.Scatter3d(z=coords[:, 2], **trace_kwargs)
+        else:
+            trace = go.Scatter(**trace_kwargs)
+        # Configure the layout.
+        layout = go.Layout(
+            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
+        )
+        return go.Figure(data=[trace], layout=layout)
+    # Load substance meta
+    def get_substances(self, _aspect, n):
+        """Return up to `n` (score, substance_name, aspect_key) tuples (n=0: all).
+        "sodium", "sugar" and "sugar_alcohol" are always left out.
+        """
+        exclude = ("sodium", "sugar", "sugar_alcohol")
+        return self._collect_scored(_aspect, n, "substance", exclude=exclude)
+    # Load substance meta and return as DataFrame
+    def get_substances_df(self, _aspect, n):
+        """Return the result of get_substances as a typed pandas DataFrame."""
+        rows = self.get_substances(_aspect, n)
+        substance_data = {
+            "substance": [name for _, name, _ in rows],
+            "score": [score for score, _, _ in rows],
+            "health_aspect": [aspect_key for _, _, aspect_key in rows],
+        }
+        datatypes = {"substance": str, "score": int, "health_aspect": str}
+        return pd.DataFrame(data=substance_data).astype(datatypes)
+class HealthseaPipe:
+    """Helper for presenting the custom `doc._.clauses` attribute."""
+    # Get Clauses and their predictions
+    def get_clauses(self, doc):
+        """Rebuild one Doc per clause listed in `doc._.clauses`.
+        For a clause flagged with "has_ent", the token at the entity start is
+        replaced by the clause's "blinder" text (angle brackets removed) and
+        the remaining entity tokens are dropped; other clauses are copied
+        token by token.
+        Returns:
+            List of Doc objects, one per clause, in clause order.
+        """
+        rebuilt = []
+        for info in doc._.clauses:
+            bounds = info["split_indices"]
+            segment = doc[bounds[0] : bounds[1]]
+            if not info["has_ent"]:
+                texts = [tok.text for tok in segment]
+                trailing = [tok.whitespace_ for tok in segment]
+                rebuilt.append(Doc(doc.vocab, words=texts, spaces=trailing))
+                continue
+            texts = []
+            trailing = []
+            for tok in segment:
+                if tok.i == info["ent_indices"][0]:
+                    # First entity token: show the blinder instead of the entity text.
+                    placeholder = info["blinder"].replace(">", "").replace("<", "")
+                    texts.append(placeholder)
+                    trailing.append(True)
+                    continue
+                inside_entity = info["ent_indices"][0] <= tok.i < info["ent_indices"][1]
+                if not inside_entity:
+                    texts.append(tok.text)
+                    trailing.append(tok.whitespace_)
+            rebuilt.append(Doc(doc.vocab, words=texts, spaces=trailing))
+        return rebuilt

visualize_dataset.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import streamlit as st
+from pathlib import Path
+import json
+from support_functions import HealthseaSearch
+def visualize_dataset():
+    """Render the dataset explorer page.
+    Loads the four JSON lookups, shows dataset KPIs, and for the searched
+    health aspect lists the top products and substances together with
+    aspect-level KPIs and, when the aspect has aliases, a t-SNE plot.
+    """
+    # Configuration
+    health_aspect_path = Path("data/health_aspects.json")
+    product_path = Path("data/products.json")
+    condition_path = Path("data/condition_vectors.json")
+    benefit_path = Path("data/benefit_vectors.json")
+    # Displayed review total; an int, because the float literal 933.240
+    # rendered as "933.24".
+    review_count = 933_240
+    # Load data
+    @st.cache(allow_output_mutation=True)
+    def load_data(
+        _health_aspect_path: Path,
+        _product_path: Path,
+        _condition_path: Path,
+        _benefit_path: Path,
+    ):
+        """Parse the four JSON files; cached so Streamlit reruns skip the parse."""
+        with open(_health_aspect_path, encoding="utf-8") as reader:
+            health_aspects = json.load(reader)
+        with open(_product_path, encoding="utf-8") as reader:
+            products = json.load(reader)
+        with open(_condition_path, encoding="utf-8") as reader:
+            conditions = json.load(reader)
+        with open(_benefit_path, encoding="utf-8") as reader:
+            benefits = json.load(reader)
+        return health_aspects, products, conditions, benefits
+    # Functions
+    def kpi(n, text):
+        """Return the HTML of a KPI tile showing value `n` above caption `text`."""
+        html = f"""
+        <div class='kpi'>
+            <h1 class='kpi_header'>{n}</h1>
+            <span>{text}</span>
+        </div>
+        """
+        return html
+    def central_text(text):
+        """Return the HTML of a centered section headline."""
+        html = f"""<h2 class='central_text'>{text}</h2>"""
+        return html
+    # Loading data
+    health_aspects, products, conditions, benefits = load_data(
+        health_aspect_path, product_path, condition_path, benefit_path
+    )
+    search_engine = HealthseaSearch(health_aspects, products, conditions, benefits)
+    # KPI
+    st.markdown("""---""")
+    st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)
+    kpi_products, kpi_reviews, kpi_condition, kpi_benefit = st.columns(4)
+    kpi_products.markdown(kpi(len(products), "Products"), unsafe_allow_html=True)
+    kpi_reviews.markdown(kpi(f"{review_count:,}", "Reviews"), unsafe_allow_html=True)
+    kpi_condition.markdown(kpi(len(conditions), "Conditions"), unsafe_allow_html=True)
+    kpi_benefit.markdown(kpi(len(benefits), "Benefits"), unsafe_allow_html=True)
+    st.markdown("""---""")
+    # Search
+    search = st.text_input(label="Search for an health aspect", value="joint pain")
+    n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
+    # Resolve the query once; a term without any matching aspect ends the page
+    # with a message instead of an unhandled exception.
+    try:
+        aspect = search_engine.get_aspect(search)
+        aspect_meta = search_engine.get_aspect_meta(search)
+    except LookupError:
+        st.warning("No matching health aspect found. Please try another search term.")
+        return
+    st.markdown("""---""")
+    st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)
+    # DataFrame
+    st.write(search_engine.get_products_df(search, n))
+    # KPI & Alias
+    aspect_alias = aspect["alias"]
+    if aspect_alias:
+        kpi_mentions, kpi_product_mentions, kpi_alias = st.columns(3)
+    else:
+        kpi_mentions, kpi_product_mentions = st.columns(2)
+    kpi_mentions.markdown(
+        kpi(aspect_meta["frequency"], "Mentions"),
+        unsafe_allow_html=True,
+    )
+    kpi_product_mentions.markdown(
+        kpi(len(aspect["products"]), "Products"),
+        unsafe_allow_html=True,
+    )
+    if aspect_alias:
+        kpi_alias.markdown(
+            kpi(len(aspect_alias), "Similar health aspects"),
+            unsafe_allow_html=True,
+        )
+        # The searched aspect plus all of its aliases are plotted together.
+        vectors = [(aspect_meta["name"], aspect_meta["vector"])]
+        for alias in aspect_alias:
+            alias_meta = search_engine.get_aspect_meta(alias)
+            vectors.append((alias_meta["name"], alias_meta["vector"]))
+        st.markdown("\n")
+        st.write(search_engine.tsne_plot(vectors))
+    st.markdown("""---""")
+    # Substances
+    st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)
+    # DataFrame
+    st.write(search_engine.get_substances_df(search, n))
+    # The first column stays empty; the KPI tile goes into the second one.
+    _, kpi_substances = st.columns(2)
+    kpi_substances.markdown(
+        kpi(len(aspect["substance"]), "Substances"),
+        unsafe_allow_html=True,
+    )

visualize_pipeline.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import streamlit as st
+import spacy
+from spacy_streamlit import visualize_ner
+from support_functions import HealthseaPipe
+import operator
+def visualize_pipeline():
+    """Render the pipeline explorer page.
+    Runs the "en_healthsea" spaCy pipeline on a typed or predefined review and
+    shows the named entities, the per-clause classification, the aggregated
+    health effects and the raw custom attributes.
+    """
+    from html import escape
+    healthsea_pipe = HealthseaPipe()
+    # Per label: (background color, border/shadow color).
+    color_code = {
+        "POSITIVE": ("#3C9E58", "#1B7735"),
+        "NEGATIVE": ("#FF166A", "#C0094B"),
+        "NEUTRAL": ("#7E7E7E", "#4E4747"),
+        "ANAMNESIS": ("#E49A55", "#AD6B2D"),
+    }
+    example_reviews = [
+        "This is great for joint pain.",
+        "This help joint pain but causes rashes",
+        "I'm diagnosed with gastritis. This product helped!",
+        "Made my insomnia worse",
+        "Didn't help my energy levels",
+    ]
+    # Functions
+    def central_text(text):
+        """Return the HTML of a centered section headline."""
+        html = f"""<h2 class='central_text'>{text}</h2>"""
+        return html
+    def colors_for(pred):
+        """Return the color pair of a label; unknown labels use the neutral colors."""
+        return color_code.get(pred, color_code["NEUTRAL"])
+    def format_clause(text, meta, pred):
+        """Return the HTML of a colored clause box followed by its meta box."""
+        fill, shade = colors_for(pred)
+        # The text derives from the user-written review, so it is escaped
+        # before being rendered with unsafe_allow_html.
+        html = f"""
+        <div>
+            <div class="clause" style="background-color:{fill} ; box-shadow: 0px 5px {shade}; border-color:{shade};">
+                <div class="clause_text">{escape(text)}</div>
+            </div>
+            <div class="clause_meta">
+                <div>{escape(meta)}</div>
+            </div>
+        </div>"""
+        return html
+    def format_effect(text, pred):
+        """Return the HTML of a colored box describing one aggregated effect."""
+        fill, shade = colors_for(pred)
+        html = f"""
+        <div>
+            <div class="clause" style="background-color:{fill} ; box-shadow: 0px 5px {shade}; border-color:{shade};">
+                <div class="clause_text">{escape(text)}</div>
+            </div>
+        </div>"""
+        return html
+    # Load model
+    @st.cache(allow_output_mutation=True)
+    def load_pipeline():
+        """Load the spaCy pipeline once; Streamlit reruns reuse the cached object."""
+        return spacy.load("en_healthsea")
+    nlp = load_pipeline()
+    # Pipeline
+    st.markdown("""---""")
+    st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)
+    check = st.checkbox("Use predefined examples")
+    if not check:
+        text = st.text_input(label="Write a review", value="This is great for joint pain!")
+    else:
+        text = st.selectbox("Predefined example reviews", example_reviews)
+    doc = nlp(text)
+    # NER
+    visualize_ner(
+        doc,
+        labels=nlp.get_pipe("ner").labels,
+        show_table=False,
+        title="✨ Named Entity Recognition",
+        colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
+    )
+    st.markdown("""---""")
+    # Segmentation, Blinding, Classification
+    st.markdown("## 🔮 Segmentation, Blinding, Classification")
+    clauses = healthsea_pipe.get_clauses(doc)
+    for doc_clause, clause in zip(clauses, doc._.clauses):
+        # The category with the highest score is the predicted class.
+        classification = max(clause["cats"].items(), key=operator.itemgetter(1))[0]
+        percentage = round(float(clause["cats"][classification]) * 100, 2)
+        meta = f"{clause['ent_name']} ({classification} {percentage}%)"
+        st.markdown(
+            format_clause(doc_clause.text, meta, classification), unsafe_allow_html=True
+        )
+        st.markdown("\n")
+    st.markdown("""---""")
+    # Aggregation
+    st.markdown("## 🔗 Aggregation")
+    health_effects = doc._.health_effects
+    for effect in health_effects:
+        effect_label = health_effects[effect]["effect"]
+        st.markdown(
+            format_effect(f"{effect_label} effect on {effect}", effect_label),
+            unsafe_allow_html=True,
+        )
+        st.markdown("\n")
+    st.markdown("""---""")
+    # Indepth
+    st.markdown("## 🔧 Pipeline attributes")
+    clauses_col, effect_col = st.columns(2)
+    clauses_col.markdown("### doc._.clauses")
+    for clause in doc._.clauses:
+        clauses_col.json(clause)
+    effect_col.markdown("### doc._.health_effects")
+    effect_col.json(health_effects)