Commit 5ab63e8 · stephenleo committed · 1 parent: 0f69fb7

adding blog post

Files changed:
- app.py (+16, -18)
- helpers.py (+1, -1)
- markdown/about_me.md (+15, -0, new file)
- markdown/about_stripnet.md (+4, -0, new file)
- markdown/centrality.md (+10, -0, new file)
- markdown/load_data.md (+5, -0, new file)
- markdown/stripnet.md (+14, -0, new file)
- markdown/topic_modeling.md (+6, -0, new file)
app.py CHANGED

@@ -23,6 +23,8 @@ def load_data():
     """
 
     st.header('Load Data')
+    read_md('markdown/load_data.md')
+
     uploaded_file = st.file_uploader("Choose a CSV file",
                                      help='Upload a CSV file with the following columns: Title, Abstract')
 
@@ -49,7 +51,7 @@ def load_data():
        st.write(f'Number of rows: {len(data)}')
        if len(data) > 500:
            data = data.sample(500, random_state=0)
-           st.write(…
+           st.write('Only random 500 rows will be analyzed')
 
    data = data.reset_index(drop=True)
 
@@ -71,6 +73,8 @@ def topic_modeling(data):
     """
 
     st.header('🔥 Topic Modeling')
+    read_md('markdown/topic_modeling.md')
+
     cols = st.columns(3)
     with cols[0]:
         min_topic_size = st.slider('Minimum topic size', key='min_topic_size', min_value=2,
@@ -118,6 +122,7 @@ def strip_network(data, topic_data, topics):
     """
 
     st.header('STriP Network')
+    read_md('markdown/stripnet.md')
 
     with st.spinner('Cosine Similarity Calculation'):
         cosine_sim_matrix = helpers.cosine_sim(data)
@@ -175,6 +180,7 @@ def network_centrality(nx_net, topic_data):
     """
 
     st.header('Most Important Papers')
+    read_md('markdown/centrality.md')
 
     centrality_mapping = {
         'Betweenness Centrality': nx.betweenness_centrality,
@@ -199,26 +205,18 @@ def network_centrality(nx_net, topic_data):
     st.plotly_chart(fig, use_container_width=True)
 
 
-def about_me():
-    …
-    If you like this work, please consider ❤️ this HugginFace Space and ⭐ this Github repo [Link](https://github.com/stephenleo/stripnet)
-
-    👨‍🔬 Author: Marie Stephen Leo
-
-    💻 Github: [stephenleo](https://github.com/stephenleo)
-
-    Linkedin: [Marie Stephen Leo](https://www.linkedin.com/in/marie-stephen-leo/)
 
-
-    """
-    )
+def read_md(file_path):
+    """Reads a markdown file and returns the contents as a streamlit markdown component.
+    """
+    with open(file_path, 'r') as f:
+        content = st.markdown(f.read())
 
+    return content
 
 
 def main():
-    st.title('…
+    st.title('STriPNet: Semantic Similarity of Scientific Papers!')
+    read_md('markdown/about_stripnet.md')
 
     logger.info('========== Step1: Loading data ==========')
     data, selected_cols = load_data()
@@ -233,7 +231,7 @@ def main():
     logger.info('========== Step4: Network Centrality ==========')
     network_centrality(nx_net, topic_data)
 
-    about_me()
+    read_md('markdown/about_me.md')
 
 
 if __name__ == '__main__':
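Note: the `helpers.cosine_sim(data)` call in the STriP Network hunk is not defined in this diff. Based on the description in markdown/stripnet.md below (SPECTER embeddings compared by cosine similarity), a minimal sketch of what such a function might look like is given here; the signature and model wiring are assumptions, not the actual helpers.py code.

```python
# Hypothetical sketch of helpers.cosine_sim: SPECTER embeddings + pairwise cosine similarity.
from sentence_transformers import SentenceTransformer, util


def cosine_sim(texts):
    """Return the pairwise cosine similarity matrix for a list of paper texts."""
    model = SentenceTransformer('sentence-transformers/allenai-specter')
    embeddings = model.encode(texts, convert_to_tensor=True)
    # util.cos_sim returns an (n x n) tensor of pairwise cosine similarities
    return util.cos_sim(embeddings, embeddings).cpu().numpy()
```

The full pairwise matrix is what the thresholding step in `strip_network` would consume.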
helpers.py CHANGED

@@ -149,7 +149,7 @@ def network_plot(topic_data, topics, neighbors):
                 {
                     'group': row.Topic,
                     'label': row.Index,
-                    'title': row.Text,
+                    'title': text_processing(row.Text),
                     'size': 20, 'font': {'size': 20, 'color': 'white'}
                 }
             )
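The diff only changes the call site; `text_processing` itself is defined elsewhere in helpers.py and its body is not shown. Since the node `title` ends up as the PyVis hover tooltip (see markdown/stripnet.md below), a purely hypothetical sketch of such a helper might be:

```python
import textwrap


def text_processing(text, width=50, max_chars=500):
    """Hypothetical helper: truncate and wrap a paper's text so the
    hover tooltip stays readable. Not the actual helpers.py implementation."""
    text = str(text)[:max_chars]
    return '<br>'.join(textwrap.wrap(text, width=width))
```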
markdown/about_me.md ADDED

---

💡🔥 STriPNet v1.0 🔥💡

If you like this work, please consider ❤️ this Hugging Face Space and ⭐ the STriPNet Github repo [Link](https://github.com/stephenleo/stripnet)

👨‍🔬 Author: Marie Stephen Leo

💻 Github: [stephenleo](https://github.com/stephenleo)

Linkedin: [Marie Stephen Leo](https://www.linkedin.com/in/marie-stephen-leo/)

Medium: [@stephen-leo](https://stephen-leo.medium.com/)

---
markdown/about_stripnet.md ADDED

- Do you read a lot of Scientific Papers?
- Have you ever wondered what the overarching themes in the papers you've read are, and how all the papers are semantically connected to one another?
- **Look no further!!!**
- Leverage the power of NLP Topic Modeling, Semantic Similarity, and Network analysis to study the themes and semantic relations within a corpus of research papers.
markdown/centrality.md ADDED

- Now that we've constructed our semantic similarity network, we can use Graph Theory on this network to compute the centrality of each node using [NetworkX](https://networkx.org/).
- The nodes with the highest centrality are the most important nodes (papers) in our semantic network. [Centrality Wiki](https://en.wikipedia.org/wiki/Centrality)
- There are several types of centrality depending on the problem statement [Link](https://neo4j.com/developer/graph-data-science/centrality-graph-algorithms/).
- Since our goal is to find the most important papers that connect all the other papers in our corpus, we use [Betweenness Centrality](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.betweenness_centrality.html#networkx.algorithms.centrality.betweenness_centrality) by default. Feel free to test out other centrality algorithms using the `Select Centrality Measure` dropdown.
- The plot is generated using [Plotly](https://plotly.com/python/) and is fully interactive!
- Bars are ordered in descending order, so the most important paper in your corpus is the top bar.
- Bars are colored by the topic the paper belongs to.
- Hover over any bar to view the title and truncated abstract of the paper.
- Zoom in and out by clicking and dragging over the chart. Double-click to zoom out.
- Click the `Download plot as png` button in the top right-hand corner to download a .png file of your plot.
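As a rough illustration of the centrality step described above (not the actual app.py code; the graph and title inputs and the figure styling are simplified assumptions):

```python
import networkx as nx
import plotly.express as px


def plot_most_important_papers(nx_net, titles):
    """Rank nodes (papers) by betweenness centrality and plot a horizontal bar chart."""
    # Betweenness centrality: how often a node lies on shortest paths between other nodes
    centrality = nx.betweenness_centrality(nx_net)
    # Sort ascending so the most central paper ends up as the top bar in Plotly
    ranked = sorted(centrality.items(), key=lambda kv: kv[1])
    fig = px.bar(
        x=[score for _, score in ranked],
        y=[titles[node] for node, _ in ranked],
        orientation='h',
        labels={'x': 'Betweenness Centrality', 'y': 'Paper'},
    )
    return fig
```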
markdown/load_data.md ADDED

- STriPNet only accepts a `csv` file.
- Though STriPNet can work with any text column, it works best when the two columns `Title` and `Abstract` (case insensitive) are provided.
- STriPNet automatically selects any columns called `title` and `abstract` (case insensitive) that are present in your uploaded csv file. If your file doesn't contain any such columns, you will need to manually select the text column(s) on which to run the analysis.
- If you select multiple columns, STriPNet internally concatenates the columns with a `[SEP]` keyword.
- Finally, to keep resource usage down, this 🤗 Huggingface Spaces app will internally restrict the analysis to a random sample of 500 rows of the data that you upload. If you want to analyse more data, please check out (and ⭐) the [STriPNet Github repo](https://github.com/stephenleo/stripnet) to run it locally! You can also directly download this HF Spaces app and run it locally.
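A minimal sketch of the column selection, `[SEP]` concatenation, and 500-row cap described above; the function names and column handling are illustrative assumptions rather than the exact app code:

```python
import pandas as pd


def prepare_text(df: pd.DataFrame, selected_cols=None) -> pd.Series:
    """Pick Title/Abstract-like columns (case insensitive) and join them with [SEP]."""
    if selected_cols is None:
        selected_cols = [c for c in df.columns if c.lower() in ('title', 'abstract')]
    # Concatenate the selected text columns row-wise with the [SEP] keyword
    return df[selected_cols].astype(str).agg(' [SEP] '.join, axis=1)


def cap_rows(df: pd.DataFrame, max_rows: int = 500) -> pd.DataFrame:
    """Keep resource usage down: analyze at most a random sample of max_rows rows."""
    if len(df) > max_rows:
        df = df.sample(max_rows, random_state=0)
    return df.reset_index(drop=True)
```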
markdown/stripnet.md ADDED

- STriP Network plots the network of semantically similar papers and shows how they are connected to each other.
- A network is a graph with nodes and edges. Each node here is one paper, and the edges show the similarity between papers.
- Similarity is calculated as the cosine similarity between the [SPECTER](https://github.com/allenai/specter) embeddings of the two papers; cosine similarity measures the angle between two vectors.
- The `Cosine Similarity Threshold` parameter controls how similar two papers need to be to get connected by an edge. Hover over the tooltip to get more information about it.
- The default value of the Cosine Similarity Threshold is heuristically calculated by STriPNet to give decent results. Feel free to play around with it until you are satisfied.
- The plot is generated using [PyVis](https://github.com/WestHealth/pyvis) with some customizations to [VisJS](https://visjs.github.io/vis-network/docs/network/) and is fully interactive!
- Hover over a node to see the paper's title and truncated abstract.
- Click on a node to see the edges that connect that node.
- Zoom in and out by scrolling.
- Click on any empty space and drag the plot to move and recenter it.
- Click on nodes and move them around to view them better.
- Click on the legend boxes and move them around. I like to place the legend boxes over the cluster of nodes of the same color!
- The number on a node is the row number of that paper in the input csv file. Remember that Python row numbers start at 0.
- Once you are happy with how your STriP Network plot looks, right-click and save the image locally.
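To make the thresholding step concrete, here is a simplified sketch that connects papers whose cosine similarity exceeds the threshold and hands the graph to PyVis. The threshold default and the use of networkx as an intermediate representation are assumptions, not necessarily how helpers.py does it:

```python
import networkx as nx
from pyvis.network import Network


def build_strip_network(cosine_sim_matrix, threshold=0.8):
    """Connect papers (nodes) whose pairwise cosine similarity is above the threshold."""
    n = len(cosine_sim_matrix)
    graph = nx.Graph()
    graph.add_nodes_from(range(n))
    for i in range(n):
        for j in range(i + 1, n):
            if cosine_sim_matrix[i][j] >= threshold:
                graph.add_edge(i, j, weight=float(cosine_sim_matrix[i][j]))

    # Hand the networkx graph to PyVis for the interactive plot
    net = Network(height='700px', width='100%')
    net.from_nx(graph)
    return net
```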
markdown/topic_modeling.md ADDED

- The first step in STriPNet is to run Topic Modeling using the [BERTopic](https://github.com/MaartenGr/BERTopic) library.
- BERTopic internally uses [Sentence Transformer](https://www.sbert.net/) models to convert text to embeddings, cluster them, and extract keywords from each cluster.
- Specifically, since STriPNet is intended to be used with scientific papers, we're using the [SPECTER](https://github.com/allenai/specter) pretrained sentence-transformers model by Allen AI.
- The `Minimum topic size` and `N-gram range` parameters control BERTopic's clustering and keyword extraction, respectively. Hover over the tooltip of each parameter to get more information about it. STriPNet internally chooses heuristically tuned defaults depending on the data you've uploaded. Feel free to play around with the parameters until you get good topics.
- You can visualize the quality of the topic modeling in the various ways provided by the `Select Topic Modeling Visualization` dropdown menu.
- Finally, please note that BERTopic results can change between runs, so the topics extracted might differ every time you run STriPNet, even on the same data with the same settings. If your topics look off, a simple page refresh or increasing `Minimum topic size` by 1 might fix it!
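A minimal sketch of this topic modeling step; the parameter defaults below are placeholders for the slider values the app passes in:

```python
from bertopic import BERTopic


def run_topic_modeling(docs, min_topic_size=10, ngram_range=(1, 2)):
    """Fit BERTopic on the paper texts using the SPECTER sentence-transformers model."""
    topic_model = BERTopic(
        embedding_model='sentence-transformers/allenai-specter',  # SPECTER embeddings
        min_topic_size=min_topic_size,
        n_gram_range=ngram_range,
    )
    topics, probs = topic_model.fit_transform(docs)
    return topic_model, topics, probs


# Usage: topic_model, topics, probs = run_topic_modeling(list_of_paper_texts)
# topic_model.get_topic_info() then shows the discovered topics and their top keywords.
```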