stephenleo committed on
Commit
5ab63e8
·
1 Parent(s): 0f69fb7

adding blog post

Browse files
app.py CHANGED
@@ -23,6 +23,8 @@ def load_data():
23
  """
24
 
25
  st.header('πŸ“‚ Load Data')
 
 
26
  uploaded_file = st.file_uploader("Choose a CSV file",
27
  help='Upload a CSV file with the following columns: Title, Abstract')
28
 
@@ -49,7 +51,7 @@ def load_data():
49
  st.write(f'Number of rows: {len(data)}')
50
  if len(data) > 500:
51
  data = data.sample(500, random_state=0)
52
- st.write(f'Only random 500 rows will be analyzed')
53
 
54
  data = data.reset_index(drop=True)
55
 
@@ -71,6 +73,8 @@ def topic_modeling(data):
71
  """
72
 
73
  st.header('πŸ”₯ Topic Modeling')
 
 
74
  cols = st.columns(3)
75
  with cols[0]:
76
  min_topic_size = st.slider('Minimum topic size', key='min_topic_size', min_value=2,
@@ -118,6 +122,7 @@ def strip_network(data, topic_data, topics):
118
  """
119
 
120
  st.header('πŸš€ STriP Network')
 
121
 
122
  with st.spinner('Cosine Similarity Calculation'):
123
  cosine_sim_matrix = helpers.cosine_sim(data)
@@ -175,6 +180,7 @@ def network_centrality(nx_net, topic_data):
175
  """
176
 
177
  st.header('πŸ… Most Important Papers')
 
178
 
179
  centrality_mapping = {
180
  'Betweenness Centrality': nx.betweenness_centrality,
@@ -199,26 +205,18 @@ def network_centrality(nx_net, topic_data):
199
  st.plotly_chart(fig, use_container_width=True)
200
 
201
 
202
- def about_me():
203
- st.markdown(
204
- """
205
- πŸ’‘πŸ”₯πŸš€ STriP v1.0 πŸš€πŸ”₯πŸ’‘
206
-
207
- If you like this work, please consider ❀️ this HugginFace Space and ⭐ this Github repo [Link](https://github.com/stephenleo/stripnet)
208
-
209
- πŸ‘¨β€πŸ”¬ Author: Marie Stephen Leo
210
-
211
- πŸ’» Github: [stephenleo](https://github.com/stephenleo)
212
-
213
- πŸ‘” Linkedin: [Marie Stephen Leo](https://www.linkedin.com/in/marie-stephen-leo/)
214
 
215
- πŸ“ Medium: [@stephen-leo](https://stephen-leo.medium.com/)
216
- """
217
- )
218
 
219
 
220
  def main():
221
- st.title('STriP (S3P): Semantic Similarity of Scientific Papers!')
 
222
 
223
  logger.info('========== Step1: Loading data ==========')
224
  data, selected_cols = load_data()
@@ -233,7 +231,7 @@ def main():
233
  logger.info('========== Step4: Network Centrality ==========')
234
  network_centrality(nx_net, topic_data)
235
 
236
- about_me()
237
 
238
 
239
  if __name__ == '__main__':
 
23
  """
24
 
25
  st.header('πŸ“‚ Load Data')
26
+ read_md('markdown/load_data.md')
27
+
28
  uploaded_file = st.file_uploader("Choose a CSV file",
29
  help='Upload a CSV file with the following columns: Title, Abstract')
30
 
 
51
  st.write(f'Number of rows: {len(data)}')
52
  if len(data) > 500:
53
  data = data.sample(500, random_state=0)
54
+ st.write('Only random 500 rows will be analyzed')
55
 
56
  data = data.reset_index(drop=True)
57
 
 
73
  """
74
 
75
  st.header('πŸ”₯ Topic Modeling')
76
+ read_md('markdown/topic_modeling.md')
77
+
78
  cols = st.columns(3)
79
  with cols[0]:
80
  min_topic_size = st.slider('Minimum topic size', key='min_topic_size', min_value=2,
 
122
  """
123
 
124
  st.header('πŸš€ STriP Network')
125
+ read_md('markdown/stripnet.md')
126
 
127
  with st.spinner('Cosine Similarity Calculation'):
128
  cosine_sim_matrix = helpers.cosine_sim(data)
 
180
  """
181
 
182
  st.header('πŸ… Most Important Papers')
183
+ read_md('markdown/centrality.md')
184
 
185
  centrality_mapping = {
186
  'Betweenness Centrality': nx.betweenness_centrality,
 
205
  st.plotly_chart(fig, use_container_width=True)
206
 
207
 
208
+ def read_md(file_path):
209
+ """Reads a markdown file and returns the contents as a streamlit markdown component.
210
+ """
211
+ with open(file_path, 'r') as f:
212
+ content = st.markdown(f.read())
 
 
 
 
 
 
 
213
 
214
+ return content
 
 
215
 
216
 
217
  def main():
218
+ st.title('STriPNet: Semantic Similarity of Scientific Papers!')
219
+ read_md('markdown/about_stripnet.md')
220
 
221
  logger.info('========== Step1: Loading data ==========')
222
  data, selected_cols = load_data()
 
231
  logger.info('========== Step4: Network Centrality ==========')
232
  network_centrality(nx_net, topic_data)
233
 
234
+ read_md('markdown/about_me.md')
235
 
236
 
237
  if __name__ == '__main__':
helpers.py CHANGED
@@ -149,7 +149,7 @@ def network_plot(topic_data, topics, neighbors):
149
  {
150
  'group': row.Topic,
151
  'label': row.Index,
152
- 'title': row.Text,
153
  'size': 20, 'font': {'size': 20, 'color': 'white'}
154
  }
155
  )
 
149
  {
150
  'group': row.Topic,
151
  'label': row.Index,
152
+ 'title': text_processing(row.Text),
153
  'size': 20, 'font': {'size': 20, 'color': 'white'}
154
  }
155
  )
markdown/about_me.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+
3
+ 💡🔥🚀 STriPNet v1.0 🚀🔥💡
4
+
5
+ If you like this work, please consider ❤️ this Hugging Face Space and 🌟 the STriPNet Github repo [Link](https://github.com/stephenleo/stripnet)
6
+
7
+ 👨‍🔬 Author: Marie Stephen Leo
8
+
9
+ 💻 Github: [stephenleo](https://github.com/stephenleo)
10
+
11
+ 👔 Linkedin: [Marie Stephen Leo](https://www.linkedin.com/in/marie-stephen-leo/)
12
+
13
+ 📝 Medium: [@stephen-leo](https://stephen-leo.medium.com/)
14
+
15
+ ---
markdown/about_stripnet.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ - Do you read a lot of Scientific Papers?
2
+ - Have you ever wondered what the overarching themes are in the papers that you've read, and how all the papers are semantically connected to one another?
3
+ - **Look no further!!!**
4
+ - Leverage the power of NLP Topic Modeling, Semantic Similarity, and Network analysis to study the themes and semantic relations within a corpus of research papers.
markdown/centrality.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ - Now that we've constructed our semantic similarity network, we can use Graph Theory on this network to compute the centrality of each node using [NetworkX](https://networkx.org/).
2
+ - The nodes with the highest centrality will be the most important nodes (papers) in our semantic network. [Centrality Wiki](https://en.wikipedia.org/wiki/Centrality)
3
+ - There are several types of centrality depending on the problem statement [Link](https://neo4j.com/developer/graph-data-science/centrality-graph-algorithms/).
4
+ - Since our goal is to find the most important papers that connect all the other papers in our corpus, we'll use [Betweenness Centrality](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.betweenness_centrality.html#networkx.algorithms.centrality.betweenness_centrality) by default. Feel free to test out other centrality algorithms using the `Select Centrality Measure` dropdown.
5
+ - The plot is generated using [Plotly](https://plotly.com/python/) and is fully interactive!
6
+ - Bars are ordered in descending order, so the most important paper in your corpus is the top bar.
7
+ - Colored by the topic the paper belongs to.
8
+ - Hover over any bar to view the title and truncated abstract of the paper.
9
+ - Zoom in and out by clicking and dragging over the chart. Double click to zoom out.
10
+ - Click on the `Download plot as png` button in the top right-hand corner to download a .png file of your plot.
markdown/load_data.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ - STriPNet only accepts a `csv` file.
2
+ - Though STriPNet can work with any text column, it works best when two columns `Title` and `Abstract` (case insensitive) are provided.
3
+ - STriPNet automatically selects any columns called `title` or `abstract` (case insensitive) that are present in your uploaded csv file. If your file doesn't contain any such column, you will need to manually select the text column(s) on which to run the analysis.
4
+ - If you select multiple columns, STriPNet internally concatenates the columns with a `[SEP]` keyword.
5
+ - Finally, to keep resource usage down, this 🤗 Hugging Face Spaces app will internally restrict the analysis to a random sample of 500 rows from the data that you upload. If you want to analyse more data, please check out (and 🌟 😄) the [STriPNet Github repo](https://github.com/stephenleo/stripnet) to run it on your local machine! You can also directly download this HF Spaces app and run it locally.
markdown/stripnet.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - STriP Network plots the network of the semantically similar papers and how they are connected to each other.
2
+ - A network is a graph with nodes and edges. Each node here is one paper and the edges show the similarity between papers.
3
+ - Similarity is calculated by the cosine similarity between the [SPECTER](https://github.com/allenai/specter) embeddings of the two papers. The cosine similarity is a measure of the angle between two vectors.
4
+ - The `Cosine Similarity Threshold` parameter controls how similar two papers need to be to get connected by an edge. Hover over the tooltip to get more information about it.
5
+ - The default value of the Cosine Similarity Threshold is heuristically calculated by STriPNet to give decent results. Feel free to play around with it until you are satisfied.
6
+ - The plot is generated using [PyVis](https://github.com/WestHealth/pyvis) with some customizations to [VisJS](https://visjs.github.io/vis-network/docs/network/) and is fully interactive!
7
+ - Hover on a node to see the paper's title and truncated abstract.
8
+ - Click on a node to see the edges that connect that node.
9
+ - Zoom in and out by scrolling.
10
+ - Click on any empty space and drag the plot to move and recenter it.
11
+ - Click on nodes and move them around to view them better.
12
+ - Click on the legend boxes and move them around. I like to place the legend boxes over the cluster of nodes of the same color!
13
+ - The number on the node is the row number of the input csv file on which this paper was located. Remember that Python row numbers start from 0.
14
+ - Once you are happy with how your STriP Network plot looks, right click and save the image to your local machine.
markdown/topic_modeling.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ - The first step in STriPNet is to run Topic Modeling using the [BERTopic](https://github.com/MaartenGr/BERTopic) library.
2
+ - BERTopic internally uses [Sentence Transformer](https://www.sbert.net/) models to convert text to embeddings, clusters them and extracts keywords from each cluster.
3
+ - Specifically, since STriPNet is intended to be used with scientific papers, we're using the [SPECTER](https://github.com/allenai/specter) pretrained sentence transformers model by Allen AI.
4
+ - The `Minimum topic size` and `N-gram range` parameters control the clustering and keyword extraction of BERTopic respectively. Hover over the tooltip of each parameter to get more information about them. STriPNet internally chooses some heuristically tuned parameters depending on the data you've uploaded. Feel free to play around with the parameters until you get good topics.
5
+ - You can visualize the quality of the topic modeling in various ways provided by the dropdown menu `Select Topic Modeling Visualization`.
6
+ - Finally, please take note that BERTopic results change with every run so the topics extracted might change every time you run STriPNet even on the same data with the same settings. If your topics look weird, a simple page refresh or a $+1$ increase to `Minimum topic size` might fix it!