diegomrodrigues committed on
Commit
ed238a0
·
verified ·
1 Parent(s): 207f469

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +163 -171
app.py CHANGED
@@ -8,155 +8,120 @@ from bertopic import BERTopic
8
  from bertopic.representation import KeyBERTInspired
9
  from umap import UMAP
10
  import numpy as np
 
11
  from collections import defaultdict
12
 
 
13
  class CustomArxivLoader(ArxivLoader):
14
- def __init__(self, **kwargs):
15
- super().__init__(**kwargs)
16
-
17
  def lazy_load(self) -> Iterator[Document]:
18
  documents = super().lazy_load()
19
-
20
- def update_metadata(documents):
21
- for document in documents:
22
- yield Document(
23
- page_content=document.page_content,
24
- metadata={
25
- **document.metadata,
26
- "ArxivId": self.query,
27
- "Source": f"https://arxiv.org/pdf/{self.query}.pdf"
28
- }
29
- )
30
-
31
- return update_metadata(documents)
32
-
33
- def upload_file(file):
34
- if not ".json" in file.name:
35
- return "Not Allowed"
36
-
37
- print(f"Processing file: {file.name}")
38
-
39
- with open(file.name, "r") as f:
40
  results = json.load(f)
41
-
42
  arxiv_urls = results["collected_urls"]["arxiv.org"]
43
-
44
- print(f"Collected {len(arxiv_urls)} arxiv urls from file.")
45
-
46
- arxiv_ids = map(lambda url: url.split("/")[-1].strip(".pdf"), arxiv_urls)
47
-
48
- all_loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
49
-
50
- merged_loader = MergedDataLoader(loaders=all_loaders)
51
-
52
- documents = merged_loader.load()
53
-
54
- print(f"Loaded {len(documents)} documents from file.")
55
-
56
- return documents
57
-
58
- def process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
59
- if not documents:
60
- return "No documents to process. Please upload a file first."
61
 
62
- contents = [doc.page_content for doc in documents]
 
 
 
63
 
 
 
 
64
  representation_model = KeyBERTInspired()
65
-
66
- umap_model = UMAP(
67
- n_neighbors=umap_n_neighbors,
68
- n_components=umap_n_components,
69
- min_dist=umap_min_dist,
70
- metric='cosine'
71
- )
72
-
73
- topic_model = BERTopic(
74
  language="english",
75
  verbose=True,
76
  umap_model=umap_model,
77
- min_topic_size=min_topic_size,
78
  representation_model=representation_model,
79
- nr_topics=nr_topics
80
  )
81
 
 
 
82
  topics, _ = topic_model.fit_transform(contents)
83
-
84
  topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
 
 
85
 
86
- print(f"Generated {len(topic_labels)} topics from data.")
87
- print("Topic Labels: ", topic_labels)
88
-
89
- return documents, topics.tolist() if isinstance(topics, np.ndarray) else topics, topic_labels
90
-
91
  def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
92
- if not documents:
93
- return []
94
- results = []
95
- for i, (doc, topic) in enumerate(zip(documents, topics)):
96
- label = labels[topic]
97
- results.append([str(i), label, doc.metadata['Title']])
98
- return results
99
 
100
  def get_unique_topics(labels: List[str]) -> List[str]:
101
- return list(set(labels))
102
-
103
- def remove_topics(documents: List[Document], topics: List[int], labels: List[str], topics_to_remove: List[str]) -> tuple:
104
- new_documents = []
105
- new_topics = []
106
- new_labels = []
107
-
108
- for doc, topic, label in zip(documents, topics, labels):
109
- if label not in topics_to_remove:
110
- new_documents.append(doc)
111
- new_topics.append(topic)
112
- new_labels.append(label)
113
-
114
- return new_documents, new_topics, new_labels
115
-
116
- def create_markdown_content(documents: List[Document], labels: List[str]) -> str:
117
  if not documents or not labels:
118
  return "No data available for download."
119
 
120
  topic_documents = defaultdict(list)
121
- for doc, label in zip(documents, labels):
 
122
  topic_documents[label].append(doc)
123
 
124
- full_text = "# Arxiv Articles by Topic\n\n"
125
-
126
  for topic, docs in topic_documents.items():
127
- full_text += f"## {topic}\n\n"
128
-
129
  for document in docs:
130
- full_text += f"### {document.metadata['Title']}\n\n"
131
- full_text += f"{document.metadata['Summary']}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- return full_text
134
-
135
- with gr.Blocks(theme="default") as demo:
136
- gr.Markdown("# Bert Topic Article Organizer App")
137
- gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")
138
-
139
- state = gr.State(value=[])
140
-
141
- with gr.Row():
142
- file_uploader = gr.UploadButton(
143
- "Click to upload",
144
- file_types=["json"],
145
- file_count="single"
146
- )
147
- reprocess_button = gr.Button("Reprocess Documents")
148
- download_button = gr.Button("Download Results")
149
-
150
- with gr.Row():
151
- with gr.Column():
152
- umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
153
- umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
154
- umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
155
- with gr.Column():
156
- min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
157
- nr_topics = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic nr_topics")
158
-
159
- with gr.Row():
160
  output_matrix = gr.DataFrame(
161
  label="Processing Result",
162
  headers=["ID", "Topic", "Title"],
@@ -164,63 +129,90 @@ with gr.Blocks(theme="default") as demo:
164
  interactive=False
165
  )
166
 
167
- with gr.Row():
168
- topic_dropdown = gr.Dropdown(
169
- label="Select Topics to Remove",
170
- multiselect=True,
171
- interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  )
173
- remove_topics_button = gr.Button("Remove Selected Topics")
174
-
175
- markdown_output = gr.File(label="Download Markdown", visible=False)
176
-
177
- def update_ui(documents, topics, labels):
178
- matrix = create_docs_matrix(documents, topics, labels)
179
- unique_topics = get_unique_topics(labels)
180
- return matrix, unique_topics
181
-
182
- def process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
183
- documents = state if state else []
184
- new_documents, new_topics, new_labels = process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
185
- matrix, unique_topics = update_ui(new_documents, new_topics, new_labels)
186
- return [new_documents, new_topics, new_labels], matrix, unique_topics
187
-
188
- file_uploader.upload(
189
- fn=lambda file: upload_file(file),
190
- inputs=[file_uploader],
191
- outputs=[state]
192
- ).then(
193
- fn=process_and_update,
194
- inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
195
- outputs=[state, output_matrix, topic_dropdown]
196
- )
197
 
198
- reprocess_button.click(
199
- fn=process_and_update,
200
- inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
201
- outputs=[state, output_matrix, topic_dropdown]
202
- )
 
203
 
204
- def remove_and_update(state, topics_to_remove, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
205
- documents, topics, labels = state
206
- new_documents, new_topics, new_labels = remove_topics(documents, topics, labels, topics_to_remove)
207
- return process_and_update([new_documents, new_topics, new_labels], umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
 
 
 
 
 
 
 
 
 
208
 
209
- remove_topics_button.click(
210
- fn=remove_and_update,
211
- inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
212
- outputs=[state, output_matrix, topic_dropdown]
213
- )
214
 
215
- def create_download_file(state):
216
- documents, _, labels = state
217
- content = create_markdown_content(documents, labels)
218
- return gr.File(value=content, visible=True, filename="arxiv_articles_by_topic.md")
 
219
 
220
- download_button.click(
221
- fn=create_download_file,
222
- inputs=[state],
223
- outputs=[markdown_output]
224
- )
225
 
226
- demo.launch(share=True, show_error=True, max_threads=10, debug=True)
 
 
 
8
  from bertopic.representation import KeyBERTInspired
9
  from umap import UMAP
10
  import numpy as np
11
+ import tempfile
12
  from collections import defaultdict
13
 
14
+ # 1. Data Loading
15
class CustomArxivLoader(ArxivLoader):
    """ArxivLoader that stamps every yielded document with its arXiv id and PDF URL."""

    def lazy_load(self) -> Iterator[Document]:
        """Yield the base loader's documents, augmented with ArxivId/Source metadata.

        The id comes from ``self.query`` (the arXiv id the loader was built with).
        """
        for original in super().lazy_load():
            stamped = {
                **original.metadata,
                "ArxivId": self.query,
                "Source": f"https://arxiv.org/pdf/{self.query}.pdf",
            }
            yield Document(page_content=original.page_content, metadata=stamped)
27
+
28
def load_documents_from_file(file_path: str) -> List[Document]:
    """Load all arXiv documents listed in a crawler-results JSON file.

    The file is expected to contain ``{"collected_urls": {"arxiv.org": [url, ...]}}``
    where each URL ends in the paper id, optionally with a ``.pdf`` suffix.
    Raises KeyError if that structure is missing.
    """
    with open(file_path, "r") as f:
        results = json.load(f)

    arxiv_urls = results["collected_urls"]["arxiv.org"]
    # BUG FIX: str.strip(".pdf") strips the *character set* {'.', 'p', 'd', 'f'}
    # from both ends, corrupting ids that start or end with those characters.
    # Remove the exact ".pdf" suffix instead.
    arxiv_ids = [_extract_arxiv_id(url) for url in arxiv_urls]

    loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
    merged_loader = MergedDataLoader(loaders=loaders)

    return merged_loader.load()


def _extract_arxiv_id(url: str) -> str:
    """Return the arXiv id from a URL, dropping one trailing '.pdf' if present."""
    tail = url.split("/")[-1]
    return tail[: -len(".pdf")] if tail.endswith(".pdf") else tail
39
 
40
# 2. Topic Modeling
def create_topic_model(umap_params: Dict, bertopic_params: Dict) -> BERTopic:
    """Build a BERTopic model with a KeyBERT-inspired representation.

    Args:
        umap_params: kwargs for UMAP (n_neighbors, n_components, min_dist, ...).
            ``metric`` defaults to "cosine" unless the caller overrides it.
        bertopic_params: extra kwargs forwarded to BERTopic
            (min_topic_size, nr_topics, top_n_words, ...).
    """
    # The pre-refactor pipeline always constructed UMAP with metric='cosine';
    # callers in this file pass only n_neighbors/n_components/min_dist, so
    # restore that default here (caller-supplied 'metric' still wins).
    umap_model = UMAP(**{"metric": "cosine", **umap_params})
    representation_model = KeyBERTInspired()

    return BERTopic(
        language="english",
        verbose=True,
        umap_model=umap_model,
        representation_model=representation_model,
        **bertopic_params
    )
52
 
53
def process_documents(documents: List[Document], topic_model: BERTopic) -> tuple:
    """Fit the topic model on the documents' raw text.

    Args:
        documents: objects exposing ``page_content``.
        topic_model: a (not yet fitted) BERTopic instance.

    Returns:
        (topics, topic_labels): one topic id per document as a plain list,
        and one human-readable label per topic.
    """
    contents = [doc.page_content for doc in documents]
    topics, _ = topic_model.fit_transform(contents)
    topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')

    # fit_transform may hand back a numpy array; normalize to a plain list so
    # the value stored in gr.State is serializable (the pre-refactor code did
    # this conversion too).
    if isinstance(topics, np.ndarray):
        topics = topics.tolist()

    return topics, topic_labels
59
 
60
# 3. Data Manipulation
def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
    """Build one [row-id, topic label, title] row per document for the DataFrame.

    NOTE(review): rows index ``labels`` by raw topic id; if BERTopic ever
    assigns the outlier topic -1 this picks the *last* label — confirm the
    ids produced upstream are always >= 0.
    """
    rows = []
    for position, (document, topic_id) in enumerate(zip(documents, topics)):
        rows.append([str(position), labels[topic_id], document.metadata['Title']])
    return rows
 
 
 
66
 
67
def get_unique_topics(labels: List[str]) -> List[str]:
    """Return the distinct topic labels, sorted alphabetically."""
    distinct = set(labels)
    return sorted(distinct)
69
+
70
def remove_topics(state: Dict, topics_to_remove: List[str]) -> Dict:
    """Drop every document whose topic label is in ``topics_to_remove``.

    ``state['labels']`` is the per-*topic* label list from process_documents
    (one entry per topic), while ``state['documents']``/``state['topics']``
    are per-document. The previous implementation zipped the per-topic
    ``labels`` directly against the per-document lists, so ``zip`` truncated
    at ``len(labels)`` and silently discarded every document past the first
    few. Look each document's label up via its topic id instead.

    Returns a new state dict; ``labels`` is left untouched because the
    surviving topic ids still index into it (callers re-run the topic model
    right afterwards anyway).
    """
    documents, topics, labels = state['documents'], state['topics'], state['labels']
    unwanted = set(topics_to_remove)

    kept = [
        (document, topic_id)
        for document, topic_id in zip(documents, topics)
        if labels[topic_id] not in unwanted
    ]
    new_documents = [document for document, _ in kept]
    new_topics = [topic_id for _, topic_id in kept]

    return {**state, 'documents': new_documents, 'topics': new_topics, 'labels': labels}
79
+
80
# 4. Output Generation
def create_markdown_content(state: Dict) -> str:
    """Render every document, grouped under its topic label, as one markdown string."""
    documents = state['documents']
    topics = state['topics']
    labels = state['labels']

    if not documents or not labels:
        return "No data available for download."

    # Bucket documents under their topic's human-readable label.
    grouped = defaultdict(list)
    for document, topic_id in zip(documents, topics):
        grouped[labels[topic_id]].append(document)

    parts = ["# Arxiv Articles by Topic\n"]
    for label, members in grouped.items():
        parts.append(f"## {label}\n")
        for member in members:
            parts.append(f"### {member.metadata['Title']}")
            parts.append(f"{member.metadata['Summary']}")

    return "\n".join(parts)
99
+
100
# 5. Gradio Interface
def create_gradio_interface():
    """Build the Gradio Blocks UI and wire the upload/reprocess/remove/download events.

    Returns the (unlaunched) gr.Blocks app. All pipeline state lives in a
    single gr.State dict with keys 'documents', 'topics' and 'labels'.
    """
    with gr.Blocks(theme="default") as demo:
        gr.Markdown("# BERT Topic Article Organizer App")
        gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")

        # Shared pipeline state: {'documents': [...], 'topics': [...], 'labels': [...]}.
        state = gr.State(value={})

        with gr.Row():
            file_uploader = gr.UploadButton("Click to upload", file_types=["json"], file_count="single")
            reprocess_button = gr.Button("Reprocess Documents")
            download_button = gr.Button("Download Results")

        # Hyperparameter controls for UMAP (left) and BERTopic (right).
        with gr.Row():
            with gr.Column():
                umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
                umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
                umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
            with gr.Column():
                min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
                # NOTE(review): value="auto" is a string default on a numeric slider;
                # BERTopic itself accepts nr_topics="auto", but gr.Slider may coerce
                # or reject a non-numeric value — confirm the intended default.
                nr_topics = gr.Slider(minimum=1, maximum=100, value="auto", step=1, label="BERTopic nr_topics")
                top_n_words = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="BERTopic top_n_words")
                n_gram_range = gr.Slider(minimum=1, maximum=3, value=1, step=1, label="BERTopic n_gram_range")
                calculate_probabilities = gr.Checkbox(label="Calculate Probabilities", value=False)

        output_matrix = gr.DataFrame(
            label="Processing Result",
            headers=["ID", "Topic", "Title"],
            interactive=False
        )

        with gr.Row():
            topic_dropdown = gr.Dropdown(label="Select Topics to Remove", multiselect=True, interactive=True)
            remove_topics_button = gr.Button("Remove Selected Topics")

        markdown_output = gr.File(label="Download Markdown")

        def update_ui(state: Dict):
            # Derive the DataFrame rows and the dropdown choices from the current state.
            matrix = create_docs_matrix(state['documents'], state['topics'], state['labels'])
            unique_topics = get_unique_topics(state['labels'])
            return matrix, gr.Dropdown(choices=unique_topics, value=[]), unique_topics

        def process_and_update(state: Dict, umap_n_neighbors: int, umap_n_components: int, umap_min_dist: float,
                               min_topic_size: int, nr_topics: int, top_n_words: int, n_gram_range: int,
                               calculate_probabilities: bool):
            # Re-run the whole topic-modeling pipeline on whatever documents are in state.
            documents = state.get('documents', [])
            umap_params = {
                "n_neighbors": umap_n_neighbors,
                "n_components": umap_n_components,
                "min_dist": umap_min_dist
            }
            bertopic_params = {
                "min_topic_size": min_topic_size,
                "nr_topics": nr_topics,
                "top_n_words": top_n_words,
                # Slider supplies only the upper bound; the lower bound is fixed at 1.
                "n_gram_range": (1, n_gram_range),
                "calculate_probabilities": calculate_probabilities
            }

            topic_model = create_topic_model(umap_params, bertopic_params)
            topics, labels = process_documents(documents, topic_model)

            new_state = {**state, 'documents': documents, 'topics': topics, 'labels': labels}
            matrix, dropdown, unique_topics = update_ui(new_state)
            # NOTE(review): four values are returned, but the event wiring below lists
            # topic_dropdown twice in its outputs, so the fourth value (a plain list)
            # overwrites the gr.Dropdown update — confirm this is intentional.
            return new_state, matrix, dropdown, unique_topics

        def load_and_process(file, umap_n_neighbors, umap_n_components, umap_min_dist,
                             min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities):
            # Fresh upload: build a brand-new state from the file, then run the pipeline.
            documents = load_documents_from_file(file.name)
            state = {'documents': documents}
            return process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist,
                                      min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)

        # NOTE(review): topic_dropdown appears twice in each outputs list below;
        # duplicated output components are at best redundant — verify against the
        # installed Gradio version.
        file_uploader.upload(
            fn=load_and_process,
            inputs=[file_uploader, umap_n_neighbors, umap_n_components, umap_min_dist,
                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown, topic_dropdown]
        )

        reprocess_button.click(
            fn=process_and_update,
            inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist,
                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown, topic_dropdown]
        )

        def remove_and_update(state: Dict, topics_to_remove: List[str], umap_n_neighbors: int, umap_n_components: int,
                              umap_min_dist: float, min_topic_size: int, nr_topics: int, top_n_words: int,
                              n_gram_range: int, calculate_probabilities: bool):
            # Filter the selected topics out of state, then re-run the pipeline on the rest.
            new_state = remove_topics(state, topics_to_remove)
            return process_and_update(new_state, umap_n_neighbors, umap_n_components, umap_min_dist,
                                      min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)

        remove_topics_button.click(
            fn=remove_and_update,
            inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist,
                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown, topic_dropdown]
        )

        def create_download_file(state: Dict):
            # Write the markdown export to a temp file; delete=False so Gradio can
            # serve it after the handler returns.
            content = create_markdown_content(state)
            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
                temp_file.write(content)
            return temp_file.name

        download_button.click(
            fn=create_download_file,
            inputs=[state],
            outputs=[markdown_output]
        )

    return demo
 
 
 
 
215
 
216
if __name__ == "__main__":
    # Build the UI and serve it (shared public link, errors surfaced in the UI).
    app = create_gradio_interface()
    app.launch(share=True, show_error=True, max_threads=10, debug=True)