Spaces:

diegomrodrigues
/

bert-topic-gradio

Sleeping

App Files Community

diegomrodrigues committed on Jul 19, 2024

Commit

ed238a0

verified ·

1 Parent(s): 207f469

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +163 -171

app.py CHANGED Viewed

@@ -8,155 +8,120 @@ from bertopic import BERTopic
 from bertopic.representation import KeyBERTInspired
 from umap import UMAP
 import numpy as np
 from collections import defaultdict
 class CustomArxivLoader(ArxivLoader):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
     def lazy_load(self) -> Iterator[Document]:
         documents = super().lazy_load()
-        def update_metadata(documents):
-            for document in documents:
-                yield Document(
-                    page_content=document.page_content,
-                    metadata={
-                        **document.metadata,
-                        "ArxivId": self.query,
-                        "Source": f"https://arxiv.org/pdf/{self.query}.pdf"
-                    }
-                )
-        return update_metadata(documents)
-def upload_file(file):
-    if not ".json" in file.name:
-        return "Not Allowed"
-    print(f"Processing file: {file.name}")
-    with open(file.name, "r") as f:
         results = json.load(f)
     arxiv_urls = results["collected_urls"]["arxiv.org"]
-    print(f"Collected {len(arxiv_urls)} arxiv urls from file.")
-    arxiv_ids = map(lambda url: url.split("/")[-1].strip(".pdf"), arxiv_urls)
-    all_loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
-    merged_loader = MergedDataLoader(loaders=all_loaders)
-    documents = merged_loader.load()
-    print(f"Loaded {len(documents)} documents from file.")
-    return documents
-def process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
-    if not documents:
-        return "No documents to process. Please upload a file first."
-    contents = [doc.page_content for doc in documents]
     representation_model = KeyBERTInspired()
-    umap_model = UMAP(
-        n_neighbors=umap_n_neighbors,
-        n_components=umap_n_components,
-        min_dist=umap_min_dist,
-        metric='cosine'
-    )
-    topic_model = BERTopic(
         language="english",
         verbose=True,
         umap_model=umap_model,
-        min_topic_size=min_topic_size,
         representation_model=representation_model,
-        nr_topics=nr_topics
     )
     topics, _ = topic_model.fit_transform(contents)
     topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
-    print(f"Generated {len(topic_labels)} topics from data.")
-    print("Topic Labels: ", topic_labels)
-    return documents, topics.tolist() if isinstance(topics, np.ndarray) else topics, topic_labels
 def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
-    if not documents:
-        return []
-    results = []
-    for i, (doc, topic) in enumerate(zip(documents, topics)):
-        label = labels[topic]
-        results.append([str(i), label, doc.metadata['Title']])
-    return results
 def get_unique_topics(labels: List[str]) -> List[str]:
-    return list(set(labels))
-def remove_topics(documents: List[Document], topics: List[int], labels: List[str], topics_to_remove: List[str]) -> tuple:
-    new_documents = []
-    new_topics = []
-    new_labels = []
-    for doc, topic, label in zip(documents, topics, labels):
-        if label not in topics_to_remove:
-            new_documents.append(doc)
-            new_topics.append(topic)
-            new_labels.append(label)
-    return new_documents, new_topics, new_labels
-def create_markdown_content(documents: List[Document], labels: List[str]) -> str:
     if not documents or not labels:
         return "No data available for download."
     topic_documents = defaultdict(list)
-    for doc, label in zip(documents, labels):
         topic_documents[label].append(doc)
-    full_text = "# Arxiv Articles by Topic\n\n"
     for topic, docs in topic_documents.items():
-        full_text += f"## {topic}\n\n"
         for document in docs:
-            full_text += f"### {document.metadata['Title']}\n\n"
-            full_text += f"{document.metadata['Summary']}\n\n"
-    return full_text
-with gr.Blocks(theme="default") as demo:
-    gr.Markdown("# Bert Topic Article Organizer App")
-    gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")
-    state = gr.State(value=[])
-    with gr.Row():
-        file_uploader = gr.UploadButton(
-            "Click to upload",
-            file_types=["json"],
-            file_count="single"
-        )
-        reprocess_button = gr.Button("Reprocess Documents")
-        download_button = gr.Button("Download Results")
-    with gr.Row():
-        with gr.Column():
-            umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
-            umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
-            umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
-        with gr.Column():
-            min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
-            nr_topics = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic nr_topics")
-    with gr.Row():
         output_matrix = gr.DataFrame(
             label="Processing Result",
             headers=["ID", "Topic", "Title"],
@@ -164,63 +129,90 @@ with gr.Blocks(theme="default") as demo:
             interactive=False
         )
-    with gr.Row():
-        topic_dropdown = gr.Dropdown(
-            label="Select Topics to Remove",
-            multiselect=True,
-            interactive=True
         )
-        remove_topics_button = gr.Button("Remove Selected Topics")
-    markdown_output = gr.File(label="Download Markdown", visible=False)
-    def update_ui(documents, topics, labels):
-        matrix = create_docs_matrix(documents, topics, labels)
-        unique_topics = get_unique_topics(labels)
-        return matrix, unique_topics
-    def process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
-        documents = state if state else []
-        new_documents, new_topics, new_labels = process_documents(documents, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
-        matrix, unique_topics = update_ui(new_documents, new_topics, new_labels)
-        return [new_documents, new_topics, new_labels], matrix, unique_topics
-    file_uploader.upload(
-        fn=lambda file: upload_file(file),
-        inputs=[file_uploader],
-        outputs=[state]
-    ).then(
-        fn=process_and_update,
-        inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
-        outputs=[state, output_matrix, topic_dropdown]
-    )
-    reprocess_button.click(
-        fn=process_and_update,
-        inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
-        outputs=[state, output_matrix, topic_dropdown]
-    )
-    def remove_and_update(state, topics_to_remove, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics):
-        documents, topics, labels = state
-        new_documents, new_topics, new_labels = remove_topics(documents, topics, labels, topics_to_remove)
-        return process_and_update([new_documents, new_topics, new_labels], umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics)
-    remove_topics_button.click(
-        fn=remove_and_update,
-        inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist, min_topic_size, nr_topics],
-        outputs=[state, output_matrix, topic_dropdown]
-    )
-    def create_download_file(state):
-        documents, _, labels = state
-        content = create_markdown_content(documents, labels)
-        return gr.File(value=content, visible=True, filename="arxiv_articles_by_topic.md")
-    download_button.click(
-        fn=create_download_file,
-        inputs=[state],
-        outputs=[markdown_output]
-    )
-demo.launch(share=True, show_error=True, max_threads=10, debug=True)

 from bertopic.representation import KeyBERTInspired
 from umap import UMAP
 import numpy as np
+import tempfile
 from collections import defaultdict
+# 1. Data Loading
 class CustomArxivLoader(ArxivLoader):
     def lazy_load(self) -> Iterator[Document]:
         documents = super().lazy_load()
+        for document in documents:
+            yield Document(
+                page_content=document.page_content,
+                metadata={
+                    **document.metadata,
+                    "ArxivId": self.query,
+                    "Source": f"https://arxiv.org/pdf/{self.query}.pdf"
+                }
+            )
+def load_documents_from_file(file_path: str) -> List[Document]:
+    with open(file_path, "r") as f:
         results = json.load(f)
     arxiv_urls = results["collected_urls"]["arxiv.org"]
+    arxiv_ids = [url.split("/")[-1].strip(".pdf") for url in arxiv_urls]
+    loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
+    merged_loader = MergedDataLoader(loaders=loaders)
+    return merged_loader.load()
+# 2. Topic Modeling
+def create_topic_model(umap_params: Dict, bertopic_params: Dict) -> BERTopic:
+    umap_model = UMAP(**umap_params)
     representation_model = KeyBERTInspired()
+    return BERTopic(
         language="english",
         verbose=True,
         umap_model=umap_model,
         representation_model=representation_model,
+        **bertopic_params
     )
+def process_documents(documents: List[Document], topic_model: BERTopic) -> tuple:
+    contents = [doc.page_content for doc in documents]
     topics, _ = topic_model.fit_transform(contents)
     topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
+    return topics, topic_labels
+# 3. Data Manipulation
 def create_docs_matrix(documents: List[Document], topics: List[int], labels: List[str]) -> List[List[str]]:
+    return [
+        [str(i), labels[topic], doc.metadata['Title']]
+        for i, (doc, topic) in enumerate(zip(documents, topics))
+    ]
 def get_unique_topics(labels: List[str]) -> List[str]:
+    return sorted(set(labels))
+def remove_topics(state: Dict, topics_to_remove: List[str]) -> Dict:
+    documents, topics, labels = state['documents'], state['topics'], state['labels']
+    filtered_data = [
+        (doc, topic, label)
+        for doc, topic, label in zip(documents, topics, labels)
+        if label not in topics_to_remove
+    ]
+    new_documents, new_topics, new_labels = map(list, zip(*filtered_data)) if filtered_data else ([], [], [])
+    return {**state, 'documents': new_documents, 'topics': new_topics, 'labels': new_labels}
+# 4. Output Generation
+def create_markdown_content(state: Dict) -> str:
+    documents, topics, labels = state['documents'], state['topics'], state['labels']
     if not documents or not labels:
         return "No data available for download."
     topic_documents = defaultdict(list)
+    for doc, topic in zip(documents, topics):
+        label = labels[topic]
         topic_documents[label].append(doc)
+    content = ["# Arxiv Articles by Topic\n"]
     for topic, docs in topic_documents.items():
+        content.append(f"## {topic}\n")
         for document in docs:
+            content.append(f"### {document.metadata['Title']}")
+            content.append(f"{document.metadata['Summary']}")
+    return "\n".join(content)
+# 5. Gradio Interface
+def create_gradio_interface():
+    with gr.Blocks(theme="default") as demo:
+        gr.Markdown("# BERT Topic Article Organizer App")
+        gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")
+        state = gr.State(value={})
+        with gr.Row():
+            file_uploader = gr.UploadButton("Click to upload", file_types=["json"], file_count="single")
+            reprocess_button = gr.Button("Reprocess Documents")
+            download_button = gr.Button("Download Results")
+        with gr.Row():
+            with gr.Column():
+                umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
+                umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
+                umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
+            with gr.Column():
+                min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
+                nr_topics = gr.Slider(minimum=1, maximum=100, value="auto", step=1, label="BERTopic nr_topics")
+                top_n_words = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="BERTopic top_n_words")
+                n_gram_range = gr.Slider(minimum=1, maximum=3, value=1, step=1, label="BERTopic n_gram_range")
+                calculate_probabilities = gr.Checkbox(label="Calculate Probabilities", value=False)
         output_matrix = gr.DataFrame(
             label="Processing Result",
             headers=["ID", "Topic", "Title"],
             interactive=False
         )
+        with gr.Row():
+            topic_dropdown = gr.Dropdown(label="Select Topics to Remove", multiselect=True, interactive=True)
+            remove_topics_button = gr.Button("Remove Selected Topics")
+        markdown_output = gr.File(label="Download Markdown")
+        def update_ui(state: Dict):
+            matrix = create_docs_matrix(state['documents'], state['topics'], state['labels'])
+            unique_topics = get_unique_topics(state['labels'])
+            return matrix, gr.Dropdown(choices=unique_topics, value=[]), unique_topics
+        def process_and_update(state: Dict, umap_n_neighbors: int, umap_n_components: int, umap_min_dist: float,
+                               min_topic_size: int, nr_topics: int, top_n_words: int, n_gram_range: int,
+                               calculate_probabilities: bool):
+            documents = state.get('documents', [])
+            umap_params = {
+                "n_neighbors": umap_n_neighbors,
+                "n_components": umap_n_components,
+                "min_dist": umap_min_dist
+            }
+            bertopic_params = {
+                "min_topic_size": min_topic_size,
+                "nr_topics": nr_topics,
+                "top_n_words": top_n_words,
+                "n_gram_range": (1, n_gram_range),
+                "calculate_probabilities": calculate_probabilities
+            }
+            topic_model = create_topic_model(umap_params, bertopic_params)
+            topics, labels = process_documents(documents, topic_model)
+            new_state = {**state, 'documents': documents, 'topics': topics, 'labels': labels}
+            matrix, dropdown, unique_topics = update_ui(new_state)
+            return new_state, matrix, dropdown, unique_topics
+        def load_and_process(file, umap_n_neighbors, umap_n_components, umap_min_dist,
+                             min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities):
+            documents = load_documents_from_file(file.name)
+            state = {'documents': documents}
+            return process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist,
+                                      min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)
+        file_uploader.upload(
+            fn=load_and_process,
+            inputs=[file_uploader, umap_n_neighbors, umap_n_components, umap_min_dist,
+                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
+            outputs=[state, output_matrix, topic_dropdown, topic_dropdown]
         )
+        reprocess_button.click(
+            fn=process_and_update,
+            inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist,
+                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
+            outputs=[state, output_matrix, topic_dropdown, topic_dropdown]
+        )
+        def remove_and_update(state: Dict, topics_to_remove: List[str], umap_n_neighbors: int, umap_n_components: int,
+                              umap_min_dist: float, min_topic_size: int, nr_topics: int, top_n_words: int,
+                              n_gram_range: int, calculate_probabilities: bool):
+            new_state = remove_topics(state, topics_to_remove)
+            return process_and_update(new_state, umap_n_neighbors, umap_n_components, umap_min_dist,
+                                      min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)
+        remove_topics_button.click(
+            fn=remove_and_update,
+            inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist,
+                    min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
+            outputs=[state, output_matrix, topic_dropdown, topic_dropdown]
+        )
+        def create_download_file(state: Dict):
+            content = create_markdown_content(state)
+            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
+                temp_file.write(content)
+            return temp_file.name
+        download_button.click(
+            fn=create_download_file,
+            inputs=[state],
+            outputs=[markdown_output]
+        )
+    return demo
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.launch(share=True, show_error=True, max_threads=10, debug=True)