import gradio as gr
import json
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain_core.documents import Document
from typing import Iterator, List, Dict
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from umap import UMAP
import numpy as np
import tempfile
from collections import defaultdict
# 1. Data Loading
class CustomArxivLoader(ArxivLoader):
    """ArxivLoader that stamps each document with its arXiv ID and a PDF link."""

    def lazy_load(self) -> Iterator[Document]:
documents = super().lazy_load()
for document in documents:
yield Document(
page_content=document.page_content,
metadata={
**document.metadata,
"ArxivId": self.query,
"Source": f"https://arxiv.org/pdf/{self.query}.pdf"
}
)
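# After loading, each document's metadata looks roughly like the sketch below.
# Title, Authors, Summary, and Published come from ArxivLoader itself; ArxivId
# and Source are added by the override above (the values are illustrative):
#
# {
#     "Published": "2017-06-12",
#     "Title": "Attention Is All You Need",
#     "Authors": "Ashish Vaswani, ...",
#     "Summary": "...",
#     "ArxivId": "1706.03762",
#     "Source": "https://arxiv.org/pdf/1706.03762.pdf",
# }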
def load_documents_from_file(file_path: str) -> List[Document]:
    with open(file_path, "r") as f:
        results = json.load(f)
    arxiv_urls = results["collected_urls"]["arxiv.org"]
    # strip(".pdf") would trim any leading/trailing '.', 'p', 'd', 'f'
    # characters from the ID; removesuffix only drops the literal extension.
    arxiv_ids = [url.split("/")[-1].removesuffix(".pdf") for url in arxiv_urls]
    loaders = [CustomArxivLoader(query=arxiv_id) for arxiv_id in arxiv_ids]
    merged_loader = MergedDataLoader(loaders=loaders)
    return merged_loader.load()
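# A minimal sketch of the JSON file this loader expects, inferred from the
# lookups above (the URLs are only illustrative):
#
# {
#     "collected_urls": {
#         "arxiv.org": [
#             "https://arxiv.org/pdf/1706.03762.pdf",
#             "https://arxiv.org/abs/2103.00020"
#         ]
#     }
# }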
# 2. Topic Modeling
def create_topic_model(umap_params: Dict, bertopic_params: Dict) -> BERTopic:
umap_model = UMAP(**umap_params)
representation_model = KeyBERTInspired()
return BERTopic(
language="english",
verbose=True,
umap_model=umap_model,
representation_model=representation_model,
**bertopic_params
)
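# A minimal sketch of how this factory is called. The dictionaries mirror the
# keyword arguments of UMAP and BERTopic and match the sliders wired up in
# process_and_update below:
#
# topic_model = create_topic_model(
#     umap_params={"n_neighbors": 15, "n_components": 5, "min_dist": 0.1},
#     bertopic_params={"min_topic_size": 10, "nr_topics": 10, "top_n_words": 10,
#                      "n_gram_range": (1, 1), "calculate_probabilities": False},
# )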
def process_documents(documents: List[Document], topic_model: BERTopic) -> tuple:
    contents = [doc.page_content for doc in documents]
    topics, _ = topic_model.fit_transform(contents)
    # generate_topic_labels returns one label per topic, sorted by topic ID
    # (the -1 outlier topic comes first when present), so indexing it with a
    # raw topic ID is off by one. Map IDs to labels explicitly and return one
    # label per document so downstream code can zip over documents and labels.
    topic_labels = topic_model.generate_topic_labels(nr_words=3, topic_prefix=False, separator=' ')
    label_by_topic = dict(zip(sorted(set(topics)), topic_labels))
    labels = [label_by_topic[topic] for topic in topics]
    return topics, labels
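# For example (labels are illustrative): topics = [0, -1, 0] with
# topic_labels = ["misc outlier words", "attention transformer architecture"]
# yields labels = ["attention transformer architecture", "misc outlier words",
# "attention transformer architecture"].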
# 3. Data Manipulation
def create_docs_matrix(documents: List[Document], labels: List[str]) -> List[List[str]]:
    # labels is aligned with documents (one label per document), so zip directly.
    return [
        [str(i), label, doc.metadata['Title']]
        for i, (doc, label) in enumerate(zip(documents, labels))
    ]
def get_unique_topics(labels: List[str]) -> List[str]:
return sorted(set(labels))
def remove_topics(state: Dict, topics_to_remove: List[str]) -> Dict:
documents, topics, labels = state['documents'], state['topics'], state['labels']
filtered_data = [
(doc, topic, label)
for doc, topic, label in zip(documents, topics, labels)
if label not in topics_to_remove
]
new_documents, new_topics, new_labels = map(list, zip(*filtered_data)) if filtered_data else ([], [], [])
return {**state, 'documents': new_documents, 'topics': new_topics, 'labels': new_labels}
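# For example, selecting the label "graph neural networks" in the UI drops
# every document currently carrying that label; remove_and_update below then
# re-fits the model on the survivors, so topics and labels stay consistent.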
# 4. Output Generation
def create_markdown_content(state: Dict) -> str:
    documents, labels = state['documents'], state['labels']
    if not documents or not labels:
        return "No data available for download."
    topic_documents = defaultdict(list)
    # labels is aligned with documents, so group by zipping rather than indexing.
    for doc, label in zip(documents, labels):
        topic_documents[label].append(doc)
    content = ["# Arxiv Articles by Topic\n"]
    for topic, docs in topic_documents.items():
        content.append(f"## {topic}\n")
        for document in docs:
            content.append(f"### {document.metadata['Title']}")
            content.append(f"{document.metadata['Summary']}")
    return "\n".join(content)
# 5. Gradio Interface
def create_gradio_interface():
with gr.Blocks(theme="default") as demo:
gr.Markdown("# BERT Topic Article Organizer App")
gr.Markdown("Organizes arxiv articles in different topics and exports it in a zip file.")
state = gr.State(value={})
with gr.Row():
file_uploader = gr.UploadButton("Click to upload", file_types=["json"], file_count="single")
reprocess_button = gr.Button("Reprocess Documents")
download_button = gr.Button("Download Results")
with gr.Row():
with gr.Column():
umap_n_neighbors = gr.Slider(minimum=2, maximum=100, value=15, step=1, label="UMAP n_neighbors")
umap_n_components = gr.Slider(minimum=2, maximum=100, value=5, step=1, label="UMAP n_components")
umap_min_dist = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, step=0.01, label="UMAP min_dist")
with gr.Column():
min_topic_size = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic min_topic_size")
                # gr.Slider needs a numeric default; BERTopic also accepts
                # nr_topics="auto", but a slider cannot express that value.
                nr_topics = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="BERTopic nr_topics")
top_n_words = gr.Slider(minimum=5, maximum=50, value=10, step=1, label="BERTopic top_n_words")
n_gram_range = gr.Slider(minimum=1, maximum=3, value=1, step=1, label="BERTopic n_gram_range")
calculate_probabilities = gr.Checkbox(label="Calculate Probabilities", value=False)
output_matrix = gr.DataFrame(
label="Processing Result",
headers=["ID", "Topic", "Title"],
col_count=(3, "fixed"),
interactive=False
)
with gr.Row():
topic_dropdown = gr.Dropdown(label="Select Topics to Remove", multiselect=True, interactive=True)
remove_topics_button = gr.Button("Remove Selected Topics")
markdown_output = gr.File(label="Download Markdown")
        def update_ui(state: Dict):
            matrix = create_docs_matrix(state['documents'], state['labels'])
            unique_topics = get_unique_topics(state['labels'])
            # Return a Dropdown update so its choices track the current topics;
            # the caller forwards this straight to the topic_dropdown output.
            return matrix, gr.Dropdown(choices=unique_topics, value=[])
def process_and_update(state: Dict, umap_n_neighbors: int, umap_n_components: int, umap_min_dist: float,
min_topic_size: int, nr_topics: int, top_n_words: int, n_gram_range: int,
calculate_probabilities: bool):
            documents = state.get('documents', [])
            if not documents:
                raise gr.Error("No documents loaded. Upload a JSON results file first.")
umap_params = {
"n_neighbors": umap_n_neighbors,
"n_components": umap_n_components,
"min_dist": umap_min_dist
}
bertopic_params = {
"min_topic_size": min_topic_size,
"nr_topics": nr_topics,
"top_n_words": top_n_words,
"n_gram_range": (1, n_gram_range),
"calculate_probabilities": calculate_probabilities
}
topic_model = create_topic_model(umap_params, bertopic_params)
topics, labels = process_documents(documents, topic_model)
new_state = {**state, 'documents': documents, 'topics': topics, 'labels': labels}
            matrix, dropdown = update_ui(new_state)
            return new_state, matrix, dropdown
def load_and_process(file, umap_n_neighbors, umap_n_components, umap_min_dist,
min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities):
documents = load_documents_from_file(file.name)
state = {'documents': documents}
return process_and_update(state, umap_n_neighbors, umap_n_components, umap_min_dist,
min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)
file_uploader.upload(
fn=load_and_process,
inputs=[file_uploader, umap_n_neighbors, umap_n_components, umap_min_dist,
min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown]
)
reprocess_button.click(
fn=process_and_update,
inputs=[state, umap_n_neighbors, umap_n_components, umap_min_dist,
min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown]
)
def remove_and_update(state: Dict, topics_to_remove: List[str], umap_n_neighbors: int, umap_n_components: int,
umap_min_dist: float, min_topic_size: int, nr_topics: int, top_n_words: int,
n_gram_range: int, calculate_probabilities: bool):
new_state = remove_topics(state, topics_to_remove)
return process_and_update(new_state, umap_n_neighbors, umap_n_components, umap_min_dist,
min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities)
remove_topics_button.click(
fn=remove_and_update,
inputs=[state, topic_dropdown, umap_n_neighbors, umap_n_components, umap_min_dist,
min_topic_size, nr_topics, top_n_words, n_gram_range, calculate_probabilities],
            outputs=[state, output_matrix, topic_dropdown]
)
def create_download_file(state: Dict):
content = create_markdown_content(state)
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md") as temp_file:
temp_file.write(content)
return temp_file.name
download_button.click(
fn=create_download_file,
inputs=[state],
outputs=[markdown_output]
)
return demo
if __name__ == "__main__":
demo = create_gradio_interface()
demo.launch(share=True, show_error=True, max_threads=10, debug=True)