"""Gradio app: seed a Pinecone index from Zotero papers, then recommend
newly scraped arXiv papers whose embeddings exceed a similarity threshold."""

import logging
import os
import time

import gradio as gr
import pandas as pd
from dotenv import load_dotenv
from pinecone import Pinecone

from utils import (
    get_zotero_ids,
    get_arxiv_papers,
    get_hf_embeddings,
    upload_to_pinecone,
    get_new_papers,
    recommend_papers,
)

load_dotenv(".env")

HF_API_KEY = os.getenv("HF_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("INDEX_NAME")
NAMESPACE_NAME = os.getenv("NAMESPACE_NAME")

# Run from the script's own directory so relative paths (arxiv-scrape.csv,
# logfile.log) resolve consistently regardless of the caller's cwd.
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

# UI label -> arXiv category code.
_CATEGORY_CODES = {
    "Computer Vision and Pattern Recognition": "cs.CV",
    "Computation and Language": "cs.CL",
    "Artificial Intelligence": "cs.AI",
    "Robotics": "cs.RO",
}


def category_radio(cat):
    """Map a human-readable category label to its arXiv code.

    Returns None for an unrecognized label (same as the original elif
    chain falling through).
    """
    return _CATEGORY_CODES.get(cat)


def comment_radio(com):
    """Return the selected arXiv comment query, or None when 'None' is picked."""
    return None if com == "None" else com


def reset_csv():
    """Delete the scraped-papers CSV so the next run starts from scratch."""
    file_path = "arxiv-scrape.csv"
    if os.path.exists(file_path):
        os.remove(file_path)
        logging.info(
            f"{file_path} has been deleted. Delete reset_project() if you want to persist recommended papers."
        )


def reset_project():
    """Delete the scraped CSV and the Pinecone index; return a status message.

    Used by the Reset button to wipe all persisted recommendation state.
    """
    file_path = "arxiv-scrape.csv"
    # Same CSV cleanup the standalone reset_csv() performs — reuse it
    # instead of duplicating the delete-and-log logic.
    reset_csv()
    api_key = os.getenv("PINECONE_API_KEY")
    index = os.getenv("INDEX_NAME")
    pc = Pinecone(api_key=api_key)
    if index in pc.list_indexes().names():
        pc.delete_index(index)
        logging.info(
            f"{index} index has been deleted from the vectordb. Delete reset_project() if you want to persist recommended papers."
        )
    return (
        f"{file_path} has been deleted.\n"
        f"{index} index has been deleted from the vectordb.\n"
    )


with gr.Blocks() as demo:
    # --- Zotero credentials (pre-filled from the environment) ---
    zotero_api_key = gr.Textbox(
        label="Zotero API Key", type="password", value=os.getenv("ZOTERO_API_KEY")
    )
    zotero_library_id = gr.Textbox(
        label="Zotero Library ID", value=os.getenv("ZOTERO_LIBRARY_ID")
    )
    zotero_tag = gr.Textbox(label="Zotero Tag", value=os.getenv("ZOTERO_TAG"))

    # --- arXiv query controls ---
    arxiv_category_name = gr.State([])
    radio_arxiv_category_name = gr.Radio(
        [
            "Computer Vision and Pattern Recognition",
            "Computation and Language",
            "Artificial Intelligence",
            "Robotics",
        ],
        # Radio takes a single selected value, not a list.
        value="Computer Vision and Pattern Recognition",
        label="ArXiv Category Query",
    )
    radio_arxiv_category_name.change(
        fn=category_radio, inputs=radio_arxiv_category_name, outputs=arxiv_category_name
    )

    arxiv_comment_query = gr.State([])
    radio_arxiv_comment_query = gr.Radio(
        ["CVPR", "ACL", "TACL", "JAIR", "IJRR", "None"],
        # Radio takes a single selected value, not a list.
        value="CVPR",
        label="ArXiv Comment Query",
    )
    radio_arxiv_comment_query.change(
        fn=comment_radio, inputs=radio_arxiv_comment_query, outputs=arxiv_comment_query
    )

    threshold = gr.Slider(
        minimum=0.70, maximum=0.99, value=0.80, label="Similarity Score Threshold"
    )

    # --- Outputs and action buttons ---
    init_output = gr.Textbox(label="Project Initialization Result")
    rec_output = gr.Markdown(label="Recommended Papers")
    reset_output = gr.Markdown(label="Reset Declaration")
    init_btn = gr.Button("Initialize")
    rec_btn = gr.Button("Recommend")
    reset_btn = gr.Button("Reset")

    reset_btn.click(fn=reset_project, inputs=[], outputs=[reset_output])

    @init_btn.click(
        inputs=[zotero_api_key, zotero_library_id, zotero_tag],
        outputs=[init_output],
        trigger_mode="once",
    )
    def init(
        zotero_api_key,
        zotero_library_id,
        zotero_tag,
        hf_api_key=HF_API_KEY,
        pinecone_api_key=PINECONE_API_KEY,
        index_name=INDEX_NAME,
        namespace_name=NAMESPACE_NAME,
    ):
        """Fetch Zotero papers' arXiv metadata, embed them, and upsert into Pinecone.

        Returns a human-readable status string for the UI textbox.
        """
        logging.basicConfig(
            filename="logfile.log",
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
        )
        logging.info("Project Initialization Script Started (Serverless)")
        ids = get_zotero_ids(zotero_api_key, zotero_library_id, zotero_tag)
        df = get_arxiv_papers(ids)
        embeddings, dim = get_hf_embeddings(hf_api_key, df)
        feedback = upload_to_pinecone(
            pinecone_api_key, index_name, namespace_name, embeddings, dim, df
        )
        logging.info(feedback)
        # upload_to_pinecone returns a dict on success, an error string otherwise.
        if isinstance(feedback, dict):
            return f"Retrieved {len(ids)} papers from Zotero. Successfully upserted {feedback['upserted_count']} embeddings in {namespace_name} namespace."
        return feedback

    @rec_btn.click(
        inputs=[arxiv_category_name, arxiv_comment_query, threshold],
        outputs=[rec_output],
        trigger_mode="once",
    )
    def recs(
        arxiv_category_name,
        arxiv_comment_query,
        threshold,
        hf_api_key=HF_API_KEY,
        pinecone_api_key=PINECONE_API_KEY,
        index_name=INDEX_NAME,
        namespace_name=NAMESPACE_NAME,
    ):
        """Scrape fresh arXiv papers, embed them, and return recommendations.

        Returns a Markdown string of recommendations, or the error message
        produced by get_new_papers when no new papers are available.
        """
        logging.info("Weekly Script Started (Serverless)")
        df = get_arxiv_papers(category=arxiv_category_name, comment=arxiv_comment_query)
        df = get_new_papers(df)
        # get_new_papers returns a message string (not a DataFrame) when
        # there is nothing new to recommend — pass it straight to the UI.
        if not isinstance(df, pd.DataFrame):
            return df
        embeddings, _ = get_hf_embeddings(hf_api_key, df)
        # NOTE(review): threshold is scaled by 3 before reaching
        # recommend_papers (slider max 0.99 -> 2.97). Presumably the helper
        # compares against a summed/aggregated score — confirm against utils.
        results = recommend_papers(
            pinecone_api_key, index_name, namespace_name, embeddings, df, threshold * 3
        )
        return results

    csv_display = gr.DataFrame(
        label="ArXiv Scraped Papers", visible=True, show_label=False, interactive=False
    )

    def update_csv():
        """Poll arxiv-scrape.csv once per second and stream it to the table view."""
        while True:
            time.sleep(1)
            try:
                df = pd.read_csv("arxiv-scrape.csv")
            except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError):
                # CSV missing (e.g. after a reset) or mid-write/unparseable:
                # show an empty table instead of crashing the stream.
                df = pd.DataFrame()
            yield df

    css = """
    button:contains("Generate") {
        display: none !important;
    }
    """

    gr.Interface(
        fn=update_csv,
        inputs=None,
        outputs=csv_display,
        clear_btn=None,
        live=True,
        allow_flagging="never",
        theme="default",
        css=css,
    )

demo.launch(share=True)