Spaces:

chansung
/

paper_qa

Runtime error

App Files Files Community

chansung committed on Mar 9, 2024

Commit

7e4123a

1 Parent(s): 9ea7354

major update

Browse files

Files changed (12) hide show

app.py +92 -413
background.py +93 -0
constants/context.py +13 -0
constants/js.py +109 -10
constants/styles.py +97 -2
gen/gemini.py +1 -1
gen/gemini_chat.py +129 -0
gen/openllm.py +178 -0
init.py +100 -0
requirements.txt +5 -1
ui.py +264 -0
utils.py +5 -12

app.py CHANGED Viewed

@@ -1,80 +1,26 @@
-import os
-import re
-import copy
-import datasets
-import pandas as pd
 import gradio as gr
-from collections import defaultdict
-from datetime import datetime, timedelta
-from datasets import Dataset
-from huggingface_hub import HfApi
-from huggingface_hub import create_repo
-from huggingface_hub.utils import HfHubHTTPError
-import utils
-from paper.download import (
-    download_pdf_from_arxiv,
-    get_papers_from_hf_daily_papers,
-    get_papers_from_arxiv_ids
-)
-from paper.parser import extract_text_and_figures
-from gen.gemini import get_basic_qa, get_deep_qa
 from constants.styles import STYLE
-from constants.js import UPDATE_SEARCH_RESULTS, UPDATE_IF_TYPE
-from constants.utils import get_secrets
 from apscheduler.schedulers.background import BackgroundScheduler
-def count_nans(row):
-    count = 0
-    for _, (k, v) in enumerate(data.items()):
-        if v is None:
-            count = count + 1
-    return count
-gemini_api_key, hf_token, dataset_repo_id, request_arxiv_repo_id = get_secrets()
-ds = datasets.load_dataset(dataset_repo_id)
-request_ds = datasets.load_dataset(request_arxiv_repo_id)
-requested_arxiv_ids = []
-for request_d in request_ds['train']:
-    arxiv_ids = request_d['Requested arXiv IDs']
-    requested_arxiv_ids = requested_arxiv_ids + arxiv_ids
-requested_arxiv_ids_df = pd.DataFrame({'Requested arXiv IDs': requested_arxiv_ids})
-title2qna = {}
-date2qna = {}
-date_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
-for data in ds["train"]:
-    date = data["target_date"].strftime("%Y-%m-%d")
-    if date in date2qna:
-        papers = copy.deepcopy(date2qna[date])
-        for paper in papers:
-            if paper["title"] == data["title"]:
-                if count_nans(paper) > count_nans(data):
-                    date2qna[date].remove(paper)
-        date2qna[date].append(data)
-        del papers
-    else:
-        date2qna[date] = [data]
-for date in date2qna:
-    year, month, day = date.split("-")
-    papers = date2qna[date]
-    for paper in papers:
-        title2qna[paper["title"]] = paper
-        date_dict[year][month][day].append(paper)
-titles = title2qna.keys()
-sorted_dates = sorted(date2qna.keys())
 sorted_year = sorted(date_dict.keys())
 last_year = sorted_year[-1]
@@ -85,301 +31,28 @@ last_day = sorted_day[-1]
 last_papers = date_dict[last_year][last_month][last_day]
 selected_paper = last_papers[0]
-def filter_function(example, ids):
-    ids_e = example['Requested arXiv IDs']
-    for iid in ids:
-        if iid in ids_e:
-            ids_e.remove(iid)
-            example['Requested arXiv IDs'] = ids_e
-    print(example)
-    return example
-def process_arxiv_ids(gemini_api, hf_repo_id, req_hf_repo_id, hf_token, how_many=10):
-    arxiv_ids = []
-    ds1 = datasets.load_dataset(req_hf_repo_id)
-    for d in ds1['train']:
-        req_arxiv_ids = d['Requested arXiv IDs']
-        if len(req_arxiv_ids) > 0 and req_arxiv_ids[0] != "top":
-            arxiv_ids = arxiv_ids + req_arxiv_ids
-    arxiv_ids = arxiv_ids[:how_many]
-    if arxiv_ids is not None and len(arxiv_ids) > 0:
-        print(f"1. Get metadata for the papers [{arxiv_ids}]")
-        papers = get_papers_from_arxiv_ids(arxiv_ids)
-        print("...DONE")
-        print("2. Generating QAs for the paper")
-        for paper in papers:
-            try:
-                title = paper['title']
-                target_date = paper['target_date']
-                abstract = paper['paper']['summary']
-                arxiv_id = paper['paper']['id']
-                authors = paper['paper']['authors']
-                print(f"...PROCESSING ON[{arxiv_id}, {title}]")
-                print(f"......Downloading the paper PDF")
-                filename = download_pdf_from_arxiv(arxiv_id)
-                print(f"......DONE")
-                print(f"......Extracting text and figures")
-                texts, figures = extract_text_and_figures(filename)
-                text =' '.join(texts)
-                print(f"......DONE")
-                print(f"......Generating the seed(basic) QAs")
-                qnas = get_basic_qa(text, gemini_api_key=gemini_api, trucate=30000)
-                qnas['title'] = title
-                qnas['abstract'] = abstract
-                qnas['authors'] = ','.join(authors)
-                qnas['arxiv_id'] = arxiv_id
-                qnas['target_date'] = target_date
-                qnas['full_text'] = text
-                print(f"......DONE")
-                print(f"......Generating the follow-up QAs")
-                qnas = get_deep_qa(text, qnas, gemini_api_key=gemini_api, trucate=30000)
-                del qnas["qna"]
-                print(f"......DONE")
-                print(f"......Exporting to HF Dataset repo at [{hf_repo_id}]")
-                utils.push_to_hf_hub(qnas, hf_repo_id, hf_token)
-                print(f"......DONE")
-                print(f"......Updating request arXiv HF Dataset repo at [{req_hf_repo_id}]")
-                ds1 = ds1['train'].map(
-                    lambda example: filter_function(example, [arxiv_id])
-                ).filter(
-                    lambda example: len(example['Requested arXiv IDs']) > 0
-                )
-                ds1.push_to_hub(req_hf_repo_id, token=hf_token)
-                print(f"......DONE")
-            except Exception as e:
-                print(f".......failed due to exception {e}")
-                continue
-        HfApi(token=hf_token).restart_space(
-            repo_id="chansung/paper_qa", token=hf_token
-        )
-def push_to_hf_hub(
-    df, repo_id, token, append=True
-):
-    exist = False
-    ds = Dataset.from_pandas(df)
-    try:
-        create_repo(request_arxiv_repo_id, repo_type="dataset", token=hf_token)
-    except HfHubHTTPError as e:
-        exist = True
-    if exist and append:
-        existing_ds = datasets.load_dataset(repo_id)
-        ds = datasets.concatenate_datasets([existing_ds['train'], ds])
-    ds.push_to_hub(repo_id, token=token)
-def _filter_duplicate_arxiv_ids(arxiv_ids_to_be_added):
-    ds1 = datasets.load_dataset("chansung/requested-arxiv-ids-3")
-    ds2 = datasets.load_dataset("chansung/auto-paper-qa2")
-    unique_arxiv_ids = set()
-    for d in ds1['train']:
-        arxiv_ids = d['Requested arXiv IDs']
-        unique_arxiv_ids = set(list(unique_arxiv_ids) + arxiv_ids)
-    for d in ds2['train']:
-        arxiv_id = d['arxiv_id']
-        unique_arxiv_ids.add(arxiv_id)
-    return list(set(arxiv_ids_to_be_added) - unique_arxiv_ids)
-def _is_arxiv_id_valid(arxiv_id):
-  pattern = r"^\d{4}\.\d{5}$"
-  return bool(re.match(pattern, arxiv_id))
-def _get_valid_arxiv_ids(arxiv_ids_str):
-    valid_arxiv_ids = []
-    invalid_arxiv_ids = []
-    for arxiv_id in arxiv_ids_str.split(","):
-        arxiv_id = arxiv_id.strip()
-        if _is_arxiv_id_valid(arxiv_id):
-           valid_arxiv_ids.append(arxiv_id)
-        else:
-            invalid_arxiv_ids.append(arxiv_id)
-    return valid_arxiv_ids, invalid_arxiv_ids
-def add_arxiv_ids_to_queue(queue, arxiv_ids_str):
-    print(0)
-    valid_arxiv_ids, invalid_arxiv_ids = _get_valid_arxiv_ids(arxiv_ids_str)
-    print("01")
-    if len(invalid_arxiv_ids) > 0:
-        gr.Warning(f"found invalid arXiv ids as in {invalid_arxiv_ids}")
-    if len(valid_arxiv_ids) > 0:
-        valid_arxiv_ids = _filter_duplicate_arxiv_ids(valid_arxiv_ids)
-        if len(valid_arxiv_ids) > 0:
-            valid_arxiv_ids = [[arxiv_id] for arxiv_id in valid_arxiv_ids]
-            gr.Warning(f"Processing on [{valid_arxiv_ids}]. Other requested arXiv IDs not found on this list should be already processed or being processed...")
-            valid_arxiv_ids = pd.DataFrame({'Requested arXiv IDs': valid_arxiv_ids})
-            queue = pd.concat([queue, valid_arxiv_ids])
-            queue.reset_index(drop=True)
-            push_to_hf_hub(valid_arxiv_ids, request_arxiv_repo_id, hf_token)
-        else:
-            gr.Warning(f"All requested arXiv IDs are already processed or being processed...")
-    else:
-        gr.Warning(f"No valid arXiv IDs found...")
-    return (
-        queue, gr.Textbox("")
-    )
-def get_paper_by_year(y):
-    m = sorted(date_dict[y].keys())
-    last_m = m[-1]
-    d = sorted(date_dict[y][last_m].keys())
-    last_d = d[-1]
-    papers = [paper["title"] for paper in date_dict[y][last_m][last_d]]
-    papers = list(set(papers))
-    return (
-        gr.Dropdown(choices=m, value=last_m),
-        gr.Dropdown(choices=d, value=last_d),
-        gr.Dropdown(choices=papers, value=papers[0])
-    )
-def get_paper_by_month(y, m):
-    d = sorted(date_dict[y][m].keys())
-    last_d = d[-1]
-    papers = [paper["title"] for paper in date_dict[y][m][last_d]]
-    papers = list(set(papers))
-    return (
-        gr.Dropdown(choices=d, value=last_d),
-        gr.Dropdown(choices=papers, value=papers[0])
-    )
-def get_paper_by_day(y, m, d):
-    papers = [paper["title"] for paper in date_dict[y][m][d]]
-    papers = list(set(papers))
-    return gr.Dropdown(choices=papers, value=papers[0])
-def set_paper(y, m, d, paper_title):
-    selected_paper = None
-    for paper in date_dict[y][m][d]:
-        if paper["title"] == paper_title:
-            selected_paper = paper
-            break
-    return (
-        gr.Markdown(f"# {selected_paper['title']}"),
-        gr.Markdown(
-            "[![arXiv](https://img.shields.io/badge/arXiv-%s-b31b1b.svg)](https://arxiv.org/abs/%s)" % (selected_paper['arxiv_id'], selected_paper['arxiv_id'])
-        ),
-        gr.Markdown(
-            "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/%s)" % selected_paper['arxiv_id']
-        ),
-        gr.Markdown(selected_paper["summary"]),
-        gr.Markdown(f"### 🙋 {selected_paper['0_question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['0_answers:expert']}"),
-        gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_depth_q:follow up question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}"),
-        gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_breath_q:follow up question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}"),
-        gr.Markdown(f"### 🙋 {selected_paper['1_question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['1_answers:expert']}"),
-        gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_depth_q:follow up question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}"),
-        gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_breath_q:follow up question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}"),
-        gr.Markdown(f"### 🙋 {selected_paper['2_question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['2_answers:expert']}"),
-        gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_depth_q:follow up question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}"),
-        gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_breath_q:follow up question']}"),
-        gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}"),
-        gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}"),
-    )
-def change_exp_type(exp_type):
-    if exp_type == "ELI5":
-        return (
-            gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False),
-            gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False),
-            gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False),
-        )
-    else:
-        return (
-            gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True),
-            gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True),
-            gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True),
-        )
-def search(search_in, max_results=3):
-    results = []
-    for title in titles:
-        if len(results) > 3:
-            break
-        else:
-            if search_in in title:
-                results.append(title)
-    return (
-        gr.Textbox(
-            visible=True if len(results) > 0 else False,
-            value=results[0] if len(results) > 0 else ""
-        ),
-        gr.Textbox(
-            visible=True if len(results) > 1 else False,
-            value=results[1] if len(results) > 1 else ""
-        ),
-        gr.Textbox(
-            visible=True if len(results) > 2 else False,
-            value=results[2] if len(results) > 2 else ""
-        )
-    )
-def set_date(title):
-    for _, (year, months) in enumerate(date_dict.items()):
-        for _, (month, days) in enumerate(months.items()):
-            for _, (day, papers) in enumerate(days.items()):
-                for paper in papers:
-                    if paper['title'] == title:
-                        return (
-                            gr.Dropdown(value=year),
-                            gr.Dropdown(choices=sorted(months), value=month),
-                            gr.Dropdown(choices=sorted(days), value=day),
-                        )
-def set_papers(y, m, d, title):
-    papers = [paper["title"] for paper in date_dict[y][m][d]]
-    papers = list(set(papers))
-    return (
-        gr.Dropdown(choices=papers, value=title),
-        gr.Textbox("")
-    )
-with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
     gr.Markdown("# Let's explore papers with auto generated Q&As")
     with gr.Column(elem_id="control-panel", elem_classes=["group"]):
@@ -410,25 +83,22 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
             search_r9 = gr.Button(visible=False, elem_id="search_r9", elem_classes=["no-radius"])
             search_r10 = gr.Button(visible=False, elem_id="search_r10", elem_classes=["no-radius"])
-        conv_type = gr.Radio(choices=["Q&As", "Chat"], value="Q&As", interactive=True, visible=False, elem_classes=["conv-type"])
     with gr.Column(scale=7):
-        title = gr.Markdown(f"# {selected_paper['title']}")
         # with gr.Row():
         with gr.Row():
             arxiv_link = gr.Markdown(
-                "[![arXiv](https://img.shields.io/badge/arXiv-%s-b31b1b.svg)](https://arxiv.org/abs/%s)" % (selected_paper['arxiv_id'], selected_paper['arxiv_id'])
             )
             hf_paper_link = gr.Markdown(
-                "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-md.svg)](https://huggingface.co/papers/%s)" % selected_paper['arxiv_id']
             )
-            gr.Button("Chat about the paper", interactive=False)
         summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])
-        with gr.Column(elem_id="chat_block", visible=False):
-            gr.Chatbot([("hello", "world"), ("how", "are you?")])
         with gr.Column(elem_id="qna_block", visible=True):
             with gr.Row():
                 with gr.Column(scale=7):
@@ -489,7 +159,7 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
             headers=["Requested arXiv IDs"], col_count=(1, "fixed"),
             value=requested_arxiv_ids_df,
             datatype=["str"],
-            interactive=False
         )
         arxiv_id_enter = gr.Textbox(placeholder="Enter comma separated arXiv IDs...", elem_classes=["textbox-no-label"])
@@ -508,72 +178,68 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
     search_r1.click(set_date, search_r1, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r1],
-        outputs=[papers_dd, search_in]
     )
     search_r2.click(set_date, search_r2, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r2],
-        outputs=[papers_dd, search_in]
     )
     search_r3.click(set_date, search_r3, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r3],
-        outputs=[papers_dd, search_in]
     )
     search_r4.click(set_date, search_r4, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r4],
-        outputs=[papers_dd, search_in]
     )
     search_r5.click(set_date, search_r5, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r5],
-        outputs=[papers_dd, search_in]
     )
     search_r6.click(set_date, search_r6, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r6],
-        outputs=[papers_dd, search_in]
     )
     search_r7.click(set_date, search_r7, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r7],
-        outputs=[papers_dd, search_in]
     )
     search_r8.click(set_date, search_r8, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r8],
-        outputs=[papers_dd, search_in]
     )
     search_r9.click(set_date, search_r9, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r9],
-        outputs=[papers_dd, search_in]
     )
     search_r10.click(set_date, search_r10, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r10],
-        outputs=[papers_dd, search_in]
     )
-    year_dd.input(
-        get_paper_by_year,
-        inputs=[year_dd],
-        outputs=[month_dd, day_dd, papers_dd]
-    ).then(
-        set_paper,
-        [year_dd, month_dd, day_dd, papers_dd],
         [
-            title, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
             breath_q_0, breath_q_eli5_0, breath_q_expert_0,
@@ -588,14 +254,10 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
         ]
     )
-    month_dd.input(
-        get_paper_by_month,
-        inputs=[year_dd, month_dd],
-        outputs=[day_dd, papers_dd]
-    ).then(
-        set_paper,
-        [year_dd, month_dd, day_dd, papers_dd],
         [
             title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
@@ -611,14 +273,10 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
         ]
     )
-    day_dd.input(
-        get_paper_by_day,
-        inputs=[year_dd, month_dd, day_dd],
-        outputs=[papers_dd]
-    ).then(
-        set_paper,
-        [year_dd, month_dd, day_dd, papers_dd],
         [
             title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
@@ -634,10 +292,9 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
         ]
     )
-    papers_dd.change(
-        set_paper,
-        [year_dd, month_dd, day_dd, papers_dd],
         [
             title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
@@ -672,14 +329,35 @@ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
             basic_q_eli5_2, basic_q_expert_2, depth_q_eli5_2, depth_q_expert_2, breath_q_eli5_2, breath_q_expert_2
         ]
     )
-    conv_type.select(
-        inputs=[conv_type],
-        js=UPDATE_IF_TYPE,
-        outputs=None,
-        fn=None
     )
 start_date = datetime.now() + timedelta(minutes=1)
 scheduler = BackgroundScheduler()
 scheduler.add_job(
@@ -690,7 +368,8 @@ scheduler.add_job(
         gemini_api_key,
         dataset_repo_id,
         request_arxiv_repo_id,
-        hf_token
     ],
     start_date=start_date
 )

 import gradio as gr
+from init import get_secrets, initialize_data, update_dataframe
+from gen.openllm import GradioLLaMA2ChatPPManager, GradioMistralChatPPManager
+from gen.gemini_chat import GradioGeminiChatPPManager
 from constants.styles import STYLE
+from constants.js import (
+    UPDATE_SEARCH_RESULTS, OPEN_CHAT_IF,
+    CLOSE_CHAT_IF, UPDATE_CHAT_HISTORY
+)
+from datetime import datetime, timedelta
+from background import process_arxiv_ids
 from apscheduler.schedulers.background import BackgroundScheduler
+gemini_api_key, hf_token, dataset_repo_id, request_arxiv_repo_id, restart_repo_id = get_secrets()
+titles, date_dict, requested_arxiv_ids_df, arxivid2data = initialize_data(dataset_repo_id, request_arxiv_repo_id)
+from ui import (
+    get_paper_by_year, get_paper_by_month, get_paper_by_day,
+    set_papers, set_paper, set_date, change_exp_type, add_arxiv_ids_to_queue,
+    before_chat_begin, chat_stream, chat_reset
+)
 sorted_year = sorted(date_dict.keys())
 last_year = sorted_year[-1]
 last_papers = date_dict[last_year][last_month][last_day]
 selected_paper = last_papers[0]
+with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
+    cur_arxiv_id = gr.Textbox(selected_paper['arxiv_id'], visible=False)
+    local_data = gr.JSON({}, visible=False)
+    chat_state = gr.State({
+        "ppmanager_type": GradioGeminiChatPPManager # GradioMistralChatPPManager # GradioLLaMA2ChatPPManager
+    })
+    with gr.Column(elem_id="chatbot-back"):
+        with gr.Column(elem_id="chatbot", elem_classes=["hover-opacity"]):
+            close = gr.Button("𝕏", elem_id="chatbot-right-button") #elem_id="chatbot-right-button")
+            chatbot = gr.Chatbot(
+                label="Gemini 1.0 Pro", show_label=True,
+                show_copy_button=True, show_share_button=True,
+                visible=True, elem_id="chatbot-inside"
+            )
+            with gr.Row(elem_id="chatbot-bottm"):
+                reset = gr.Button("🗑️ Reset")
+                regen = gr.Button("🔄 Regenerate", visible=False)
+            prompt_txtbox = gr.Textbox(placeholder="Ask anything.....", elem_id="chatbot-txtbox", elem_classes=["textbox-no-label"])
     gr.Markdown("# Let's explore papers with auto generated Q&As")
     with gr.Column(elem_id="control-panel", elem_classes=["group"]):
             search_r9 = gr.Button(visible=False, elem_id="search_r9", elem_classes=["no-radius"])
             search_r10 = gr.Button(visible=False, elem_id="search_r10", elem_classes=["no-radius"])
     with gr.Column(scale=7):
+        title = gr.Markdown(f"# {selected_paper['title']}", elem_classes=["markdown-center"])
         # with gr.Row():
         with gr.Row():
             arxiv_link = gr.Markdown(
+                "[![arXiv](https://img.shields.io/badge/arXiv-%s-b31b1b.svg?style=for-the-badge)](https://arxiv.org/abs/%s)" % (selected_paper['arxiv_id'], selected_paper['arxiv_id']),
+                elem_classes=["markdown-center"]
             )
             hf_paper_link = gr.Markdown(
+                "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-lg.svg)](https://huggingface.co/papers/%s)" % selected_paper['arxiv_id'],
+                elem_classes=["markdown-center"]
             )
+            chat_button = gr.Button("💬 about paper", interactive=True, elem_id="chat-button")
         summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])
         with gr.Column(elem_id="qna_block", visible=True):
             with gr.Row():
                 with gr.Column(scale=7):
             headers=["Requested arXiv IDs"], col_count=(1, "fixed"),
             value=requested_arxiv_ids_df,
             datatype=["str"],
+            interactive=False,
         )
         arxiv_id_enter = gr.Textbox(placeholder="Enter comma separated arXiv IDs...", elem_classes=["textbox-no-label"])
     search_r1.click(set_date, search_r1, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r1],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r2.click(set_date, search_r2, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r2],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r3.click(set_date, search_r3, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r3],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r4.click(set_date, search_r4, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r4],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r5.click(set_date, search_r5, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r5],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r6.click(set_date, search_r6, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r6],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r7.click(set_date, search_r7, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r7],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r8.click(set_date, search_r8, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r8],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r9.click(set_date, search_r9, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r9],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
     search_r10.click(set_date, search_r10, [year_dd, month_dd, day_dd]).then(
         set_papers,
         inputs=[year_dd, month_dd, day_dd, search_r10],
+        outputs=[cur_arxiv_id, papers_dd, search_in]
     )
+    year_dd.input(get_paper_by_year, inputs=[year_dd], outputs=[month_dd, day_dd, papers_dd]).then(
+        set_paper, [year_dd, month_dd, day_dd, papers_dd],
         [
+            cur_arxiv_id,
+            title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
             breath_q_0, breath_q_eli5_0, breath_q_expert_0,
         ]
     )
+    month_dd.input(get_paper_by_month, inputs=[year_dd, month_dd], outputs=[day_dd, papers_dd]).then(
+        set_paper, [year_dd, month_dd, day_dd, papers_dd],
         [
+            cur_arxiv_id,
             title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
         ]
     )
+    day_dd.input(get_paper_by_day, inputs=[year_dd, month_dd, day_dd], outputs=[papers_dd]).then(
+        set_paper, [year_dd, month_dd, day_dd, papers_dd],
         [
+            cur_arxiv_id,
             title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
         ]
     )
+    papers_dd.change(set_paper, [year_dd, month_dd, day_dd, papers_dd],
         [
+            cur_arxiv_id,
             title, arxiv_link, hf_paper_link, summary,
             basic_q_0, basic_q_eli5_0, basic_q_expert_0,
             depth_q_0, depth_q_eli5_0, depth_q_expert_0,
             basic_q_eli5_2, basic_q_expert_2, depth_q_eli5_2, depth_q_expert_2, breath_q_eli5_2, breath_q_expert_2
         ]
     )
+    chat_button.click(None, [cur_arxiv_id], [local_data, chatbot], js=OPEN_CHAT_IF)
+    close.click(None, None, None,js=CLOSE_CHAT_IF)
+    prompt_txtbox.submit(
+        before_chat_begin, None, [close, reset, regen]
+    ).then(
+        chat_stream,
+        [cur_arxiv_id, local_data, prompt_txtbox, chat_state],
+        [prompt_txtbox, chatbot, local_data, close, reset, regen]
+    ).then(
+        None, [cur_arxiv_id, local_data], None,
+        js=UPDATE_CHAT_HISTORY
+    )
+    reset.click(
+        before_chat_begin, None, [close, reset, regen]
+    ).then(
+        chat_reset,
+        [local_data, chat_state],
+        [prompt_txtbox, chatbot, local_data, close, reset, regen]
+    ).then(
+        None, [cur_arxiv_id, local_data], None,
+        js=UPDATE_CHAT_HISTORY
     )
+    demo.load(lambda: update_dataframe(request_arxiv_repo_id), None, arxiv_queue, every=180)
+    # demo.load(None, None, [chatbot, local_data], js=GET_LOCAL_STORAGE % idx.value)
 start_date = datetime.now() + timedelta(minutes=1)
 scheduler = BackgroundScheduler()
 scheduler.add_job(
         gemini_api_key,
         dataset_repo_id,
         request_arxiv_repo_id,
+        hf_token,
+        restart_repo_id
     ],
     start_date=start_date
 )

background.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import datasets
+import pandas as pd
+from huggingface_hub import HfApi
+from utils import push_to_hf_hub
+from paper.download import download_pdf_from_arxiv
+from paper.download import get_papers_from_arxiv_ids
+from paper.parser import extract_text_and_figures
+from gen.gemini import get_basic_qa, get_deep_qa
+def _filter_function(example, ids):
+    """Remove the given arXiv IDs from one row of the request dataset.
+
+    For each entry of ``ids`` that is present in
+    ``example['Requested arXiv IDs']``, its first occurrence is removed from
+    that list in place and the list is written back to the row. The row is
+    printed and then returned.
+    """
+    column = 'Requested arXiv IDs'
+    remaining = example[column]
+    for target_id in ids:
+        if target_id not in remaining:
+            continue
+        remaining.remove(target_id)
+        example[column] = remaining
+    print(example)
+    return example
+def process_arxiv_ids(gemini_api, hf_repo_id, req_hf_repo_id, hf_token, restart_repo_id, how_many=10):
+    """Generate Q&As for requested arXiv papers and publish them to the Hub.
+
+    Loads the request dataset, gathers requested IDs (rows whose first entry
+    is ``"top"`` are skipped) and keeps at most ``how_many`` of them. For each
+    paper it downloads the PDF, extracts the text, generates the basic and
+    follow-up Q&As, pushes the record to ``hf_repo_id``, and pushes the
+    request dataset back to ``req_hf_repo_id`` with that ID removed. A failure
+    on one paper is printed and the loop continues with the next paper. When
+    at least one ID was queued, the Space ``restart_repo_id`` is restarted at
+    the end; when nothing was queued the function returns without doing so.
+
+    Args:
+        gemini_api: API key forwarded to ``get_basic_qa`` / ``get_deep_qa``.
+        hf_repo_id: Dataset repo that receives the generated Q&A records.
+        req_hf_repo_id: Dataset repo holding the requested arXiv IDs.
+        hf_token: Hugging Face token used for pushes and the Space restart.
+        restart_repo_id: Repo id of the Space to restart afterwards.
+        how_many: Maximum number of requested IDs handled in one call.
+    """
+    request_column = 'Requested arXiv IDs'
+    ds1 = datasets.load_dataset(req_hf_repo_id)
+    # Track the 'train' split in its own variable. Rebinding ``ds1`` to the
+    # filtered split made ``ds1['train']`` fail from the second paper on, so
+    # later papers were published but never removed from the request queue.
+    pending_requests = ds1['train']
+
+    arxiv_ids = []
+    for d in pending_requests:
+        req_arxiv_ids = d[request_column]
+        if len(req_arxiv_ids) > 0 and req_arxiv_ids[0] != "top":
+            arxiv_ids = arxiv_ids + req_arxiv_ids
+    arxiv_ids = arxiv_ids[:how_many]
+
+    if not arxiv_ids:
+        return
+
+    print(f"1. Get metadata for the papers [{arxiv_ids}]")
+    papers = get_papers_from_arxiv_ids(arxiv_ids)
+    print("...DONE")
+    print("2. Generating QAs for the paper")
+    for paper in papers:
+        # Best-effort per paper: one failure must not stop the whole batch.
+        try:
+            title = paper['title']
+            target_date = paper['target_date']
+            abstract = paper['paper']['summary']
+            arxiv_id = paper['paper']['id']
+            authors = paper['paper']['authors']
+            print(f"...PROCESSING ON[{arxiv_id}, {title}]")
+            print(f"......Downloading the paper PDF")
+            filename = download_pdf_from_arxiv(arxiv_id)
+            print(f"......DONE")
+            print(f"......Extracting text and figures")
+            texts, figures = extract_text_and_figures(filename)
+            text =' '.join(texts)
+            print(f"......DONE")
+            print(f"......Generating the seed(basic) QAs")
+            qnas = get_basic_qa(text, gemini_api_key=gemini_api, trucate=30000)
+            qnas['title'] = title
+            qnas['abstract'] = abstract
+            qnas['authors'] = ','.join(authors)
+            qnas['arxiv_id'] = arxiv_id
+            qnas['target_date'] = target_date
+            qnas['full_text'] = text
+            print(f"......DONE")
+            print(f"......Generating the follow-up QAs")
+            qnas = get_deep_qa(text, qnas, gemini_api_key=gemini_api, trucate=30000)
+            del qnas["qna"]
+            print(f"......DONE")
+            print(f"......Exporting to HF Dataset repo at [{hf_repo_id}]")
+            df = pd.DataFrame([qnas])
+            ds = datasets.Dataset.from_pandas(df)
+            ds = ds.cast_column("target_date", datasets.features.Value("timestamp[s]"))
+            push_to_hf_hub(ds, hf_repo_id, hf_token)
+            print(f"......DONE")
+            print(f"......Updating request arXiv HF Dataset repo at [{req_hf_repo_id}]")
+            # Drop the processed ID, then discard rows left with no IDs.
+            pending_requests = pending_requests.map(
+                lambda example: _filter_function(example, [arxiv_id])
+            ).filter(
+                lambda example: len(example[request_column]) > 0
+            )
+            pending_requests.push_to_hub(req_hf_repo_id, token=hf_token)
+            print(f"......DONE")
+        except Exception as e:
+            print(f".......failed due to exception {e}")
+            continue
+    HfApi(token=hf_token).restart_space(
+        repo_id=restart_repo_id, token=hf_token
+    )

constants/context.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# Prompt template for the chat assistant. The behavioural rules come first;
+# the single printf-style %s slot below the dashed separator is where the
+# "text below" that the prompt refers to gets substituted.
+# NOTE(review): presumably filled with the selected paper's text - confirm
+# against the code that formats this template.
+DEFAULT_GLOBAL_CTX = """
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.
+Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
+Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
+If you don't know the answer to a question, please don't share false information.
+Based on the above statement, answer questions based on the text below.
+------------------------------------------------------------------------
+%s
+"""
+# Instruction text explaining the [INST] / [/INST] turn markers.
+# NOTE(review): no use of this name is visible here - confirm where (or
+# whether) it is added to a prompt.
+placeholder = "In each conversation, question is placed after [INST] while your answer should be placed after [/INST]. By looking [INST] and [/INST], you must consider multi-turn conversations."

constants/js.py CHANGED Viewed

@@ -83,14 +83,113 @@ function search(searchIn, maxResults = 3) {{
 }}
 """
-UPDATE_IF_TYPE = f"""
-function chage_if_type(if_type) {{
-    if (if_type == 'Q&As') {{
-        document.getElementById('chat_block').style.display = 'none';
-        document.getElementById('qna_block').style.display = 'block';
-    }} else {{
-        document.getElementById('chat_block').style.display = 'block';
-        document.getElementById('qna_block').style.display = 'none';
-    }}
-}}
 """

 }}
 """
+# JavaScript snippet that displays the #chatbot-back element as a block and
+# hides the #qna_block element.
+UPDATE_IF_TYPE = """
+function chage_if_type() {
+    document.querySelector("#chatbot-back").style.display = 'block';
+    document.getElementById('qna_block').style.display = 'none';
+}
+"""
+#   globalThis.setStorage = (key, value)=>{
+#     localStorage.setItem(key, JSON.stringify(value));
+#   }
+#   globalThis.getStorage = (key, value)=>{
+#     return JSON.parse(localStorage.getItem(key));
+#   }
+OPEN_CHAT_IF = """
+function (arXivId) {
+    var localData = localStorage.getItem('localData');
+    if (!localData) {
+      localData = {}; // Initialize if it doesn't exist
+    }
+    else {
+      localData = JSON.parse(localData);
+    }
+    if (!localData[arXivId]) {
+      localData[arXivId] = { ctx: '', pingpongs: [] };
+    }
+    localStorage.setItem('localData', JSON.stringify(localData));
+    document.querySelector("#chatbot-back").classList.add("visible");
+    pingpongs = [];
+    localData[arXivId]['pingpongs'].forEach(element =>{
+      pingpongs.push([element.ping, element.pong]);
+    });
+    return [localData[arXivId], pingpongs];
+}
+"""
+# JavaScript snippet that removes the "visible" class from #chatbot-back after
+# a 100 ms timeout.
+CLOSE_CHAT_IF = """
+function close() {
+    setTimeout(function() {
+    document.querySelector("#chatbot-back").classList.remove("visible"); // Remove after a slight delay
+    }, 100); // 100-millisecond delay
+}
+"""
+UPDATE_CHAT_HISTORY = """
+function (arXivId, data) {
+    console.log(arXivId)
+    console.log(data);
+    if (localStorage.getItem('localData') === null) {
+        localStorage['localData'] = {};
+    }
+    var localData = localStorage.getItem('localData');
+    localData = JSON.parse(localData);
+    localData[arXivId] = data;
+    console.log(localData[arXivId]);
+    localStorage.setItem('localData', JSON.stringify(localData));
+}
+"""
+GET_LOCAL_STORAGE = """
+function() {
+  globalThis.setStorage = (arXivId, value) => {
+    console.log(value);
+    if (localStorage.getItem('localData') === null) {
+        localStorage['localData'] = {};
+    }
+    var localData = localStorage.getItem('localData');
+    localData = JSON.parse(localData);
+    localData[arXivId] = value;
+    console.log(localData[arXivId]);
+    localStorage.setItem('localData', JSON.stringify(localData));
+  }
+  globalThis.getStorage = (arXivId)=>{
+    var localData = localStorage.getItem('localData');
+    console.log(localData);
+    if (!localData) {
+      localData = {}; // Initialize if it doesn't exist
+    }
+    else {
+      localData = JSON.parse(localData);
+    }
+    if (!localData[arXivId]) {
+      localData[arXivId] = { ctx: '', pingpongs: [] };
+    }
+    localStorage.setItem('localData', JSON.stringify(localData));
+    console.log(localData[arXivId]['pingpongs']);
+    return [localData[arXivId], localData[arXivId]['pingpongs']];
+  }
+  var localData = localStorage.getItem('localData');
+  if(!localData) {
+    localData = {}
+    localStorage.setItem('localData', JSON.stringify(localData));
+  }
+  return [localData['%s']['pingpongs'], localData];
+}
 """

constants/styles.py CHANGED Viewed

@@ -1,7 +1,7 @@
 STYLE = """
-@media only screen and (min-width: 700px) {
-    .main {
         width: 70% !important;
         margin: 0 auto; /* Center the container */
     }
@@ -76,4 +76,99 @@ h3 {
 #control-panel {
     margin-bottom: 30px;
 }
 """

 STYLE = """
+.main {
+    @media only screen and (min-width: 1000px) {
         width: 70% !important;
         margin: 0 auto; /* Center the container */
     }
 #control-panel {
     margin-bottom: 30px;
 }
+#chatbot {
+    background-color: white;
+    border: 1px solid #ccc;
+    padding: 20px;
+    box-shadow: 0px 5px 5px rgba(0, 0, 0, 0.3);
+    border-radius: 30px;
+    height: 80%;
+    width: 80%;
+    position: fixed;
+    top: 50%;
+    left: 50%;
+    transform: translate(-50%, -50%);
+    z-index: 1000; /* Or a high enough value to stay on top */
+    @media (max-width: 768px) { /* Adjust this breakpoint as needed */
+        width: 95%;
+    }
+    @media (prefers-color-scheme: dark) {
+        background-color: dimgrey;
+    }
+}
+#chat-button {
+    border-radius: 40px;
+    padding: 0px;
+    margin: 0px;
+    margin-left: 30px;
+    margin-right: 30px;
+    font-size: 13pt !important;
+    @media only screen and (min-width: 500px) {
+        font-size: 10pt;
+        margin: 0 auto; /* Center the container */
+    }
+}
+#chatbot-inside {
+    height: 100% !important;
+    border-width: 1px !important;
+    border-color: lightgray !important;
+}
+#chatbot-txtbox {
+    padding-bottom: 25px;
+}
+#chatbot-bottm {
+    padding-left: 10px;
+    padding-right: 10px;
+}
+#chatbot-right-button {
+    float: right;
+    width: 20px;
+    font-size: 17pt;
+}
+#chatbot-info {
+    word-break: break-word;
+}
+#chatbot-back {
+    position: absolute; /* Stay in place even when scrolling */
+    z-index: 1000; /* Ensure it's on top of everything else */
+    width: 100%;
+    height: 100%;
+    left: 0px;
+    top: 0px;
+    opacity: 0;
+    visibility: hidden; /* Ensures the element is not interactive */
+    transition: opacity 0.5s ease, visibility 0s 0.5s; /* Transition for opacity and delay visibility */
+}
+#chatbot-back.visible {
+    opacity: 1;
+    visibility: visible; /* Now visible and interactive */
+    transition: opacity 0.5s ease; /* Smooth transition for opacity */
+}
+.hover-opacity {
+  opacity: 0.8;  /* Normal opacity of the element */
+  transition: opacity 0.3s ease-in-out;  /* Smooth opacity change */
+}
+.hover-opacity:hover {
+  opacity: 1;  /* Full opacity on hover */
+}
+.markdown-center {
+    text-align: -webkit-center;
+}
 """

gen/gemini.py CHANGED Viewed

@@ -69,7 +69,7 @@ def call_gemini(prompt="", API_KEY=None, given_text=None, given_image=None, gene
     response = model.generate_content(prompt_parts)
     return response.text
-def try_out(prompt, given_text, gemini_api_key, given_image=None, retry_num=5):
     qna_json = None
     cur_retry = 0

     response = model.generate_content(prompt_parts)
     return response.text
+def try_out(prompt, given_text, gemini_api_key, given_image=None, retry_num=10):
     qna_json = None
     cur_retry = 0

gen/gemini_chat.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import copy
+import asyncio
+import google.generativeai as genai
+from pingpong import PingPong
+from pingpong.pingpong import PPManager
+from pingpong.pingpong import PromptFmt
+from pingpong.pingpong import UIFmt
+from pingpong.gradio import GradioChatUIFmt
+class GeminiChatPromptFmt(PromptFmt):
+    @classmethod
+    def ctx(cls, context):
+        if context is None or context == "":
+            return None
+        else:
+            return  {
+                "role": "system",
+                "parts": [context]
+            }
+    @classmethod
+    def prompt(cls, pingpong, truncate_size):
+        ping = pingpong.ping[:truncate_size]
+        pong = "" if pingpong.pong is None else pingpong.pong[:truncate_size]
+        result = [
+            {
+                "role": "user",
+                "parts": [ping]
+            }
+        ]
+        if pong != "":
+            result = result + [
+                {
+                    "role": "model",
+                    "parts": [pong]
+                }
+            ]
+        return result
+class GeminiChatPPManager(PPManager):
+    def build_prompts(self, from_idx: int=0, to_idx: int=-1, fmt: PromptFmt=GeminiChatPromptFmt, truncate_size: int=None):
+        if to_idx == -1 or to_idx >= len(self.pingpongs):
+            to_idx = len(self.pingpongs)
+        pingpongs = copy.deepcopy(self.pingpongs)
+        ctx = fmt.ctx(self.ctx)
+        ctx = ctx['parts'][0] if ctx is not None else ""
+        results = []
+        for idx, pingpong in enumerate(pingpongs[from_idx:to_idx]):
+            if idx == 0:
+                pingpong.ping = f"SYSTEM: {ctx} ----------- \n" + pingpong.ping
+            results += fmt.prompt(pingpong, truncate_size=truncate_size)
+        return results
+class GradioGeminiChatPPManager(GeminiChatPPManager):
+    def build_uis(self, from_idx: int=0, to_idx: int=-1, fmt: UIFmt=GradioChatUIFmt):
+        if to_idx == -1 or to_idx >= len(self.pingpongs):
+            to_idx = len(self.pingpongs)
+        results = []
+        for pingpong in self.pingpongs[from_idx:to_idx]:
+            results.append(fmt.ui(pingpong))
+        return results
+def init(api_key):
+    """Configure the ``google.generativeai`` module with *api_key*."""
+    genai.configure(api_key=api_key)
+def _default_gen_text():
+    return {
+        "temperature": 0.9,
+        "top_p": 1,
+        "top_k": 1,
+        "max_output_tokens": 2048,
+    }
+def _default_safety_settings():
+    return [
+        {
+            "category": "HARM_CATEGORY_HARASSMENT",
+            "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+        },
+        {
+            "category": "HARM_CATEGORY_HATE_SPEECH",
+            "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+        },
+        {
+            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+            "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+        },
+        {
+            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+            "threshold": "BLOCK_MEDIUM_AND_ABOVE"
+        },
+    ]
+async def _word_generator(sentence):
+    for word in sentence.split():
+        yield word
+        delay = 0.03 + (len(word) * 0.005)
+        await asyncio.sleep(delay)  # Simulate a short delay
+async def gen_text(
+    prompts,
+    gen_config=_default_gen_text(),
+    safety_settings=_default_safety_settings(),
+    stream=True
+):
+    model = genai.GenerativeModel(model_name="gemini-1.0-pro",
+                                generation_config=gen_config,
+                                safety_settings=safety_settings)
+    user_prompt = prompts[-1]
+    prompts = prompts[:-1]
+    convo = model.start_chat(history=prompts)
+    resps = await convo.send_message_async(
+        user_prompt["parts"][0], stream=stream
+    )
+    async for resp in resps:
+        async for word in _word_generator(resp.text):
+            yield word + " "

gen/openllm.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import os
+import json
+import requests
+import sseclient
+from pingpong import PingPong
+from pingpong.pingpong import PPManager
+from pingpong.pingpong import PromptFmt
+from pingpong.pingpong import UIFmt
+from pingpong.gradio import GradioChatUIFmt
+class MistralChatPromptFmt(PromptFmt):
+    @classmethod
+    def ctx(cls, context):
+        if context is None or context == "":
+            return ""
+        else:
+            return f"""{context}
+"""
+    @classmethod
+    def prompt(cls, pingpong, truncate_size):
+        ping = pingpong.ping[:truncate_size]
+        pong = "" if pingpong.pong is None else pingpong.pong[:truncate_size] + "</s>"
+        return f"""<s>[INST] {ping} [/INST] {pong}
+"""
+class MistralChatPPManager(PPManager):
+    def build_prompts(self, from_idx: int=0, to_idx: int=-1, fmt: PromptFmt=MistralChatPromptFmt, truncate_size: int=None):
+        if to_idx == -1 or to_idx >= len(self.pingpongs):
+            to_idx = len(self.pingpongs)
+        results = fmt.ctx(self.ctx)
+        for idx, pingpong in enumerate(self.pingpongs[from_idx:to_idx]):
+            results += fmt.prompt(pingpong, truncate_size=truncate_size)
+        return results
+class GradioMistralChatPPManager(MistralChatPPManager):
+    def build_uis(self, from_idx: int=0, to_idx: int=-1, fmt: UIFmt=GradioChatUIFmt):
+        if to_idx == -1 or to_idx >= len(self.pingpongs):
+            to_idx = len(self.pingpongs)
+        results = []
+        for pingpong in self.pingpongs[from_idx:to_idx]:
+            results.append(fmt.ui(pingpong))
+        return results
+class LLaMA2ChatPromptFmt(PromptFmt):
+    @classmethod
+    def ctx(cls, context):
+        if context is None or context == "":
+            return ""
+        else:
+            return f"""<<SYS>>
+{context}
+<</SYS>>
+"""
+    @classmethod
+    def prompt(cls, pingpong, truncate_size):
+        ping = pingpong.ping[:truncate_size]
+        pong = "" if pingpong.pong is None else pingpong.pong[:truncate_size]
+        return f"""[INST] {ping} [/INST] {pong}"""
+class LLaMA2ChatPPManager(PPManager):
+    def build_prompts(self, from_idx: int=0, to_idx: int=-1, fmt: PromptFmt=LLaMA2ChatPromptFmt, truncate_size: int=None):
+        if to_idx == -1 or to_idx >= len(self.pingpongs):
+            to_idx = len(self.pingpongs)
+        results = fmt.ctx(self.ctx)
+        for idx, pingpong in enumerate(self.pingpongs[from_idx:to_idx]):
+            results += fmt.prompt(pingpong, truncate_size=truncate_size)
+        return results
+class GradioLLaMA2ChatPPManager(LLaMA2ChatPPManager):
+    def build_uis(self, from_idx: int=0, to_idx: int=-1, fmt: UIFmt=GradioChatUIFmt):
+        if to_idx == -1 or to_idx >= len(self.pingpongs):
+            to_idx = len(self.pingpongs)
+        results = []
+        for pingpong in self.pingpongs[from_idx:to_idx]:
+            results.append(fmt.ui(pingpong))
+        return results
+async def gen_text(
+    prompt,
+    hf_model='mistralai/Mistral-7B-Instruct-v0.2', # 'mistralai/Mixtral-8x7B-Instruct-v0.1', # 'mistralai/Mistral-7B-Instruct-v0.1', # 'meta-llama/Llama-2-70b-chat-hf',
+    hf_token=None,
+    parameters=None
+):
+  if hf_token is None:
+    raise ValueError("Hugging Face Token is not set")
+  if parameters is None:
+    parameters = {
+        'max_new_tokens': 512,
+        'do_sample': True,
+        'return_full_text': False,
+        'temperature': 1.0,
+        'top_k': 50,
+        # 'top_p': 1.0,
+        'repetition_penalty': 1.2
+    }
+  url = f'https://api-inference.huggingface.co/models/{hf_model}'
+  headers={
+      'Authorization': f'Bearer {hf_token}',
+      'Content-type': 'application/json'
+  }
+  data = {
+      'inputs': prompt,
+      'stream': True,
+      'options': {
+          'use_cache': False,
+      },
+      'parameters': parameters
+  }
+  r = requests.post(
+      url,
+      headers=headers,
+      data=json.dumps(data),
+      stream=True
+  )
+  try:
+    client = sseclient.SSEClient(r)
+    for event in client.events():
+        yield json.loads(event.data)['token']['text']
+  except Exception as e:
+      print(e)
+def gen_text_none_stream(
+    prompt,
+    hf_model='meta-llama/Llama-2-70b-chat-hf',
+    hf_token=None,
+):
+    parameters = {
+        'max_new_tokens': 64,
+        'do_sample': True,
+        'return_full_text': False,
+        'temperature': 0.7,
+        'top_k': 10,
+        # 'top_p': 1.0,
+        'repetition_penalty': 1.2
+    }
+    url = f'https://api-inference.huggingface.co/models/{hf_model}'
+    headers={
+        'Authorization': f'Bearer {hf_token}',
+        'Content-type': 'application/json'
+    }
+    data = {
+        'inputs': prompt,
+        'stream': False,
+        'options': {
+            'use_cache': False,
+        },
+        'parameters': parameters
+    }
+    r = requests.post(
+        url,
+        headers=headers,
+        data=json.dumps(data),
+    )
+    return json.loads(r.text)[0]["generated_text"]

init.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import os
+import copy
+import datasets
+import pandas as pd
+from collections import defaultdict
+from datetime import datetime, timedelta
+from background import process_arxiv_ids
+from apscheduler.schedulers.background import BackgroundScheduler
+def _count_nans(row):
+    count = 0
+    for _, (k, v) in enumerate(row.items()):
+        if v is None:
+            count = count + 1
+    return count
+def _initialize_requested_arxiv_ids(request_ds):
+    requested_arxiv_ids = []
+    for request_d in request_ds['train']:
+        arxiv_ids = request_d['Requested arXiv IDs']
+        requested_arxiv_ids = requested_arxiv_ids + arxiv_ids
+    requested_arxiv_ids_df = pd.DataFrame({'Requested arXiv IDs': requested_arxiv_ids})
+    return requested_arxiv_ids_df
+def _initialize_paper_info(source_ds):
+    title2qna, date2qna = {}, {}
+    date_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+    arxivid2data = {}
+    count = 0
+    for data in source_ds["train"]:
+        date = data["target_date"].strftime("%Y-%m-%d")
+        arxiv_id = data["arxiv_id"]
+        if date in date2qna:
+            papers = copy.deepcopy(date2qna[date])
+            for paper in papers:
+                if paper["title"] == data["title"]:
+                    if _count_nans(paper) > _count_nans(data):
+                        date2qna[date].remove(paper)
+            date2qna[date].append(data)
+            del papers
+        else:
+            date2qna[date] = [data]
+    for date in date2qna:
+        year, month, day = date.split("-")
+        papers = date2qna[date]
+        for paper in papers:
+            title2qna[paper["title"]] = paper
+            arxivid2data[paper['arxiv_id']] = {"idx": count, "paper": paper}
+            date_dict[year][month][day].append(paper)
+    titles = title2qna.keys()
+    return titles, date_dict, arxivid2data
+def initialize_data(source_data_repo_id, request_data_repo_id):
+    """Load both datasets and publish the derived lookups as module globals.
+
+    Sets the module-level names ``date_dict``, ``arxivid2data`` and
+    ``requested_arxiv_ids_df`` as a side effect.
+
+    Returns:
+        ``(titles, date_dict, requested_arxiv_ids_df, arxivid2data)``.
+    """
+    global date_dict, arxivid2data
+    global requested_arxiv_ids_df
+    source_ds = datasets.load_dataset(source_data_repo_id)
+    request_ds = datasets.load_dataset(request_data_repo_id)
+    titles, date_dict, arxivid2data = _initialize_paper_info(source_ds)
+    requested_arxiv_ids_df = _initialize_requested_arxiv_ids(request_ds)
+    return (
+        titles, date_dict, requested_arxiv_ids_df, arxivid2data
+    )
+def update_dataframe(request_data_repo_id):
+    request_ds = datasets.load_dataset(request_data_repo_id)
+    return _initialize_requested_arxiv_ids(request_ds)
+def get_secrets():
+    """Read the configuration from environment variables.
+
+    Sets the module-level names ``gemini_api_key``, ``hf_token``,
+    ``dataset_repo_id`` and ``request_arxiv_repo_id`` as a side effect;
+    ``restart_repo_id`` stays local and is only returned.
+
+    Returns:
+        ``(gemini_api_key, hf_token, dataset_repo_id, request_arxiv_repo_id,
+        restart_repo_id)``. A value is None when its variable is unset, except
+        ``restart_repo_id`` which falls back to "chansung/paper_qa".
+    """
+    global gemini_api_key
+    global hf_token
+    global request_arxiv_repo_id
+    global dataset_repo_id
+    gemini_api_key = os.getenv("GEMINI_API_KEY")
+    hf_token = os.getenv("HF_TOKEN")
+    dataset_repo_id = os.getenv("SOURCE_DATA_REPO_ID")
+    request_arxiv_repo_id = os.getenv("REQUEST_DATA_REPO_ID")
+    restart_repo_id = os.getenv("RESTART_TARGET_SPACE_REPO_ID", "chansung/paper_qa")
+    return (
+        gemini_api_key,
+        hf_token,
+        dataset_repo_id,
+        request_arxiv_repo_id,
+        restart_repo_id
+    )

requirements.txt CHANGED Viewed

@@ -1,3 +1,7 @@
 google-generativeai
 pypdf2
 PyMuPDF
@@ -6,4 +10,4 @@ requests
 toml
 datasets
 flatdict
-APScheduler

+bingbong
+sseclient-py
+chromadb
+pydantic-settings
 google-generativeai
 pypdf2
 PyMuPDF
 toml
 datasets
 flatdict
+APScheduler

ui.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import re
+import copy
+import json
+import datasets
+import gradio as gr
+import pandas as pd
+from pingpong import PingPong
+from pingpong.context import CtxLastWindowStrategy
+from gen.openllm import gen_text as open_llm_gen_text
+from gen.gemini_chat import gen_text as gemini_gen_text
+from gen.gemini_chat import init as gemini_init
+from constants.context import DEFAULT_GLOBAL_CTX
+from init import (
+    requested_arxiv_ids_df,
+    date_dict,
+    arxivid2data,
+    request_arxiv_repo_id,
+    hf_token,
+    gemini_api_key
+)
+from utils import push_to_hf_hub
+def get_paper_by_year(year):
+    months = sorted(date_dict[year].keys())
+    last_month = months[-1]
+    days = sorted(date_dict[year][last_month].keys())
+    last_day = days[-1]
+    papers = list(set(
+        [paper["title"] for paper in date_dict[year][last_month][last_day]]
+    ))
+    return (
+        gr.Dropdown(choices=months, value=last_month),
+        gr.Dropdown(choices=days, value=last_day),
+        gr.Dropdown(choices=papers, value=papers[0])
+    )
+def get_paper_by_month(year, month):
+    days = sorted(date_dict[year][month].keys())
+    last_day = days[-1]
+    papers = list(set(
+        [paper["title"] for paper in date_dict[year][month][last_day]]
+    ))
+    return (
+        gr.Dropdown(choices=days, value=last_day),
+        gr.Dropdown(choices=papers, value=papers[0])
+    )
+def get_paper_by_day(year, month, day):
+    papers = list(set(
+        [paper["title"] for paper in date_dict[year][month][day]]
+    ))
+    return gr.Dropdown(choices=papers, value=papers[0])
+def set_papers(year, month, day, title):
+    papers = []
+    for paper in date_dict[year][month][day]:
+        papers.append(paper["title"])
+        if paper["title"] == title:
+            arxiv_id = paper["arxiv_id"]
+    papers = list(set(papers))
+    return (
+        arxiv_id,
+        gr.Dropdown(choices=papers, value=title),
+        gr.Textbox("")
+    )
+def set_paper(year, month, day, paper_title):
+    selected_paper = None
+    for paper in date_dict[year][month][day]:
+        if paper["title"] == paper_title:
+            selected_paper = paper
+            break
+    print(type(selected_paper['arxiv_id']))
+    return (
+        selected_paper['arxiv_id'],
+        gr.Markdown(f"# {selected_paper['title']}"),
+        gr.Markdown(
+            "[![arXiv](https://img.shields.io/badge/arXiv-%s-b31b1b.svg?style=for-the-badge)](https://arxiv.org/abs/%s)" % (selected_paper['arxiv_id'], selected_paper['arxiv_id'])
+        ),
+        gr.Markdown(
+            "[![Paper page](https://huggingface.co/datasets/huggingface/badges/resolve/main/paper-page-lg.svg)](https://huggingface.co/papers/%s)" % selected_paper['arxiv_id']
+        ),
+        gr.Markdown(selected_paper["summary"]),
+        gr.Markdown(f"### 🙋 {selected_paper['0_question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['0_answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_depth_q:follow up question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['0_additional_breath_q:follow up question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋 {selected_paper['1_question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['1_answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_depth_q:follow up question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['1_additional_breath_q:follow up question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋 {selected_paper['2_question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['2_answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_depth_q:follow up question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}"),
+        gr.Markdown(f"### 🙋🙋 {selected_paper['2_additional_breath_q:follow up question']}"),
+        gr.Markdown(f"↪ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}"),
+        gr.Markdown(f"↪ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}"),
+    )
+def set_date(title):
+    for _, (year, months) in enumerate(date_dict.items()):
+        for _, (month, days) in enumerate(months.items()):
+            for _, (day, papers) in enumerate(days.items()):
+                for paper in papers:
+                    if paper['title'] == title:
+                        return (
+                            gr.Dropdown(value=year),
+                            gr.Dropdown(choices=sorted(months), value=month),
+                            gr.Dropdown(choices=sorted(days), value=day),
+                        )
+def change_exp_type(exp_type):
+    if exp_type == "ELI5":
+        return (
+            gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False),
+            gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False),
+            gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False),
+        )
+    else:
+        return (
+            gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True),
+            gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True),
+            gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True), gr.Markdown(visible=False), gr.Markdown(visible=True),
+        )
+def _filter_duplicate_arxiv_ids(arxiv_ids_to_be_added):
+    ds1 = datasets.load_dataset("chansung/requested-arxiv-ids-3")
+    ds2 = datasets.load_dataset("chansung/auto-paper-qa2")
+    unique_arxiv_ids = set()
+    for d in ds1['train']:
+        arxiv_ids = d['Requested arXiv IDs']
+        unique_arxiv_ids = set(list(unique_arxiv_ids) + arxiv_ids)
+    for d in ds2['train']:
+        arxiv_id = d['arxiv_id']
+        unique_arxiv_ids.add(arxiv_id)
+    return list(set(arxiv_ids_to_be_added) - unique_arxiv_ids)
+def _is_arxiv_id_valid(arxiv_id):
+  pattern = r"^\d{4}\.\d{5}$"
+  return bool(re.match(pattern, arxiv_id))
+def _get_valid_arxiv_ids(arxiv_ids_str):
+    valid_arxiv_ids = []
+    invalid_arxiv_ids = []
+    for arxiv_id in arxiv_ids_str.split(","):
+        arxiv_id = arxiv_id.strip()
+        if _is_arxiv_id_valid(arxiv_id):
+           valid_arxiv_ids.append(arxiv_id)
+        else:
+            invalid_arxiv_ids.append(arxiv_id)
+    return valid_arxiv_ids, invalid_arxiv_ids
+def add_arxiv_ids_to_queue(queue, arxiv_ids_str):
+    valid_arxiv_ids, invalid_arxiv_ids = _get_valid_arxiv_ids(arxiv_ids_str)
+    if len(invalid_arxiv_ids) > 0:
+        gr.Warning(f"found invalid arXiv ids as in {invalid_arxiv_ids}")
+    if len(valid_arxiv_ids) > 0:
+        valid_arxiv_ids = _filter_duplicate_arxiv_ids(valid_arxiv_ids)
+        if len(valid_arxiv_ids) > 0:
+            valid_arxiv_ids = [[arxiv_id] for arxiv_id in valid_arxiv_ids]
+            gr.Warning(f"Processing on [{valid_arxiv_ids}]. Other requested arXiv IDs not found on this list should be already processed or being processed...")
+            valid_arxiv_ids = pd.DataFrame({'Requested arXiv IDs': valid_arxiv_ids})
+            queue = pd.concat([queue, valid_arxiv_ids])
+            queue.reset_index(drop=True)
+            ds = datasets.Dataset.from_pandas(valid_arxiv_ids)
+            push_to_hf_hub(ds, request_arxiv_repo_id, hf_token)
+        else:
+            gr.Warning(f"All requested arXiv IDs are already processed or being processed...")
+    else:
+        gr.Warning(f"No valid arXiv IDs found...")
+    return (
+        queue, gr.Textbox("")
+    )
+# Chat
+def before_chat_begin():
+    return (
+        gr.Button(interactive=False),
+        gr.Button(interactive=False),
+        gr.Button(interactive=False)
+    )
+def _build_prompts(ppmanager, global_context, win_size=3):
+    dummy_ppm = copy.deepcopy(ppmanager)
+    dummy_ppm.ctx = global_context
+    lws = CtxLastWindowStrategy(win_size)
+    return lws(dummy_ppm)
+async def chat_stream(idx, local_data, user_prompt, chat_state, ctx_num_lconv=3):
+    """Stream the reply to *user_prompt* about the paper stored under *idx*.
+
+    Args:
+        idx: key into ``arxivid2data``.
+        local_data: conversation state; it is serialised to JSON and loaded
+            into the manager class held in ``chat_state["ppmanager_type"]``.
+        user_prompt: the new user message.
+        chat_state: dict providing "ppmanager_type".
+        ctx_num_lconv: window size passed to ``_build_prompts``.
+
+    Yields:
+        6-tuples ``("", ui turns, str(manager), update, update, update)``. The
+        three updates are non-interactive while the reply is streaming and
+        interactive in the final tuple.
+    """
+    paper = arxivid2data[idx]['paper']
+    ppm = chat_state["ppmanager_type"].from_json(json.dumps(local_data))
+    # Open a new turn with an empty reply; the streamed chunks are appended to it below.
+    ppm.add_pingpong(
+        PingPong(
+            user_prompt,
+            ""
+        )
+    )
+    # The paper text has its newlines replaced by spaces and is cut to 30000 characters.
+    prompt = _build_prompts(ppm, DEFAULT_GLOBAL_CTX % paper["full_text"].replace("\n", " ")[:30000], ctx_num_lconv)
+    print(prompt)
+    # async for result in open_llm_gen_text(
+    #     prompt,
+    #     hf_model='meta-llama/Llama-2-70b-chat-hf', hf_token=hf_token,
+    #     parameters={
+    #         'max_new_tokens': 4906,
+    #         'do_sample': True,
+    #         'return_full_text': False,
+    #         'temperature': 0.7,
+    #         'top_k': 10,
+    #         'repetition_penalty': 1.2
+    #     }
+    # ):
+    # The Gemini client is configured on every call.
+    gemini_init(gemini_api_key)
+    async for result in gemini_gen_text(prompt):
+        ppm.append_pong(result)
+        yield "", ppm.build_uis(), str(ppm), gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
+    # Final state: same content, with the three controls interactive again.
+    yield "", ppm.build_uis(), str(ppm), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
+def chat_reset(local_data, chat_state):
+    ppm = chat_state["ppmanager_type"].from_json(json.dumps(local_data))
+    ppm.pingpongs = []
+    return "", ppm.build_uis(), str(ppm), gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)

utils.py CHANGED Viewed

@@ -1,28 +1,21 @@
-import pandas as pd
 import datasets
-from datasets import Dataset
 from huggingface_hub import create_repo
 from huggingface_hub.utils import HfHubHTTPError
 def push_to_hf_hub(
-	qnas, repo_id, token, append=True
 ):
-    print(1)
     exist = False
-    df = pd.DataFrame([qnas])
-    ds = Dataset.from_pandas(df)
-    ds = ds.cast_column("target_date", datasets.features.Value("timestamp[s]"))
-    print(2)
     try:
-        create_repo(repo_id, repo_type="dataset", token=token)
     except HfHubHTTPError as e:
         exist = True
     if exist and append:
-        print(3)
         existing_ds = datasets.load_dataset(repo_id)
         ds = datasets.concatenate_datasets([existing_ds['train'], ds])
-    print(4)
-    ds.push_to_hub(repo_id, token=token)

 import datasets
+import pandas as pd
 from huggingface_hub import create_repo
 from huggingface_hub.utils import HfHubHTTPError
 def push_to_hf_hub(
+	ds, repo_id, hf_token, append=True
 ):
+    """Push *ds* to the dataset repo *repo_id*, creating the repo first.
+
+    When creating the repo raises HfHubHTTPError and *append* is true, the
+    train split already stored in the repo is loaded and *ds* is appended to
+    it before pushing.
+    """
     exist = False
     try:
+        create_repo(repo_id, repo_type="dataset", token=hf_token)
     except HfHubHTTPError as e:
+        # NOTE(review): any HfHubHTTPError is taken to mean "repo already exists" - confirm.
         exist = True
     if exist and append:
         existing_ds = datasets.load_dataset(repo_id)
         ds = datasets.concatenate_datasets([existing_ds['train'], ds])
+    ds.push_to_hub(repo_id, token=hf_token)