Spaces:

open-acc
/

Summarize-Arxiv-Papers-and-ChatBot

Running

App Files Files Community

Alanturner2 committed 4 days ago

Commit

7c1cc3d

verified ·

1 Parent(s): d852a14

Create app.py

Browse files

Files changed (1) hide show

app.py +247 -0

app.py ADDED Viewed

	@@ -0,0 +1,247 @@

+import gradio as gr
+from langchain.document_loaders import ArxivLoader
+from PyPDF2 import PdfReader
+from langchain_community.llms import HuggingFaceHub
+from langchain.text_splitter import TokenTextSplitter
+from langchain.chains.summarize import load_summarize_chain
+from langchain.document_loaders import PyPDFLoader
+from transformers import pipeline
+from dotenv import load_dotenv
+import os
+load_dotenv()
+hugging_api_key = os.getenv('HUGGING_API_KEY')
+from groq import AsyncGroq
+from groq import Groq
+from langchain_groq import ChatGroq
+from langchain.document_loaders import ArxivLoader
+from langchain.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
+from huggingface_hub import login
+login(hugging_api_key)
+embedding_model = HuggingFaceHubEmbeddings(huggingfacehub_api_token=hugging_api_key)
+llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", api_key = "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")
+def display_results(result):
+    return "\n".join(result)  # Join each entry with double newlines for better readability
+def summarize_pdf(pdf_file_path, max_length):
+    # summarizer = pipeline('summarization', model='allenai/led-large-16384-arxiv', min_length=100, max_length=max_length, device=0)
+    loader = PdfReader(pdf_file_path)
+    text = """ """
+    for page in loader.pages:
+        text += page.extract_text()
+    text_splitter = TokenTextSplitter(chunk_size=8192, chunk_overlap=1000)
+    chunks = text_splitter.split_text(text)
+    summary = ""
+    for i in range(len(chunks)):
+        # text = chunks[i].page_content
+        text = chunks[i]
+        summary += summarize_text(text)
+    # summary = str(max_length)
+    return summary
+def summarize_text(text):
+    sum_client = Groq(api_key="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")
+    messages = []
+    # messages.append({"role": "system", "content": "You are arxiv paper summarizer. If I give you the doi number, you should only output summarization. Summarization should be more than 10% words of the paper. For example, in the paper there are 500 words, than summarization should be more than 50 words."})
+    messages.append({"role": "system", "content": "You are summarizer. If I give you the whole text you should summarize it.  And you don't need the title and author"})
+    messages = messages + [
+        {
+            "role": "user",
+            "content": f"Summarize the paper. The whole text is {text}",
+        },
+    ]
+    response = sum_client.chat.completions.create(
+        messages=messages,
+        model="llama3-70b-8192",
+        temperature=0,
+        max_tokens=8192,
+        top_p=1,
+        stop=None
+    )
+    text_summary = response.choices[0].message.content
+    return text_summary
+def remove_first_sentence_and_title(text):
+    # Remove the first sentence
+    first_sentence_end = text.find('. ') + 2  # Find the end of the first sentence
+    text_without_first_sentence = text[first_sentence_end:]
+    # Remove the title
+    title_start = text_without_first_sentence.find('**Title:**')
+    if title_start != -1:
+        title_end = text_without_first_sentence.find('\n', title_start)
+        if title_end != -1:
+            text_without_title = text_without_first_sentence[:title_start] + text_without_first_sentence[title_end+1:]
+        else:
+            text_without_title = text_without_first_sentence[:title_start]
+    else:
+        text_without_title = text_without_first_sentence
+    return text_without_title.strip()
+def summarize_arxiv_pdf(query):
+    loader = ArxivLoader(query=query, load_max_docs=10)
+    documents = loader.load()
+    text_splitter = TokenTextSplitter(chunk_size=5700, chunk_overlap=100)
+    chunks = text_splitter.split_documents(documents)
+    text = documents[0].page_content
+    ref_summary = ""
+    for i in range(len(chunks)):
+        text = chunks[i].page_content
+        ref_summary += summarize_text(text)
+    # ref_summary = ref_summary.split('paper:')[1]
+    # ref_summary = remove_first_sentence_and_title(ref_summary)
+    ref_summary = ref_summary.replace("Here is a summary of the paper:", "").strip()
+    arxiv_summary = loader.get_summaries_as_docs()
+    summaries = []
+    for doc in arxiv_summary:
+        title = doc.metadata.get("Title")
+        authors = doc.metadata.get("Authors")
+        url = doc.metadata.get("Entry ID")
+        summary = doc.page_content
+        summaries.append(f"**{title}**\n")
+        summaries.append(f"**Authors:** {authors}\n")
+        summaries.append(f"**View full paper:** [Link to paper]({url})\n")
+        summaries.append(f"**Summary:** {summary}\n")
+        summaries.append(f"**Lazyman Summary:**\n ")
+        summaries.append(f"{ref_summary}")
+    summaries = display_results(summaries)
+    print(summaries)
+    return summaries
+client = AsyncGroq(api_key="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")
+async def chat_with_replit(message, history):
+    messages = []
+    for chat in history:
+        user = str(chat[0])
+        assistant = str(chat[1])
+        messages.append({"role": "system", "content": "You are assistor. I will ask you some questions than you should answer!"})
+        messages.append({"role": 'user', "content": user})
+        messages.append({"role": 'assistant', "content": assistant})
+    messages = messages + [
+        {
+            "role": "user",
+            "content": str(message),
+        },
+    ]
+    print(messages)
+    response_content = ""
+    stream = await client.chat.completions.create(
+        messages=messages,
+        model="llama3-70b-8192",
+        temperature=0,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=True,
+    )
+    async for chunk in stream:
+        content = chunk.choices[0].delta.content
+        if content:
+            response_content += chunk.choices[0].delta.content
+        yield response_content
+js = """<script src="https://replit.com/public/js/replit-badge-v2.js" theme="dark" position="bottom-right"></script>"""
+async def chat_with_replit_pdf(message, history, doi_num):
+    messages = []
+    old_doi = "old"
+    if old_doi != doi_num:
+        loader = ArxivLoader(query=str(doi_num), load_max_docs=10)
+        documents = loader.load_and_split()
+        metadata = documents[0].metadata
+        vector_store = Chroma.from_documents(documents, embedding_model)
+        old_doi = doi_num
+    def retrieve_relevant_content(user_query):
+        results = vector_store.similarity_search(user_query, k=3)
+        relevant_content = "\n\n".join([doc.page_content for doc in results])
+        return relevant_content
+    relevant_content = retrieve_relevant_content(message)
+    messages = messages + [
+        {
+            "role": "user",
+            "content": str(message),
+        },
+        {
+            "role": "system",
+            "content": f"You should answer about this arxiv paper for {doi_num}.\n"
+            f"This is the metadata of the paper:{metadata}.\n"
+            f"This is relevant information of the paper:{relevant_content}.\n"
+        }
+    ]
+    print(messages)
+    response_content = ""
+    stream = await client.chat.completions.create(
+        messages=messages,
+        model="llama3-70b-8192",
+        temperature=0,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=False,
+    )
+    return stream.choices[0].message.content;
+# Build the four-tab Gradio UI and start the server.
+with gr.Blocks() as app:
+    # Tab 1: summarize an arXiv paper identified by the text entered in the textbox.
+    with gr.Tab(label="Arxiv summarization"):
+        with gr.Column():
+            number = gr.Textbox(label="Enter your arxiv number")
+            sumarxiv_btn = gr.Button(value="summarize-arxiv")
+        with gr.Column():
+            outputs = gr.Markdown(label="Summary", height=1000)
+    # The textbox value is passed to summarize_arxiv_pdf; its return value is rendered as Markdown.
+    sumarxiv_btn.click(summarize_arxiv_pdf, inputs=number, outputs=outputs)
+    # Tab 2: summarize an uploaded PDF file.
+    with gr.Tab(label="Local summarization"):
+        with gr.Row():
+            with gr.Column():
+                input_path = gr.File(label="Upload PDF file")
+            with gr.Column():
+                # set_temperature = gr.Slider(0, 1, value=0, step=0.1, label="temperature")
+                set_max_length = gr.Slider(512, 4096, value=2048, step=512, label="max length")
+                sumlocal_btn = gr.Button(value="summarize-local")
+        with gr.Row():
+            output_local = gr.Markdown(label="summary", height=1000)
+    # The uploaded file and the slider value are passed to summarize_pdf as (pdf_file_path, max_length).
+    sumlocal_btn.click(summarize_pdf, inputs=[input_path, set_max_length], outputs=output_local)
+    # Tab 3: general chat handled by chat_with_replit, with three example prompts.
+    with gr.Tab(label="ChatBot"):
+        gr.ChatInterface(chat_with_replit,
+                       examples=[
+                           "Explain about the attention is all you need",
+                           "Who is the inventor of the GAN",
+                           "What is the main idea style transfer?"
+                       ])
+    # Tab 4: chat about one paper; the extra "doi" textbox is passed to
+    # chat_with_replit_pdf as its third argument (doi_num).
+    with gr.Tab(label="Chat with pdf"):
+        gr.ChatInterface(fn = chat_with_replit_pdf,
+                         additional_inputs = [
+                             gr.Textbox(label="doi", placeholder="Enter doi number")
+                         ],
+                        type="messages")
+# Runs at import time: there is no __main__ guard around the launch call.
+app.launch()