Spaces:

Alanturner2
/

Arxiv-pdf-summarization

Running

App Files Files Community

Alanturner2 committed on Dec 25, 2024

Commit

69cbfa2

verified ·

1 Parent(s): 2c8ac87

Upload app.py

Browse files

Files changed (1) hide show

app.py +229 -96

app.py CHANGED Viewed

@@ -1,113 +1,246 @@
-import os
-from langchain.document_loaders import PyPDFLoader
-from langchain import PromptTemplate
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.chains.summarize import load_summarize_chain
-from langchain.chat_models import ChatOpenAI
-def setup_documents(pdf_file_path,chunk_size,chunk_overlap):
-    loader = PyPDFLoader(pdf_file_path)
-    docs_raw = loader.load()
-    docs_raw_text = [doc.page_content for doc in docs_raw]
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
-                                                   chunk_overlap=chunk_overlap)
-    docs = text_splitter.create_documents(docs_raw_text)
-    return docs
-def custom_summary(docs, llm, custom_prompt, chain_type, num_summaries):
-    custom_prompt = custom_prompt + """:\n {text}"""
-    COMBINE_PROMPT = PromptTemplate(template=custom_prompt, input_variables = ["text"])
-    MAP_PROMPT = PromptTemplate(template="Summarize:\n{text}", input_variables=["text"])
-    if chain_type == "map_reduce":
-        chain = load_summarize_chain(llm,chain_type=chain_type,
-                                     map_prompt=MAP_PROMPT,
-                                     combine_prompt=COMBINE_PROMPT)
     else:
-        chain = load_summarize_chain(llm,chain_type=chain_type)
     summaries = []
-    for i in range(num_summaries):
-        summary_output = chain({"input_documents": docs}, return_only_outputs=True)["output_text"]
-        summaries.append(summary_output)
     return summaries
-def color_chunks(text: str, chunk_size: int, overlap_size: int) -> str:
-    overlap_color = "#808080"
-    chunk_colors = ["#a8d08d", "#c6dbef", "#e6550d", "#fd8d3c", "#fdae6b", "#fdd0a2"] # Different shades of green for chunks
-    colored_text = ""
-    overlap = ""
-    color_index = 0
-    for i in range(0, len(text), chunk_size-overlap_size):
-        chunk = text[i:i+chunk_size]
-        if overlap:
-            colored_text += f'<mark style="background-color: {overlap_color};">{overlap}</mark>'
-        chunk = chunk[len(overlap):]
-        colored_text += f'<mark style="background-color: {chunk_colors[color_index]};">{chunk}</mark>'
-        color_index = (color_index + 1) % len(chunk_colors)
-        overlap = text[i+chunk_size-overlap_size:i+chunk_size]
-    return colored_text
-def main():
-    st.set_page_config(layout="wide")
-    st.title("Custom Summarization App")
-    llm = st.sidebar.selectbox("LLM",["ChatGPT", "GPT4", "Other (open source in the future)"])
-    chain_type = st.sidebar.selectbox("Chain Type", ["map_reduce", "stuff", "refine"])
-    chunk_size = st.sidebar.slider("Chunk Size", min_value=20, max_value = 10000,
-                                   step=10, value=2000)
-    chunk_overlap = st.sidebar.slider("Chunk Overlap", min_value=5, max_value = 5000,
-                                   step=10, value=200)
-    if st.sidebar.checkbox("Debug chunk size"):
-        st.header("Interactive Text Chunk Visualization")
-        text_input = st.text_area("Input Text", "This is a test text to showcase the functionality of the interactive text chunk visualizer.")
-        # Set the minimum to 1, the maximum to 5000 and default to 100
-        html_code = color_chunks(text_input, chunk_size, chunk_overlap)
-        st.markdown(html_code, unsafe_allow_html=True)
-    else:
-        user_prompt = st.text_input("Enter the custom summary prompt")
-        pdf_file_path = st.text_input("Enther the pdf file path")
-        temperature = st.sidebar.number_input("Set the ChatGPT Temperature",
-                                              min_value = 0.0,
-                                              max_value=1.0,
-                                              step=0.1,
-                                              value=0.5)
-        num_summaries = st.sidebar.number_input("Number of summaries",
-                                                min_value = 1,
-                                                max_value = 10,
-                                                step = 1,
-                                                value=1)
-        if pdf_file_path != "":
-            docs = setup_documents(pdf_file_path, chunk_size, chunk_overlap)
-            st.write("PDF loaded successfully")
-            if llm=="ChatGPT":
-                llm = ChatOpenAI(temperature=temperature)
-            elif llm=="GPT4":
-                llm = ChatOpenAI(model_name="gpt-4",temperature=temperature)
-            else:
-                st.write("Using ChatGPT while open source models are not implemented!")
-                llm = ChatOpenAI(temperature=temperature)
-            if st.button("Summarize"):
-                result = custom_summary(docs, llm, user_prompt, chain_type, num_summaries)
-                st.write("Summary:")
-                for summary in result:
-                    st.write(summary)
-if __name__=="__main__":
-    main()

+import gradio as gr
+from langchain.document_loaders import ArxivLoader
+from PyPDF2 import PdfReader
+from langchain_community.llms import HuggingFaceHub
+from langchain.text_splitter import TokenTextSplitter
 from langchain.chains.summarize import load_summarize_chain
+from langchain.document_loaders import PyPDFLoader
+from transformers import pipeline
+from dotenv import load_dotenv
+import os
+# Load variables from a .env file (when one is found) into the environment,
+# then read the Hugging Face token; os.getenv yields None when it is unset.
+load_dotenv()
+hugging_api_key = os.getenv('HUGGING_API_KEY')
+from groq import AsyncGroq
+from groq import Groq
+from langchain_groq import ChatGroq
+from langchain.document_loaders import ArxivLoader
+from langchain.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
+from huggingface_hub import login
+# Authenticate against the Hugging Face Hub with the token read from the environment.
+login(hugging_api_key)
+embedding_model = HuggingFaceHubEmbeddings(huggingfacehub_api_token=hugging_api_key)
+# SECURITY: the Groq key must never be committed. It is read from the
+# GROQ_API_KEY environment variable (e.g. a Space secret or the .env file);
+# any key that was previously committed here is public and must be revoked.
+llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", api_key=os.getenv("GROQ_API_KEY"))
+def display_results(result):
+    """Merge the entries of ``result`` into one string, separated by a newline."""
+    separator = "\n"
+    return separator.join(result)
+def summarize_pdf(pdf_file_path, max_length):
+    """Summarize a local PDF with the ``allenai/led-large-16384-arxiv`` model.
+
+    Args:
+        pdf_file_path: Path (or file object) handed to ``PdfReader``.
+        max_length: Upper bound forwarded to the summarization pipeline.
+
+    Returns:
+        The per-chunk summaries concatenated into a single string.
+    """
+    summarizer = pipeline('summarization', model='allenai/led-large-16384-arxiv', min_length=100, max_length=int(max_length), device=0)
+    reader = PdfReader(pdf_file_path)
+    # extract_text() may yield nothing for image-only pages; treat that as "".
+    text = "".join(page.extract_text() or "" for page in reader.pages)
+    text_splitter = TokenTextSplitter(chunk_size=8192, chunk_overlap=1000)
+    # split_text() returns plain strings (not Document objects), so every
+    # chunk goes to the summarizer as-is; there is no .page_content attribute.
+    chunks = text_splitter.split_text(text)
+    return "".join(summarizer(chunk)[0]['summary_text'] for chunk in chunks)
+def summarize_text(text):
+    """Summarize ``text`` with the Groq-hosted ``llama3-70b-8192`` chat model.
+
+    Args:
+        text: The passage to summarize; it is interpolated into the user prompt.
+
+    Returns:
+        The content of the first completion choice.
+    """
+    # SECURITY: the key is read from the GROQ_API_KEY environment variable
+    # instead of being embedded in the source; revoke any key that was
+    # committed earlier.
+    sum_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
+    messages = [
+        {"role": "system", "content": "You are summarizer. If I give you the whole text you should summarize it.  And you don't need the title and author"},
+        {
+            "role": "user",
+            "content": f"Summarize the paper. The whole text is {text}",
+        },
+    ]
+    response = sum_client.chat.completions.create(
+        messages=messages,
+        model="llama3-70b-8192",
+        temperature=0,
+        max_tokens=8192,
+        top_p=1,
+        stop=None
+    )
+    return response.choices[0].message.content
+def remove_first_sentence_and_title(text):
+    """Drop the first sentence and any ``**Title:**`` line from ``text``.
+
+    The first sentence ends at the first ``'. '``. When no such boundary
+    exists the text is left intact rather than losing its first character.
+    The title line runs from ``**Title:**`` to the next newline (or to the
+    end of the text when there is no newline).
+
+    Returns:
+        The remaining text with surrounding whitespace stripped.
+    """
+    # str.find returns -1 when '. ' is absent; only cut when a boundary exists.
+    boundary = text.find('. ')
+    text_without_first_sentence = text[boundary + 2:] if boundary != -1 else text
+    # Remove the title
+    title_start = text_without_first_sentence.find('**Title:**')
+    if title_start != -1:
+        title_end = text_without_first_sentence.find('\n', title_start)
+        if title_end != -1:
+            text_without_title = text_without_first_sentence[:title_start] + text_without_first_sentence[title_end+1:]
+        else:
+            text_without_title = text_without_first_sentence[:title_start]
     else:
+        text_without_title = text_without_first_sentence
+    return text_without_title.strip()
+def summarize_arxiv_pdf(query):
+    """Build a Markdown report for the arXiv papers matching ``query``.
+
+    The loaded documents are split into token chunks, each chunk is passed to
+    ``summarize_text`` and the results are concatenated. For every summary
+    document returned by the loader, the title, authors, link and abstract
+    are emitted, followed by that concatenated summary.
+
+    Args:
+        query: arXiv identifier or search query passed to ``ArxivLoader``.
+
+    Returns:
+        The report as one newline-joined string, or a short message when the
+        query matches nothing.
+    """
+    loader = ArxivLoader(query=query, load_max_docs=10)
+    documents = loader.load()
+    if not documents:
+        # Nothing matched: report it instead of failing on an empty list.
+        return f"No arXiv paper found for: {query}"
+    text_splitter = TokenTextSplitter(chunk_size=5700, chunk_overlap=100)
+    chunks = text_splitter.split_documents(documents)
+    ref_summary = "".join(summarize_text(chunk.page_content) for chunk in chunks)
+    # Strip the boilerplate lead-in the model tends to prepend.
+    ref_summary = ref_summary.replace("Here is a summary of the paper:", "").strip()
+    arxiv_summary = loader.get_summaries_as_docs()
     summaries = []
+    for doc in arxiv_summary:
+        title = doc.metadata.get("Title")
+        authors = doc.metadata.get("Authors")
+        url = doc.metadata.get("Entry ID")
+        summary = doc.page_content
+        summaries.append(f"**{title}**\n")
+        summaries.append(f"**Authors:** {authors}\n")
+        summaries.append(f"**View full paper:** [Link to paper]({url})\n")
+        summaries.append(f"**Summary:** {summary}\n")
+        summaries.append(f"**Lazyman Summary:**\n ")
+        summaries.append(f"{ref_summary}")
+    summaries = display_results(summaries)
+    print(summaries)
     return summaries
+# Shared async Groq client for the chat handlers.
+# SECURITY: the key is read from the GROQ_API_KEY environment variable rather
+# than being embedded in the source; revoke any key that was committed earlier.
+client = AsyncGroq(api_key=os.getenv("GROQ_API_KEY"))
+async def chat_with_replit(message, history):
+    """Stream a chat reply to ``message`` given the earlier ``history``.
+
+    Args:
+        message: The new user message.
+        history: Earlier turns; each item is indexed as ``(user, assistant)``.
+
+    Yields:
+        The reply accumulated so far, once per streamed chunk.
+    """
+    # Send the system prompt exactly once, at the start of the conversation,
+    # rather than once per past turn (and not at all on the first turn).
+    messages = [{"role": "system", "content": "You are assistor. I will ask you some questions than you should answer!"}]
+    for chat in history:
+        messages.append({"role": 'user', "content": str(chat[0])})
+        messages.append({"role": 'assistant', "content": str(chat[1])})
+    messages.append({"role": "user", "content": str(message)})
+    print(messages)
+    response_content = ""
+    stream = await client.chat.completions.create(
+        messages=messages,
+        model="llama3-70b-8192",
+        temperature=0,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=True,
+    )
+    async for chunk in stream:
+        content = chunk.choices[0].delta.content
+        if content:
+            response_content += content
+        yield response_content
+# HTML snippet that loads the Replit badge script.
+# NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still needed.
+js = """<script src="https://replit.com/public/js/replit-badge-v2.js" theme="dark" position="bottom-right"></script>"""
+# Cache for the most recently indexed paper: arXiv id -> (metadata, vector store).
+# It lets follow-up questions about the same paper skip the download and the
+# embedding step; only one paper is kept so memory stays bounded.
+_paper_index_cache = {}
+async def chat_with_replit_pdf(message, history, doi_num):
+    """Answer ``message`` about the arXiv paper identified by ``doi_num``.
+
+    The paper is loaded and split, embedded into a Chroma store, and the three
+    chunks most similar to the question are sent to the model together with
+    the paper's metadata.
+
+    Args:
+        message: The user's question.
+        history: Earlier turns supplied by the chat UI (not sent to the model).
+        doi_num: arXiv identifier/query for the paper to discuss.
+
+    Returns:
+        The model's answer, or a short message when no paper is found.
+    """
+    paper_id = str(doi_num)
+    if paper_id not in _paper_index_cache:
+        loader = ArxivLoader(query=paper_id, load_max_docs=10)
+        documents = loader.load_and_split()
+        if not documents:
+            # Nothing matched: report it instead of failing on documents[0].
+            return f"No arXiv paper found for {doi_num}."
+        paper_index = (documents[0].metadata, Chroma.from_documents(documents, embedding_model))
+        _paper_index_cache.clear()
+        _paper_index_cache[paper_id] = paper_index
+    metadata, vector_store = _paper_index_cache[paper_id]
+    results = vector_store.similarity_search(message, k=3)
+    relevant_content = "\n\n".join(doc.page_content for doc in results)
+    # The system context goes first so the user's question is the last message.
+    messages = [
+        {
+            "role": "system",
+            "content": f"You should answer about this arxiv paper for {doi_num}.\n"
+            f"This is the metadata of the paper:{metadata}.\n"
+            f"This is relevant information of the paper:{relevant_content}.\n"
+        },
+        {
+            "role": "user",
+            "content": str(message),
+        },
+    ]
+    print(messages)
+    response = await client.chat.completions.create(
+        messages=messages,
+        model="llama3-70b-8192",
+        temperature=0,
+        max_tokens=1024,
+        top_p=1,
+        stop=None,
+        stream=False,
+    )
+    return response.choices[0].message.content
+# Gradio UI: four tabs (arXiv summarization, local PDF summarization, plain
+# chat, chat about a paper). Components are created in layout order.
+with gr.Blocks() as app:
+    with gr.Tab(label="Arxiv summarization"):
+        with gr.Column():
+            number = gr.Textbox(label="Enter your arxiv number")
+            sumarxiv_btn = gr.Button(value="summarize-arxiv")
+        with gr.Column():
+            outputs = gr.Markdown(label="Summary")
+    # The button sends the textbox value to summarize_arxiv_pdf and renders the result as Markdown.
+    sumarxiv_btn.click(summarize_arxiv_pdf, inputs=number, outputs=outputs)
+    with gr.Tab(label="Local summarization"):
+        with gr.Row():
+            with gr.Column():
+                input_path = gr.File(label="Upload PDF file")
+            with gr.Column():
+                # set_temperature = gr.Slider(0, 1, value=0, step=0.1, label="temperature")
+                set_max_length = gr.Slider(512, 4096, value=2048, step=512, label="max length")
+                sumlocal_btn = gr.Button(value="summarize-local")
+        with gr.Row():
+            output_local = gr.Textbox(label="summary")
+    # The uploaded file and the slider value are passed to summarize_pdf in that order.
+    sumlocal_btn.click(summarize_pdf, inputs=[input_path, set_max_length], outputs=output_local)
+    with gr.Tab(label="ChatBot"):
+        # General-purpose chat backed by chat_with_replit, with three example prompts.
+        gr.ChatInterface(chat_with_replit,
+                       examples=[
+                           "Explain about the attention is all you need",
+                           "Who is the inventor of the GAN",
+                           "What is the main idea style transfer?"
+                       ])
+    with gr.Tab(label="Chat with pdf"):
+        # Chat about one paper; the extra textbox is listed as an additional input
+        # (presumably delivered as chat_with_replit_pdf's doi_num argument — confirm).
+        gr.ChatInterface(fn = chat_with_replit_pdf,
+                         additional_inputs = [
+                             gr.Textbox(label="doi", placeholder="Enter doi number")
+                         ],
+                        type="messages")
+# Start the Gradio server as soon as the module is executed.
+app.launch()