File size: 2,177 Bytes
663e818
93a3da9
 
a52a9bb
663e818
4995935
2ceb5b6
93a3da9
2ceb5b6
b80a1ef
663e818
2ceb5b6
663e818
2ceb5b6
663e818
a52a9bb
663e818
f7f091e
663e818
f7f091e
a52a9bb
2ceb5b6
93a3da9
663e818
93a3da9
663e818
f7f091e
663e818
 
93a3da9
663e818
 
93a3da9
663e818
f7f091e
663e818
 
93a3da9
 
 
a52a9bb
 
663e818
93a3da9
 
4995935
93a3da9
 
 
 
 
 
 
 
 
f7f091e
b80a1ef
f7f091e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import streamlit as st
import torch
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
import pandas as pd
import pdfplumber

# Load the RAG model and tokenizer
@st.cache_resource
def _load_rag_components():
    """Load the RAG tokenizer, retriever and generator a single time.

    Streamlit re-executes this script on every widget interaction, so the
    loading is wrapped in ``st.cache_resource`` to reuse the same objects
    across reruns instead of reloading the checkpoint each time.

    Returns:
        A ``(tokenizer, retriever, model)`` tuple.
    """
    rag_checkpoint = "facebook/rag-sequence-nq"
    rag_tokenizer = RagTokenizer.from_pretrained(rag_checkpoint)
    # The retriever reads its configuration from the RAG checkpoint itself;
    # "facebook/wiki_dpr" names the index dataset, not a loadable checkpoint.
    rag_retriever = RagRetriever.from_pretrained(
        rag_checkpoint, index_name="exact", use_dummy_dataset=True
    )
    rag_model = RagSequenceForGeneration.from_pretrained(
        rag_checkpoint, retriever=rag_retriever
    )
    return rag_tokenizer, rag_retriever, rag_model


tokenizer, retriever, model = _load_rag_components()

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of all pages in ``pdf_file``.

    The file is opened with ``pdfplumber``. Pages whose ``extract_text()``
    result is empty or ``None`` are skipped, the remaining page texts are
    separated by newlines, and surrounding whitespace is stripped from the
    final string. An empty string is returned when no page yields text.
    """
    with pdfplumber.open(pdf_file) as document:
        page_texts = [page.extract_text() for page in document.pages]
    return "\n".join(chunk for chunk in page_texts if chunk).strip()

# Streamlit app
st.title("RAG-Powered PDF & CSV Chatbot")

# CSV file upload
csv_file = st.file_uploader("Upload a CSV file", type=["csv"])
csv_data = None  # stays None when nothing is uploaded or parsing fails
if csv_file:
    try:
        csv_data = pd.read_csv(csv_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        # An empty, malformed or undecodable upload should produce a readable
        # message rather than an unhandled exception in the app.
        st.error(f"Could not read the CSV file: {exc}")
    else:
        st.write("CSV file loaded successfully!")
        st.write(csv_data)

# PDF file upload
pdf_file = st.file_uploader("Upload a PDF file", type=["pdf"])
# Extraction only runs when a file has been provided; otherwise the text
# stays empty so later code can test it for truthiness.
pdf_text = extract_text_from_pdf(pdf_file) if pdf_file else ""
if pdf_file and not pdf_text:
    st.warning("No extractable text found in the PDF.")
elif pdf_text:
    st.success("PDF loaded successfully!")
    st.text_area("Extracted Text from PDF", pdf_text, height=200)

# User input for chatbot
user_input = st.text_input("Ask a question related to the PDF or CSV:")

# Get response on button click
if st.button("Get Response"):
    if not pdf_text and csv_data is None:
        st.warning("Please upload a PDF or CSV file first.")
    elif not user_input.strip():
        # Avoid running generation on an empty or whitespace-only question.
        st.warning("Please enter a question.")
    else:
        combined_context = pdf_text
        if csv_data is not None:
            combined_context += "\n" + csv_data.to_string()
        # NOTE(review): combined_context is assembled but never handed to the
        # model below; only the question is tokenized, so the answer does not
        # draw on the uploaded PDF/CSV content. TODO: wire the uploaded text
        # into retrieval/generation (or drop this variable) once the intended
        # design is confirmed.

        # Generate response using RAG
        inputs = tokenizer(user_input, return_tensors="pt", truncation=True)
        with torch.no_grad():  # inference only; no gradients needed
            output = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )
        # generate() returns a batch; a single question gives a single answer.
        response = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
        st.write("### Response:")
        st.write(response)