import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Load environment variables from a local .env file, if present
# (e.g. HUGGINGFACEHUB_API_TOKEN).
load_dotenv()

st.title('LLM - Retrieval Augmented Generation')

pdf = st.file_uploader(label='Upload PDF')
chunk_size = st.number_input(label='Chunk size', value=500, step=10)
chunk_overlap = st.number_input(label='Chunk overlap', value=20, step=10)
question = st.text_input(label='Question')

|
|
def authenticate(): |
|
|
|
try: |
|
st.write('Authenticated with HuggingFace:', |
|
os.environ["HUGGINGFACEHUB_API_TOKEN"] == st.secrets["HUGGINGFACEHUB_API_TOKEN"]) |
|
except: |
|
st.write('Cannot find HugginFace API token. Ensure it is located in .streamlit/secrets.toml') |
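
# A minimal .streamlit/secrets.toml entry that the check above expects might look like
# this (the value is a placeholder, not a real token):
#
#   HUGGINGFACEHUB_API_TOKEN = "hf_..."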

def load_pdf(pdf):
    # Extract raw text from every page of the uploaded PDF.
    reader = PdfReader(pdf)

    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # extract_text() may return an empty string for image-only pages
            text += page_text

    return text

def split_text(text, chunk_size=400, chunk_overlap=20):
    # Split the document into overlapping chunks so each piece stays small enough to embed.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )

    chunks = text_splitter.split_text(text)

    return chunks

def store_text(chunks):
    # Embed each chunk with an Instructor model and index the vectors in FAISS.
    embeddings = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-large')

    vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

    return vectorstore

@st.cache_resource
def load_split_store(pdf, chunk_size, chunk_overlap):
    # Cache the full load -> split -> embed pipeline so the PDF is only re-processed
    # when the uploaded file or the chunking parameters change.
    text = load_pdf(pdf=pdf)
    chunks = split_text(text, chunk_size, chunk_overlap)
    vectorstore = store_text(chunks)

    return vectorstore

def format_docs(docs):
    # Join the retrieved Documents into a single context string for the prompt.
    return "\n\n".join(doc.page_content for doc in docs)

def main():

    authenticate()

    rag_template = """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question}
    Context: {context}
    Answer:
    """

    # Generation parameters such as "temperature" or "max_new_tokens" could be passed
    # via model_kwargs; they are left unset here.
    llm = HuggingFaceHub(
        repo_id='tiiuae/falcon-7b-instruct',
        model_kwargs={}
    )

    prompt = PromptTemplate(
        template=rag_template,
        input_variables=['question', 'context']
    )

    if pdf is not None:

        vectorstore = load_split_store(pdf, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        st.write('PDF processed')

        retriever = vectorstore.as_retriever()

        # Retrieval only: fetch the most similar chunks and format them for display.
        retrieval_chain = (
            retriever | format_docs
        )

        # Full RAG chain: retrieve context, fill the prompt, generate with the LLM,
        # and parse the output to a plain string.
        generation_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        if st.button(label='Ask question'):
            with st.spinner('Processing'):

                st.write('# Context')
                st.write(retrieval_chain.invoke(question))

                st.write('# Answer')
                st.write(generation_chain.invoke(question))

if __name__ == '__main__':
    main()
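
# To try this locally (assuming the script is saved as app.py and the HuggingFace token
# is available via .env or .streamlit/secrets.toml), run:
#
#   streamlit run app.py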