Spaces:
Sleeping
Sleeping
import PyPDF2 | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import HuggingFaceBgeEmbeddings | |
from langchain.vectorstores import FAISS | |
def read_pdf(uploaded_file):
    """Extract the text of every page of a PDF as a single string.

    Args:
        uploaded_file: Whatever ``PyPDF2.PdfReader`` accepts as its first
            argument (presumably a path or a binary file-like object such
            as an upload widget's result -- confirm against the caller).

    Returns:
        The text of all pages, in page order, separated by newlines.
        Pages that yield no text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next. ``or ""`` guards against pages for which
    # extraction returns nothing (e.g. image-only pages).
    return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
def Chunks(docs, *, chunk_size=1000, chunk_overlap=100):
    """Split a text into overlapping chunks for embedding.

    Args:
        docs: The full text to split (a single string).
        chunk_size: Maximum size of each chunk, passed straight to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000.
        chunk_overlap: Amount of overlap between consecutive chunks, passed
            straight to ``RecursiveCharacterTextSplitter``. Defaults to 100.

    Returns:
        The list of text chunks produced by ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(docs)
def PDF_4_QA(file):
    """Build a FAISS vector store from the text content of a PDF.

    The PDF is read with ``read_pdf``, split with ``Chunks``, and the
    resulting chunks are embedded on CPU and indexed with FAISS.

    Args:
        file: The PDF to index; forwarded unchanged to ``read_pdf``.

    Returns:
        The FAISS vector store created by ``FAISS.from_texts``.

    Raises:
        ValueError: If no text chunks could be produced from the PDF
            (for example a scanned, image-only document).
    """
    content = read_pdf(file)
    pdf_chunks = Chunks(docs=content)
    if not pdf_chunks:
        # Without this guard an empty chunk list reaches FAISS.from_texts,
        # which fails with an error that does not mention the real cause.
        raise ValueError("No extractable text found in the PDF; nothing to index.")

    # NOTE(review): HuggingFaceBgeEmbeddings is the wrapper intended for BGE
    # models, but the model configured here is a MiniLM sentence-transformer.
    # Confirm this pairing is intentional.
    embeddings = HuggingFaceBgeEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'},
    )
    return FAISS.from_texts(pdf_chunks, embeddings)