Spaces:

Luciferalive
/

RAG_db

Runtime error

App Files Community

Luciferalive committed on Jun 3, 2024

Commit

b85b4f5

verified ·

1 Parent(s): 2e3e738

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -58

app.py CHANGED Viewed

@@ -2,19 +2,19 @@ import gradio as gr
 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain_community.llms import HuggingFaceEndpoint
-from pdfminer.high_level import extract_text
-import docx2txt
 import io
 import re
 from typing import List
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import SentenceTransformerEmbeddings
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
 import os
-import boto3
 # AWS access credentials
 access_key = os.getenv("ACCESS_KEY")
@@ -27,39 +27,52 @@ prefix = os.getenv("PREFIX")
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 def extract_text_from_pdf(pdf_content):
-    return extract_text(io.BytesIO(pdf_content))
-def extract_text_from_doc(doc_content):
-    return docx2txt.process(io.BytesIO(doc_content))
 def preprocess_text(text):
-    text = text.replace('\n', ' ').replace('\r', ' ')
-    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
-    text = text.lower()
-    text = re.sub(r'[^\w\s]', '', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
 def process_files(file_contents: List[bytes]):
     all_text = ""
     for file_content in file_contents:
-        if file_content.startswith(b'%PDF'):
-            extracted_text = extract_text_from_pdf(file_content)
-        else:
-            extracted_text = extract_text_from_doc(file_content)
         preprocessed_text = preprocess_text(extracted_text)
         all_text += preprocessed_text + " "
     return all_text
 def compute_cosine_similarity_scores(query, retrieved_docs):
     model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
     query_embedding = model.encode(query, convert_to_tensor=True)
     doc_embeddings = model.encode(retrieved_docs, convert_to_tensor=True)
-    cosine_scores = np.dot(doc_embeddings, query_embedding.T)
     readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
     return readable_scores
 def fetch_files_from_s3():
     s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
     objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
@@ -71,23 +84,55 @@ def fetch_files_from_s3():
             file_contents.append(file_content)
     return file_contents
-def answer_query_with_similarity(query):
-    try:
-        # Fetch files from S3
-        file_contents = fetch_files_from_s3()
-        all_text = process_files(file_contents)
-        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
-        texts = text_splitter.split_text(all_text)
-        vector_store = Chroma.from_texts(texts, embeddings, collection_metadata={"hnsw:space": "cosine"}, persist_directory="stores/insurance_cosine")
-        load_vector_store = Chroma(persist_directory="stores/insurance_cosine", embedding_function=embeddings)
-        print("Vector DB Successfully Created!")
-        db3 = Chroma(persist_directory=f"stores/insurance_cosine", embedding_function=embeddings)
-        docs = db3.similarity_search(query)
         print(f"\n\nDocuments retrieved: {len(docs)}")
         if not docs:
@@ -95,43 +140,44 @@ def answer_query_with_similarity(query):
             return None
         docs_content = [doc.page_content for doc in docs]
-        for i, content in enumerate(docs_content, start=1):
-            print(f"\nDocument {i}: {content}...")
         cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
-        for score in cosine_similarity_scores:
-            print(f"\nDocument Score: {score['score']}")
         all_docs_content = " ".join(docs_content)
         template = """
-                ### [INST] Instruction:You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
-                When someone say hi, or small talk, o only response in a sentence.
-                Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
-                Always maintain a positive, friendly, and encouraging tone in your interactions with users.
-                Strictly write the crisp and clear answers, dont write unnecesary stuff.
-                Only answer to the asked question, don't hellucinate of print any pre information.
-                After providing the answer, always ask a for any other help needed in the next paragraph
-                Writing in the bullet format is our top preference
                 Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
-                ### Docs : {docs}
-                ### Question : {question}
                 """
         prompt = PromptTemplate.from_template(template.format(docs=all_docs_content, question=query))
         repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-        llm = HuggingFaceEndpoint(repo_id=repo_id, temperature=0.1, token=HUGGINGFACEHUB_API_TOKEN,
-                                  top_p=0.15,
-                                  max_new_tokens=256,
-                                  repetition_penalty=1.1
-                                  )
         llm_chain = LLMChain(prompt=prompt, llm=llm)
-        answer = llm_chain.run(question=query)
-        cleaned_answer = answer.split("Answer:")[-1].strip()
-        print(f"\n\nAnswer: {cleaned_answer}")
-        return cleaned_answer
     except Exception as e:
         print("An error occurred while getting the answer: ", str(e))
         return None

 from langchain.chains import LLMChain
 from langchain.prompts import PromptTemplate
 from langchain_community.llms import HuggingFaceEndpoint
+import fitz  # PyMuPDF
+import pytesseract
+from PIL import Image
 import io
 import re
+import numpy as np
+import boto3
 from typing import List
+from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import Chroma
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import SentenceTransformerEmbeddings
 import os
 # AWS access credentials
 access_key = os.getenv("ACCESS_KEY")
 HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 def extract_text_from_pdf(pdf_content):
+    """Extract text from PDF content using OCR."""
+    try:
+        doc = fitz.open(stream=pdf_content, filetype="pdf")
+        text = ""
+        for page in doc:
+            pix = page.get_pixmap()
+            img = Image.open(io.BytesIO(pix.tobytes()))
+            text += pytesseract.image_to_string(img)
+        return text
+    except Exception as e:
+        print("Failed to extract text from PDF:", e)
+        return ""
 def preprocess_text(text):
+    """Preprocess text by cleaning and normalizing."""
+    try:
+        text = text.replace('\n', ' ').replace('\r', ' ')
+        text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+        text = text.lower()
+        text = re.sub(r'[^\w\s]', '', text)
+        text = re.sub(r'\s+', ' ', text).strip()
+        return text
+    except Exception as e:
+        print("Failed to preprocess text:", e)
+        return ""
 def process_files(file_contents: List[bytes]):
+    """Process and combine text from multiple files."""
     all_text = ""
     for file_content in file_contents:
+        extracted_text = extract_text_from_pdf(file_content)
         preprocessed_text = preprocess_text(extracted_text)
         all_text += preprocessed_text + " "
     return all_text
 def compute_cosine_similarity_scores(query, retrieved_docs):
+    """Compute cosine similarity scores between a query and retrieved documents."""
     model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
     query_embedding = model.encode(query, convert_to_tensor=True)
     doc_embeddings = model.encode(retrieved_docs, convert_to_tensor=True)
+    cosine_scores = np.dot(doc_embeddings.cpu(), query_embedding.cpu().T)
     readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
     return readable_scores
 def fetch_files_from_s3():
+    """Fetch files from an S3 bucket."""
     s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
     objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
             file_contents.append(file_content)
     return file_contents
+def create_vector_store(all_text):
+    """Create a vector store for similarity-based searching."""
+    embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+    texts = text_splitter.split_text(all_text)
+    if not texts:
+        print("No text chunks created.")
+        return None
+    vector_store = Chroma.from_texts(texts, embeddings, collection_metadata={"hnsw:space": "cosine"}, persist_directory="stores/insurance_cosine")
+    print("Vector DB Successfully Created!")
+    return vector_store
+def load_vector_store():
+    """Load the vector store from the persistent directory."""
+    embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
+    try:
+        db = Chroma(persist_directory="stores/insurance_cosine", embedding_function=embeddings)
+        print("Vector DB Successfully Loaded!")
+        return db
+    except Exception as e:
+        print("Failed to load Vector DB:", e)
+        return None
+def answer_query_with_similarity(query):
+    """Answer a query by finding similar documents and generating responses using a language model."""
+    try:
+        # Load the vector store
+        vector_store = load_vector_store()
+        # If vector store doesn't exist, fetch files from S3, process them, and create the vector store
+        if not vector_store:
+            file_contents = fetch_files_from_s3()
+            if not file_contents:
+                print("No files fetched from S3.")
+                return None
+            all_text = process_files(file_contents)
+            if not all_text.strip():
+                print("No text extracted from documents.")
+                return None
+            vector_store = create_vector_store(all_text)
+            if not vector_store:
+                print("Failed to create Vector DB.")
+                return None
+        # Perform similarity search
+        docs = vector_store.similarity_search(query)
         print(f"\n\nDocuments retrieved: {len(docs)}")
         if not docs:
             return None
         docs_content = [doc.page_content for doc in docs]
+        # Compute cosine similarity scores
         cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
         all_docs_content = " ".join(docs_content)
+        # Generate response using a language model
         template = """
+                ### [INST] Instruction:
+                You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
+                - When someone says hi, or small talk, only respond in a sentence.
+                - Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
+                - Always maintain a positive, friendly, and encouraging tone in your interactions with users.
+                - Strictly write crisp and clear answers, don't write unnecessary stuff.
+                - Only answer the asked question, don't hallucinate or print any pre-information.
+                - After providing the answer, always ask for any other help needed in the next paragraph.
+                - Writing in bullet format is our top preference.
                 Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
+                ### Docs: {docs}
+                ### Question: {question}
                 """
         prompt = PromptTemplate.from_template(template.format(docs=all_docs_content, question=query))
         repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+        llm = HuggingFaceEndpoint(
+            repo_id=repo_id,
+            temperature=0.1,
+            model_kwargs={'token': HUGGINGFACEHUB_API_TOKEN},
+            top_p=0.15,
+            max_new_tokens=256,
+            repetition_penalty=1.1
+        )
         llm_chain = LLMChain(prompt=prompt, llm=llm)
+        answer = llm_chain.run(question=query).strip()
+        print(f"\n\nAnswer: {answer}")
+        return answer
     except Exception as e:
         print("An error occurred while getting the answer: ", str(e))
         return None