File size: 6,714 Bytes
806e7bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import streamlit as st
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceEndpoint
from pdfminer.high_level import extract_text
import docx2txt
import io
import re
from typing import List
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
import boto3

# AWS access credentials
# SECURITY: secrets must never be committed to source control. The key pair is
# read from the standard AWS environment variables instead. When a variable is
# unset the value is None, and boto3 then falls back to its default credential
# chain (shared credentials file, instance/role credentials, ...).
# NOTE(review): any key pair that was previously committed in this file should
# be treated as compromised and rotated.
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")

# S3 bucket details
bucket_name = 'sentinelx-prod'
prefix = 'LOTO/Documents/LOTOFormDocuments/'

# Hugging Face Inference API token, supplied through the environment.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

def extract_text_from_pdf(pdf_content):
    """Return the text pdfminer extracts from a PDF given as raw bytes.

    Args:
        pdf_content: The bytes of the PDF file.

    Returns:
        Whatever ``pdfminer.high_level.extract_text`` returns for the document.
    """
    # pdfminer expects a file-like object, so wrap the bytes in memory.
    pdf_stream = io.BytesIO(pdf_content)
    return extract_text(pdf_stream)

def extract_text_from_doc(doc_content):
    """Return the text docx2txt extracts from a document given as raw bytes.

    Args:
        doc_content: The bytes of the document file.

    Returns:
        Whatever ``docx2txt.process`` returns for the document.
    """
    # docx2txt is handed an in-memory file-like object instead of a path.
    doc_stream = io.BytesIO(doc_content)
    return docx2txt.process(doc_stream)

def preprocess_text(text):
    """Normalize extracted text to lowercase ASCII words separated by single spaces.

    The steps run in this order:
      1. line feeds and carriage returns become spaces;
      2. each run of non-ASCII characters becomes one space;
      3. the text is lowercased;
      4. every character that is neither a word character nor whitespace is removed;
      5. whitespace runs collapse to one space and the ends are trimmed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    line_breaks_to_spaces = str.maketrans({'\n': ' ', '\r': ' '})
    flattened = text.translate(line_breaks_to_spaces)

    # Non-ASCII is dropped before lowercasing, so lower() only ever sees ASCII.
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', flattened)
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_only.lower())

    return re.sub(r'\s+', ' ', without_punctuation).strip()

def process_files(file_contents: List[bytes]):
    """Extract, clean and concatenate the text of several in-memory files.

    A file whose bytes start with ``%PDF`` goes through ``extract_text_from_pdf``;
    every other file goes through ``extract_text_from_doc``. Each extracted text is
    cleaned with ``preprocess_text``.

    Args:
        file_contents: Raw bytes of each file, in the order they should appear.

    Returns:
        The cleaned texts joined together, each followed by a single space
        (an empty string when no files are given).
    """
    def _extract(raw_bytes):
        # Dispatch on the PDF magic header; anything else is treated as a doc file.
        if raw_bytes.startswith(b'%PDF'):
            return extract_text_from_pdf(raw_bytes)
        return extract_text_from_doc(raw_bytes)

    return "".join(
        preprocess_text(_extract(raw_bytes)) + " "
        for raw_bytes in file_contents
    )

def compute_cosine_similarity_scores(query, retrieved_docs):
    """Score each retrieved document against the query by cosine similarity.

    Both the query and the documents are embedded with the
    ``sentence-transformers/all-mpnet-base-v2`` model. The similarity is the dot
    product divided by the product of the vector norms, so the result is a true
    cosine similarity even if the embeddings are not unit-length.

    Args:
        query: The query string.
        retrieved_docs: A list of document strings to score.

    Returns:
        A list of ``{"doc": <document text>, "score": <float>}`` dicts, one per
        document and in the same order as ``retrieved_docs``. An empty list is
        returned when there are no documents.
    """
    if not retrieved_docs:
        # Nothing to score; avoid loading the model for no reason.
        return []

    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    # Request plain NumPy output (the encode() default) so the math below does not
    # depend on implicit torch-tensor-to-array conversion, which fails for
    # tensors that live on a GPU.
    query_embedding = np.asarray(model.encode(query), dtype=float).reshape(-1)
    doc_embeddings = np.atleast_2d(np.asarray(model.encode(retrieved_docs), dtype=float))

    dot_products = doc_embeddings @ query_embedding
    norm_products = np.linalg.norm(doc_embeddings, axis=1) * np.linalg.norm(query_embedding)
    # Guard against zero-length vectors: their similarity is reported as 0.0.
    cosine_scores = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    return [
        {"doc": doc, "score": float(score)}
        for doc, score in zip(retrieved_docs, cosine_scores)
    ]

def _download_s3_documents():
    """Download every object under ``prefix`` in ``bucket_name`` and return the raw bytes of each."""
    s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so larger prefixes are not silently truncated.
    paginator = s3.get_paginator('list_objects_v2')

    file_contents = []
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in page.get('Contents', []):
            if obj['Key'].endswith('/'):  # Skip directories
                continue
            response = s3.get_object(Bucket=bucket_name, Key=obj['Key'])
            file_contents.append(response['Body'].read())
    return file_contents


def answer_query_with_similarity(query):
    """Answer a query from the S3 documents using retrieval plus an LLM.

    The function downloads the documents from S3, extracts and cleans their text,
    splits it into chunks, indexes the chunks in a persistent Chroma store,
    retrieves the chunks most similar to the query, and asks the
    ``meta-llama/Meta-Llama-3-8B-Instruct`` endpoint to answer from them.

    Args:
        query: The user's question.

    Returns:
        The answer text (the part after the last ``"Answer:"`` marker, stripped),
        or ``None`` when there is no text to index, no chunk matches, or any
        error occurs. Errors are printed rather than raised.
    """
    import hashlib

    try:
        # Fetch files from S3
        file_contents = _download_s3_documents()
        all_text = process_files(file_contents)

        embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        # Drop repeated chunks (order preserved): ids below are derived from the
        # chunk text, and ids must be unique within one insert batch.
        texts = list(dict.fromkeys(text_splitter.split_text(all_text)))
        if not texts:
            print("No document text available to index.")
            return None

        # Content-derived ids make re-indexing idempotent: the store is persistent
        # and this function runs on every query, so without stable ids each call
        # would append another copy of every chunk and duplicates would crowd
        # the top-k results.
        chunk_ids = [hashlib.sha256(chunk.encode("utf-8")).hexdigest() for chunk in texts]
        vector_store = Chroma.from_texts(
            texts,
            embeddings,
            ids=chunk_ids,
            collection_metadata={"hnsw:space": "cosine"},
            persist_directory="stores/insurance_cosine",
        )
        print("Vector DB Successfully Created!")

        # Query the store that was just built instead of re-opening it from disk.
        docs = vector_store.similarity_search(query)
        print(f"\n\nDocuments retrieved: {len(docs)}")

        if not docs:
            print("No documents match the query.")
            return None

        docs_content = [doc.page_content for doc in docs]
        for i, content in enumerate(docs_content, start=1):
            print(f"\nDocument {i}: {content}...")

        # Scores are only logged for diagnostics; they do not affect the answer.
        cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
        for score in cosine_similarity_scores:
            print(f"\nDocument Score: {score['score']}")

        all_docs_content = " ".join(docs_content)

        template = """
                ### [INST] Instruction:You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
                When someone say hi, or small talk, o only response in a sentence.
                Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
                Always maintain a positive, friendly, and encouraging tone in your interactions with users.
                Strictly write the crisp and clear answers, dont write unnecesary stuff.
                Only answer to the asked question, don't hellucinate of print any pre information.
                After providing the answer, always ask a for any other help needed in the next paragraph
                Writing in the bullet format is our top preference
                Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
                ### Docs : {docs}
                ### Question : {question}
                """
        # Let PromptTemplate substitute the variables at run time. Pre-formatting
        # the string and then parsing it as a template would treat any "{" or "}"
        # in the query or documents as template syntax and fail.
        prompt = PromptTemplate.from_template(template)

        repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
        # The endpoint's token field is `huggingfacehub_api_token`; an unknown
        # keyword such as `token` is not treated as the credential.
        llm = HuggingFaceEndpoint(repo_id=repo_id, temperature=0.1,
                                  huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
                                  top_p=0.15,
                                  max_new_tokens=256,
                                  repetition_penalty=1.1
                                  )
        llm_chain = LLMChain(prompt=prompt, llm=llm)

        answer = llm_chain.run(docs=all_docs_content, question=query)
        # Keep only what follows the last "Answer:" marker, if the model emitted one.
        cleaned_answer = answer.split("Answer:")[-1].strip()
        print(f"\n\nAnswer: {cleaned_answer}")

        return cleaned_answer
    except Exception as e:
        # Top-level boundary for the UI: report the failure and signal "no answer".
        print("An error occurred while getting the answer: ", str(e))
        return None

def main():
    """Render the Streamlit page: a query box, a button, and the resulting answer.

    When the button is pressed with a non-empty query, the query is passed to
    ``answer_query_with_similarity`` and the answer (or a fallback message when
    none is returned) is written to the page. An empty query prompts the user
    to enter one.
    """
    st.title("Document Query App")
    user_query = st.text_input("Enter your query:")

    # Nothing to do until the user presses the button.
    if not st.button("Get Answer"):
        return

    if not user_query:
        st.write("Please provide a query.")
        return

    result = answer_query_with_similarity(user_query)
    if result:
        st.write("Answer:", result)
    else:
        st.write("No answer found.")


# Run the app only when executed as a script, not when imported.
if __name__ == "__main__":
    main()