Luciferalive committed on
Commit
b85b4f5
·
verified ·
1 Parent(s): 2e3e738

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -58
app.py CHANGED
@@ -2,19 +2,19 @@ import gradio as gr
2
  from langchain.chains import LLMChain
3
  from langchain.prompts import PromptTemplate
4
  from langchain_community.llms import HuggingFaceEndpoint
5
- from pdfminer.high_level import extract_text
6
- import docx2txt
 
7
  import io
8
  import re
 
 
9
  from typing import List
 
10
  from langchain_community.vectorstores import Chroma
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
  from langchain_community.embeddings import SentenceTransformerEmbeddings
13
- from sentence_transformers import SentenceTransformer
14
- from sklearn.metrics.pairwise import cosine_similarity
15
- import numpy as np
16
  import os
17
- import boto3
18
 
19
  # AWS access credentials
20
  access_key = os.getenv("ACCESS_KEY")
@@ -27,39 +27,52 @@ prefix = os.getenv("PREFIX")
27
  HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
28
 
29
  def extract_text_from_pdf(pdf_content):
30
- return extract_text(io.BytesIO(pdf_content))
31
-
32
- def extract_text_from_doc(doc_content):
33
- return docx2txt.process(io.BytesIO(doc_content))
 
 
 
 
 
 
 
 
34
 
35
  def preprocess_text(text):
36
- text = text.replace('\n', ' ').replace('\r', ' ')
37
- text = re.sub(r'[^\x00-\x7F]+', ' ', text)
38
- text = text.lower()
39
- text = re.sub(r'[^\w\s]', '', text)
40
- text = re.sub(r'\s+', ' ', text).strip()
41
- return text
 
 
 
 
 
42
 
43
  def process_files(file_contents: List[bytes]):
 
44
  all_text = ""
45
  for file_content in file_contents:
46
- if file_content.startswith(b'%PDF'):
47
- extracted_text = extract_text_from_pdf(file_content)
48
- else:
49
- extracted_text = extract_text_from_doc(file_content)
50
  preprocessed_text = preprocess_text(extracted_text)
51
  all_text += preprocessed_text + " "
52
  return all_text
53
 
54
  def compute_cosine_similarity_scores(query, retrieved_docs):
 
55
  model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
56
  query_embedding = model.encode(query, convert_to_tensor=True)
57
  doc_embeddings = model.encode(retrieved_docs, convert_to_tensor=True)
58
- cosine_scores = np.dot(doc_embeddings, query_embedding.T)
59
  readable_scores = [{"doc": doc, "score": float(score)} for doc, score in zip(retrieved_docs, cosine_scores.flatten())]
60
  return readable_scores
61
 
62
  def fetch_files_from_s3():
 
63
  s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
64
  objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
65
 
@@ -71,23 +84,55 @@ def fetch_files_from_s3():
71
  file_contents.append(file_content)
72
  return file_contents
73
 
74
- def answer_query_with_similarity(query):
75
- try:
76
- # Fetch files from S3
77
- file_contents = fetch_files_from_s3()
78
-
79
- all_text = process_files(file_contents)
 
 
80
 
81
- embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
82
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
83
- texts = text_splitter.split_text(all_text)
84
 
85
- vector_store = Chroma.from_texts(texts, embeddings, collection_metadata={"hnsw:space": "cosine"}, persist_directory="stores/insurance_cosine")
86
- load_vector_store = Chroma(persist_directory="stores/insurance_cosine", embedding_function=embeddings)
87
- print("Vector DB Successfully Created!")
 
 
 
 
 
 
 
88
 
89
- db3 = Chroma(persist_directory=f"stores/insurance_cosine", embedding_function=embeddings)
90
- docs = db3.similarity_search(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  print(f"\n\nDocuments retrieved: {len(docs)}")
92
 
93
  if not docs:
@@ -95,43 +140,44 @@ def answer_query_with_similarity(query):
95
  return None
96
 
97
  docs_content = [doc.page_content for doc in docs]
98
- for i, content in enumerate(docs_content, start=1):
99
- print(f"\nDocument {i}: {content}...")
100
 
 
101
  cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
102
- for score in cosine_similarity_scores:
103
- print(f"\nDocument Score: {score['score']}")
104
 
105
  all_docs_content = " ".join(docs_content)
106
 
 
107
  template = """
108
- ### [INST] Instruction:You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
109
- When someone say hi, or small talk, o only response in a sentence.
110
- Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
111
- Always maintain a positive, friendly, and encouraging tone in your interactions with users.
112
- Strictly write the crisp and clear answers, dont write unnecesary stuff.
113
- Only answer to the asked question, don't hellucinate of print any pre information.
114
- After providing the answer, always ask a for any other help needed in the next paragraph
115
- Writing in the bullet format is our top preference
 
116
  Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
117
- ### Docs : {docs}
118
- ### Question : {question}
119
  """
120
  prompt = PromptTemplate.from_template(template.format(docs=all_docs_content, question=query))
121
 
122
  repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
123
- llm = HuggingFaceEndpoint(repo_id=repo_id, temperature=0.1, token=HUGGINGFACEHUB_API_TOKEN,
124
- top_p=0.15,
125
- max_new_tokens=256,
126
- repetition_penalty=1.1
127
- )
 
 
 
128
  llm_chain = LLMChain(prompt=prompt, llm=llm)
129
 
130
- answer = llm_chain.run(question=query)
131
- cleaned_answer = answer.split("Answer:")[-1].strip()
132
- print(f"\n\nAnswer: {cleaned_answer}")
133
 
134
- return cleaned_answer
135
  except Exception as e:
136
  print("An error occurred while getting the answer: ", str(e))
137
  return None
 
2
  from langchain.chains import LLMChain
3
  from langchain.prompts import PromptTemplate
4
  from langchain_community.llms import HuggingFaceEndpoint
5
+ import fitz # PyMuPDF
6
+ import pytesseract
7
+ from PIL import Image
8
  import io
9
  import re
10
+ import numpy as np
11
+ import boto3
12
  from typing import List
13
+ from sentence_transformers import SentenceTransformer
14
  from langchain_community.vectorstores import Chroma
15
  from langchain.text_splitter import RecursiveCharacterTextSplitter
16
  from langchain_community.embeddings import SentenceTransformerEmbeddings
 
 
 
17
  import os
 
18
 
19
  # AWS access credentials
20
  access_key = os.getenv("ACCESS_KEY")
 
27
  HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
28
 
29
def extract_text_from_pdf(pdf_content):
    """Extract text from PDF bytes by rendering each page and running OCR on it.

    Args:
        pdf_content: Raw bytes of a PDF document.

    Returns:
        The OCR output of every page concatenated in page order, or an empty
        string if opening, rendering or recognising the document fails.
    """
    try:
        page_texts = []
        # The context manager closes the document even when a page fails to
        # render or OCR raises, so the underlying handle is never leaked.
        with fitz.open(stream=pdf_content, filetype="pdf") as doc:
            for page in doc:
                pix = page.get_pixmap()
                # Serialise the rendered page so PIL can decode it for tesseract.
                img = Image.open(io.BytesIO(pix.tobytes()))
                page_texts.append(pytesseract.image_to_string(img))
        return "".join(page_texts)
    except Exception as e:
        # Best-effort by design: one unreadable file must not abort the whole batch.
        print("Failed to extract text from PDF:", e)
        return ""
42
 
43
def preprocess_text(text):
    """Normalise raw extracted text into a lowercase, punctuation-free single line.

    Line breaks become spaces, runs of non-ASCII characters become a single
    space, the text is lowercased, every character that is neither a word
    character nor whitespace is dropped, and whitespace runs are collapsed.

    Returns:
        The cleaned string, or an empty string if cleaning raises.
    """
    try:
        single_line = text.replace('\n', ' ').replace('\r', ' ')
        ascii_lowered = re.sub(r'[^\x00-\x7F]+', ' ', single_line).lower()
        without_punctuation = re.sub(r'[^\w\s]', '', ascii_lowered)
        return re.sub(r'\s+', ' ', without_punctuation).strip()
    except Exception as e:
        print("Failed to preprocess text:", e)
        return ""
55
 
56
def process_files(file_contents: List[bytes]):
    """Extract and clean the text of every file, then concatenate the results.

    Each file's cleaned text is followed by a single space, so the returned
    string ends with a space whenever at least one file was processed and is
    empty when ``file_contents`` is empty.
    """
    cleaned_parts = [
        preprocess_text(extract_text_from_pdf(raw_bytes)) + " "
        for raw_bytes in file_contents
    ]
    return "".join(cleaned_parts)
64
 
65
_SIMILARITY_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
_similarity_model_cache = {}


def _get_similarity_model():
    """Return the shared SentenceTransformer, loading it only on first use."""
    if _SIMILARITY_MODEL_NAME not in _similarity_model_cache:
        _similarity_model_cache[_SIMILARITY_MODEL_NAME] = SentenceTransformer(_SIMILARITY_MODEL_NAME)
    return _similarity_model_cache[_SIMILARITY_MODEL_NAME]


def compute_cosine_similarity_scores(query, retrieved_docs):
    """Compute cosine similarity scores between a query and retrieved documents.

    Args:
        query: The query string.
        retrieved_docs: A sequence of document strings to score against the query.

    Returns:
        A list of ``{"doc": <document>, "score": <float>}`` dicts in the same
        order as ``retrieved_docs``; an empty list when there are no documents.
    """
    docs = list(retrieved_docs) if retrieved_docs else []
    if not docs:
        # Nothing to score; also avoids loading the model for no reason.
        return []

    model = _get_similarity_model()
    # Unit-length embeddings make the dot product below a true cosine
    # similarity regardless of whether the model normalises its own output.
    query_embedding = model.encode(query, normalize_embeddings=True)
    doc_embeddings = model.encode(docs, normalize_embeddings=True)
    # (n_docs, dim) . (dim,) -> (n_docs,)
    cosine_scores = np.dot(doc_embeddings, query_embedding)
    return [{"doc": doc, "score": float(score)} for doc, score in zip(docs, cosine_scores)]
73
 
74
  def fetch_files_from_s3():
75
+ """Fetch files from an S3 bucket."""
76
  s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)
77
  objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
78
 
 
84
  file_contents.append(file_content)
85
  return file_contents
86
 
87
def create_vector_store(all_text):
    """Create a persistent Chroma vector store for similarity-based searching.

    Args:
        all_text: The combined, preprocessed text of all source documents.

    Returns:
        The populated Chroma store persisted under ``stores/insurance_cosine``,
        or ``None`` when there is no text to index.
    """
    # Bail out before paying for the splitter and the embedding model load
    # when there is clearly nothing to index.
    if not all_text or not all_text.strip():
        print("No text chunks created.")
        return None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(all_text)
    if not texts:
        print("No text chunks created.")
        return None

    embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    vector_store = Chroma.from_texts(
        texts,
        embeddings,
        collection_metadata={"hnsw:space": "cosine"},
        persist_directory="stores/insurance_cosine",
    )
    print("Vector DB Successfully Created!")
    return vector_store
99
 
100
def load_vector_store():
    """Load the vector store from the persistent directory.

    Returns:
        The Chroma store backed by ``stores/insurance_cosine``, or ``None``
        when that directory does not exist, the stored collection is empty,
        or opening it fails. Callers treat ``None`` as "build the store".
    """
    persist_directory = "stores/insurance_cosine"

    # Constructing Chroma on a missing directory does not raise, so without
    # this check the caller would never see None and never build the store.
    if not os.path.isdir(persist_directory):
        print("Failed to load Vector DB:", f"'{persist_directory}' does not exist")
        return None

    embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    try:
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
        # A directory left behind by an earlier empty instantiation holds no
        # documents; report it as missing so it gets populated.
        if not db.get(limit=1)["ids"]:
            print("Failed to load Vector DB:", "collection is empty")
            return None
        print("Vector DB Successfully Loaded!")
        return db
    except Exception as e:
        print("Failed to load Vector DB:", e)
        return None
110
 
111
+ def answer_query_with_similarity(query):
112
+ """Answer a query by finding similar documents and generating responses using a language model."""
113
+ try:
114
+ # Load the vector store
115
+ vector_store = load_vector_store()
116
+
117
+ # If vector store doesn't exist, fetch files from S3, process them, and create the vector store
118
+ if not vector_store:
119
+ file_contents = fetch_files_from_s3()
120
+ if not file_contents:
121
+ print("No files fetched from S3.")
122
+ return None
123
+
124
+ all_text = process_files(file_contents)
125
+ if not all_text.strip():
126
+ print("No text extracted from documents.")
127
+ return None
128
+
129
+ vector_store = create_vector_store(all_text)
130
+ if not vector_store:
131
+ print("Failed to create Vector DB.")
132
+ return None
133
+
134
+ # Perform similarity search
135
+ docs = vector_store.similarity_search(query)
136
  print(f"\n\nDocuments retrieved: {len(docs)}")
137
 
138
  if not docs:
 
140
  return None
141
 
142
  docs_content = [doc.page_content for doc in docs]
 
 
143
 
144
+ # Compute cosine similarity scores
145
  cosine_similarity_scores = compute_cosine_similarity_scores(query, docs_content)
 
 
146
 
147
  all_docs_content = " ".join(docs_content)
148
 
149
+ # Generate response using a language model
150
  template = """
151
+ ### [INST] Instruction:
152
+ You are an AI assistant named Goose. Your purpose is to provide accurate, relevant, and helpful information to users in a friendly, warm, and supportive manner, similar to ChatGPT. When responding to queries, please keep the following guidelines in mind:
153
+ - When someone says hi, or small talk, only respond in a sentence.
154
+ - Retrieve relevant information from your knowledge base to formulate accurate and informative responses.
155
+ - Always maintain a positive, friendly, and encouraging tone in your interactions with users.
156
+ - Strictly write crisp and clear answers, don't write unnecessary stuff.
157
+ - Only answer the asked question, don't hallucinate or print any pre-information.
158
+ - After providing the answer, always ask for any other help needed in the next paragraph.
159
+ - Writing in bullet format is our top preference.
160
  Remember, your goal is to be a reliable, friendly, and supportive AI assistant that provides accurate information while creating a positive user experience, just like ChatGPT. Adapt your communication style to best suit each user's needs and preferences.
161
+ ### Docs: {docs}
162
+ ### Question: {question}
163
  """
164
  prompt = PromptTemplate.from_template(template.format(docs=all_docs_content, question=query))
165
 
166
  repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
167
+ llm = HuggingFaceEndpoint(
168
+ repo_id=repo_id,
169
+ temperature=0.1,
170
+ model_kwargs={'token': HUGGINGFACEHUB_API_TOKEN},
171
+ top_p=0.15,
172
+ max_new_tokens=256,
173
+ repetition_penalty=1.1
174
+ )
175
  llm_chain = LLMChain(prompt=prompt, llm=llm)
176
 
177
+ answer = llm_chain.run(question=query).strip()
178
+ print(f"\n\nAnswer: {answer}")
 
179
 
180
+ return answer
181
  except Exception as e:
182
  print("An error occurred while getting the answer: ", str(e))
183
  return None