K00B404 committed on
Commit
0061c9d
·
verified ·
1 Parent(s): 8d1e832

Upload 9 files

Browse files
ChatBotApp.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from llm_chatbot import LLMChatBot
3
+ from streamlit_option_menu import option_menu
4
+ import speech_recognition as sr
5
+ import pyttsx3
6
+ import os
7
+ import getpass
8
+ from uuid import uuid4
9
+ import faiss
10
+ import numpy as np
11
+ import requests
12
+ import io
13
+ import warnings
14
+ import torch
15
+ import pickle
16
+ import asyncio
17
+ import json
18
+ from git import Repo
19
+ from rich import print as rp
20
+ from typing import Union, List, Generator, Any, Mapping, Optional, Dict
21
+ from requests.sessions import RequestsCookieJar
22
+ from dotenv import load_dotenv, find_dotenv
23
+ from langchain import hub
24
+ from langchain_core.documents import Document
25
+ from langchain.chains.combine_documents import create_stuff_documents_chain
26
+ from langchain.chains import create_retrieval_chain
27
+ from langchain_community.document_loaders import DirectoryLoader
28
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
29
+ from langchain_huggingface import HuggingFaceEmbeddings
30
+ from langchain_community.vectorstores import Chroma, FAISS
31
+ from langchain.vectorstores.base import VectorStore
32
+ from langchain.retrievers import MultiQueryRetriever
33
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
34
+ from langchain.llms import BaseLLM
35
+ from langchain.retrievers import ContextualCompressionRetriever
36
+ from langchain.retrievers.document_compressors import LLMChainExtractor
37
+ from langchain.retrievers.document_compressors import DocumentCompressorPipeline
38
+ from langchain_community.document_transformers import EmbeddingsRedundantFilter
39
+ from langchain_text_splitters import CharacterTextSplitter
40
+ from langchain.retrievers.document_compressors import EmbeddingsFilter
41
+ from langchain.memory.buffer import ConversationBufferMemory
42
+ from langchain.chains import StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
43
+ from uber_toolkit_class import UberToolkit
44
+ from glob import glob
45
+ import numpy as np
46
+ import pandas as pd
47
+ import plotly.graph_objects as go
48
+ import plotly.express as px
49
+ from plotly.subplots import make_subplots
50
+ import plotly.io as pio
51
+ from sklearn.decomposition import PCA
52
+ from sklearn.preprocessing import MinMaxScaler
53
+ from langchain_core.documents import Document
54
+ from scipy.stats import gaussian_kde
55
+ from huggingface_hub import InferenceClient
56
+ from hugchat import hugchat
57
+ from hugchat.login import Login
58
+ from hugchat.message import Message
59
+ from hugchat.types.assistant import Assistant
60
+ from hugchat.types.model import Model
61
+ from hugchat.types.message import MessageNode, Conversation
62
+ from langchain_community.document_loaders import TextLoader
63
+ from TTS.api import TTS
64
+ import time
65
+ from playsound import playsound
66
+ from system_prompts import __all__ as prompts
67
+
68
+ from profiler import VoiceProfileManager, VoiceProfile
69
+
70
+ # Load environment variables
71
+ load_dotenv(find_dotenv())
72
+
73
class ChatbotApp:
    """Streamlit chatbot UI backed by an LLM and a FAISS retrieval store.

    The app has two pages: a chat page, where context can be injected from an
    uploaded file or a cloned Git repository, and a dashboard page that plots
    the stored embeddings.
    """

    # Sentinel returned by speech_to_text() when nothing usable was heard.
    SPEECH_FAILED = "Sorry, I didn't catch that."

    def __init__(self, email, password, default_llm=1):
        """Store the credentials and build the sentence-embedding model.

        Args:
            email: Account e-mail handed to ``LLMChatBot``.
            password: Account password handed to ``LLMChatBot``.
            default_llm: Model index handed to ``LLMChatBot``.
        """
        self.email = email
        self.password = password
        self.default_llm = default_llm
        self.embeddings = self.setup_embeddings()
        self.vectorstore = None
        # Filled in lazily by other methods; defined up front so that reading
        # them never raises AttributeError.
        self.llm = None
        self.retriever = None
        self.engine = None

    @staticmethod
    def _as_documents(docs):
        """Wrap plain strings in ``Document``; pass other items through unchanged."""
        return [
            Document(page_content=item) if isinstance(item, str) else item
            for item in docs
        ]

    def _split_to_texts(self, docs, chunk_size=1000, chunk_overlap=0):
        """Split strings/documents into chunks and return the chunk texts."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        split_documents = text_splitter.split_documents(self._as_documents(docs))
        return [doc.page_content for doc in split_documents]

    def create_vectorstore_from_github(self):
        """Clone the placeholder repository and index its files in ``self.vectorstore``."""
        repo_url = "YOUR_REPO_URL"
        local_repo_path = self.clone_github_repo(repo_url)
        loader = DirectoryLoader(
            path=local_repo_path, glob="**/*", show_progress=True, recursive=True
        )
        # The loader already yields Document objects; wrapping them again in
        # Document(page_content=...) fails because page_content must be text.
        texts = self._split_to_texts(loader.load())
        print(f"Texts for embedding: {texts}")  # Debug print
        if not texts:
            print("No valid texts found for embedding. Check your repository content.")
            return
        self.vectorstore = FAISS.from_texts(texts, self.embeddings)

    def create_vectorstore(self, docs):
        """Build and return a FAISS store from strings or documents.

        Unlike ``create_vector_store`` this does not touch ``self.vectorstore``.
        """
        texts = self._split_to_texts(docs)
        # Reuse the already loaded embedding model instead of loading a new one.
        return FAISS.from_texts(texts, self.embeddings)

    def setup_session_state(self):
        """Create every ``st.session_state`` key the pages read, if missing."""
        defaults = {
            'chat_history': [],
            'voice_mode': False,
            'vectorstore': None,
            'retriever': None,
            'compression_retriever': None,
            'conversation': None,
        }
        for key, value in defaults.items():
            if key not in st.session_state:
                st.session_state[key] = value

    def text_to_speech(self, text):
        """Speak ``text`` through pyttsx3, creating the engine on first use."""
        if getattr(self, 'engine', None) is None:
            self.engine = pyttsx3.init()
        self.engine.say(text)
        self.engine.runAndWait()

    def speech_to_text(self):
        """Record from the microphone and return the recognised text.

        Returns:
            The transcript, or ``SPEECH_FAILED`` when the audio could not be
            understood or the recognition service could not be reached.
        """
        recognizer = sr.Recognizer()
        with sr.Microphone() as source:
            st.write("Listening...")
            audio = recognizer.listen(source)
        try:
            return recognizer.recognize_google(audio)
        except (sr.UnknownValueError, sr.RequestError):
            # Only recognition failures are turned into the sentinel; anything
            # else (e.g. KeyboardInterrupt) propagates.
            return self.SPEECH_FAILED

    def setup_embeddings(self):
        """Return a CPU MiniLM sentence-embedding model with normalised vectors."""
        return HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

    def create_vector_store(self, docs):
        """Index ``docs`` in ``self.vectorstore`` and return the store.

        Args:
            docs: Iterable of plain strings and/or ``Document`` objects.

        Returns:
            The new FAISS store, or ``None`` when there was nothing to embed
            or the store could not be built.
        """
        texts = self._split_to_texts(docs)
        print(f"Texts: {texts}")  # Debug print
        if not texts:
            print("No valid texts found for embedding. Check your repository content.")
            return None

        try:
            self.vectorstore = FAISS.from_texts(texts, self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {str(e)}")
            return None
        print("Vector store created successfully")
        return self.vectorstore

    def setup_retriever(self, k=5, similarity_threshold=0.76):
        """Build the plain retriever and the compression retriever.

        Args:
            k: Number of chunks the base retriever should return.
            similarity_threshold: Minimum similarity kept by the relevance filter.

        Raises:
            ValueError: If no vectorstore is stored in the session state.
        """
        vectorstore = st.session_state.get("vectorstore")
        if vectorstore is None:
            raise ValueError("Vectorstore is not initialized.")
        # Search parameters are handed to the retriever through search_kwargs.
        self.retriever = vectorstore.as_retriever(search_kwargs={"k": k})
        splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
        redundant_filter = EmbeddingsRedundantFilter(embeddings=self.embeddings)
        relevant_filter = EmbeddingsFilter(
            embeddings=self.embeddings, similarity_threshold=similarity_threshold
        )
        pipeline_compressor = DocumentCompressorPipeline(
            transformers=[splitter, redundant_filter, relevant_filter]
        )
        st.session_state.compression_retriever = ContextualCompressionRetriever(
            base_compressor=pipeline_compressor, base_retriever=self.retriever
        )

    def create_retrieval_chain(self):
        """Create the high (compressed) and low (plain) retrieval chains.

        Raises:
            RuntimeError: If the LLM or the retriever has not been set up yet.
        """
        if self.llm is None or self.retriever is None:
            raise RuntimeError(
                "Call get_conversation_chain() and setup_retriever() first."
            )
        rag_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
        combine_docs_chain = create_stuff_documents_chain(self.llm, rag_prompt)
        self.high_retrieval_chain = create_retrieval_chain(
            st.session_state.compression_retriever, combine_docs_chain
        )
        self.low_retrieval_chain = create_retrieval_chain(self.retriever, combine_docs_chain)

    def setup_tts(self, model_name="tts_models/en/ljspeech/fast_pitch"):
        """Load the Coqui TTS model into ``self.tts``."""
        self.tts = TTS(
            model_name=model_name,
            progress_bar=False,
            vocoder_path='vocoder_models/en/ljspeech/univnet',
        )

    def setup_speech_recognition(self):
        """Create the shared speech recogniser in ``self.recognizer``."""
        self.recognizer = sr.Recognizer()

    def setup_folders(self):
        """Create the working directories used by the app."""
        self.dirs = ["test_input", "vectorstore", "test"]
        for d in self.dirs:
            os.makedirs(d, exist_ok=True)

    def send_message(self, message, web=False):
        """Send ``message`` to the LLM and block until the full reply is ready."""
        message_result = self.llm.chat(message, web_search=web)
        return message_result.wait_until_done()

    def stream_response(self, message, web=False, stream=False):
        """Query the LLM and return the concatenated response tokens."""
        responses = []
        for resp in self.llm.query(message, stream=stream, web_search=web):
            if resp is None:
                continue
            responses.append(resp['token'])
        # Tokens are joined without a separator so that no extra spaces are
        # inserted between them.
        return ''.join(responses)

    def web_search(self, text):
        """Ask the LLM with web search enabled and return the reply."""
        return self.send_message(text, web=True)

    def retrieve_context(self, query: str):
        """Return the text of all chunks found by both retrievers, newline-joined."""
        # Use the public retriever API; the underscore-prefixed variant is the
        # retriever's internal hook.
        lowres = self.retriever.get_relevant_documents(query)
        highres = st.session_state.compression_retriever.get_relevant_documents(query)
        return "\n".join(doc.page_content for doc in lowres + highres)

    def get_conversation_chain(self):
        """Log in to the LLM and build a conversational retrieval chain.

        Raises:
            ValueError: If no vectorstore is stored in the session state.
        """
        vectorstore = st.session_state.get("vectorstore")
        if vectorstore is None:
            raise ValueError("Vectorstore is not initialized.")

        # Prefer the credentials given to the constructor; fall back to the
        # environment variables.
        email = self.email or os.getenv("EMAIL")
        password = self.password or os.getenv("PASSWD")
        self.llm = LLMChatBot(email, password, default_llm=self.default_llm)
        self.llm.create_new_conversation(
            system_prompt=self.llm.default_system_prompt, switch_to=True
        )

        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        return ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=vectorstore.as_retriever(),
            memory=memory
        )

    async def handle_user_input(self, user_input):
        """Run one chat turn and render the whole conversation.

        In voice mode only the newest AI reply is spoken, so earlier replies
        are not read out again on every turn.
        """
        conversation = st.session_state.get("conversation")
        if conversation is None:
            st.warning("Please upload a file or enter a repository URL before chatting.")
            return

        response = conversation({'question': user_input})
        st.session_state.chat_history = response['chat_history']

        history = st.session_state.chat_history
        last_index = len(history) - 1
        for i, message in enumerate(history):
            if i % 2 == 0:
                st.write(f"Human: {message.content}")
            else:
                st.write(f"AI: {message.content}")
                if st.session_state.voice_mode and i == last_index:
                    self.text_to_speech(message.content)

    def clone_github_repo(self, repo_url, local_path='./repo'):
        """Clone ``repo_url`` into ``local_path`` unless that path already exists.

        NOTE(review): an existing ``local_path`` is reused as-is even if it
        holds a different repository - confirm this is intended.
        """
        if os.path.exists(local_path):
            st.write("Repository already cloned.")
            return local_path
        Repo.clone_from(repo_url, local_path)
        return local_path

    @staticmethod
    def glob_recursive_multiple_extensions(base_dir, extensions):
        """Return every file below ``base_dir`` whose extension is in ``extensions``.

        Args:
            base_dir: Directory to search recursively.
            extensions: Extensions without the leading dot, e.g. ``["py", "md"]``.
        """
        all_files = []
        for ext in extensions:
            pattern = os.path.join(base_dir, '**', f'*.{ext}')
            all_files.extend(glob(pattern, recursive=True))
        return all_files

    def load_documents_from_github(self, repo_url, file_types=('*.py', '*.md', '*.txt', '*.html')):
        """Clone a repository and load the matching files as documents.

        Args:
            repo_url: URL of the Git repository.
            file_types: Glob patterns for the file names to load.

        Returns:
            The loaded ``Document`` objects, returned as the loader produced
            them so that ``split_documents`` can read each one's
            ``metadata["source"]``.
        """
        local_repo_path = self.clone_github_repo(repo_url)
        documents = []
        # One loader per pattern: Python globbing has no "{a,b}" brace
        # expansion, so a combined pattern would match nothing.
        for pattern in file_types:
            globber = f"**/{pattern}"
            rp(globber)
            loader = DirectoryLoader(
                path=local_repo_path,
                glob=globber,
                show_progress=True,
                recursive=True,
                loader_cls=TextLoader,
            )
            documents.extend(loader.load())
        st.write(f"Nr. files loaded: {len(documents)}")
        print(f"Loaded files: {len(documents)}")  # Debug print
        return documents

    def split_documents(self, documents, chunk_s=512, chunk_o=0):
        """Split documents with a splitter chosen from each file's extension.

        Args:
            documents: ``Document`` objects; the extension is taken from
                ``metadata["source"]``.
            chunk_s: Chunk size.
            chunk_o: Chunk overlap.

        Returns:
            ``(split_docs, splitter)`` where ``splitter`` is the splitter used
            for the last document (``None`` for empty input).
        """
        languages = {
            '.py': Language.PYTHON,
            '.md': Language.MARKDOWN,
            '.markdown': Language.MARKDOWN,
            '.html': Language.HTML,
            '.htm': Language.HTML,
        }
        splitters = {}  # one splitter per extension, built on first use
        split_docs = []
        splitter = None
        for doc in documents:
            source = (getattr(doc, 'metadata', None) or {}).get('source') or ''
            ext = os.path.splitext(source)[1].lower()
            splitter = splitters.get(ext)
            if splitter is None:
                language = languages.get(ext)
                if language is None:
                    splitter = RecursiveCharacterTextSplitter(
                        chunk_size=chunk_s, chunk_overlap=chunk_o
                    )
                else:
                    splitter = RecursiveCharacterTextSplitter.from_language(
                        language=language, chunk_size=chunk_s, chunk_overlap=chunk_o
                    )
                splitters[ext] = splitter
            split_docs.extend(splitter.split_documents([doc]))
        return split_docs, splitter

    def visualize_vectorstore(self):
        """Plot the stored embeddings as an interactive 3D PCA scatter."""
        vectorstore = st.session_state.get("vectorstore")
        if vectorstore is None:
            st.write("Vectorstore is not initialized.")
            return

        # Vectors are read back from the raw faiss index and the texts from
        # the docstore, matched through index_to_docstore_id.
        total = vectorstore.index.ntotal
        if total < 3:
            st.write("At least 3 embedded chunks are needed for a 3D projection.")
            return
        embeddings = vectorstore.index.reconstruct_n(0, total)
        documents = [
            vectorstore.docstore.search(vectorstore.index_to_docstore_id[i])
            for i in range(total)
        ]

        embeddings_3d = PCA(n_components=3).fit_transform(embeddings)
        embeddings_3d_normalized = MinMaxScaler().fit_transform(embeddings_3d)

        colors = embeddings_3d_normalized[:, 0]
        hover_text = [
            f"Document {i}:<br>{doc.page_content[:100]}..."
            for i, doc in enumerate(documents)
        ]

        fig = go.Figure(data=[go.Scatter3d(
            x=embeddings_3d_normalized[:, 0],
            y=embeddings_3d_normalized[:, 1],
            z=embeddings_3d_normalized[:, 2],
            mode='markers',
            marker=dict(
                size=5,
                color=colors,
                colorscale='Viridis',
                opacity=0.8
            ),
            text=hover_text,
            hoverinfo='text'
        )])

        fig.update_layout(
            title="Interactive 3D Vectorstore Document Distribution",
            scene=dict(
                xaxis_title="PCA Component 1",
                yaxis_title="PCA Component 2",
                zaxis_title="PCA Component 3"
            ),
            width=800,
            height=600,
        )

        st.plotly_chart(fig)

    def _install_context(self, docs):
        """Index ``docs`` and start a fresh conversation; return True on success."""
        vectorstore = self.create_vector_store(docs)
        if vectorstore is None:
            st.error("Could not build a vector store from the provided content.")
            return False
        st.session_state.vectorstore = vectorstore
        st.session_state.conversation = self.get_conversation_chain()
        return True

    def chatbot_page(self):
        """Render the chat page: context injection, text chat and voice chat."""
        st.title("Chatbot")
        self.setup_session_state()

        # Toggle for voice mode
        st.session_state.voice_mode = st.toggle("Voice Mode")

        # Streamlit re-runs this method on every interaction. Each context
        # source is therefore indexed only when it changes; otherwise the
        # conversation (and its memory) would be rebuilt on every rerun.

        # File uploader for context injection
        uploaded_file = st.file_uploader("Choose a file for context injection")
        if uploaded_file is not None:
            file_key = f"{uploaded_file.name}:{uploaded_file.size}"
            if st.session_state.get("loaded_file_key") != file_key:
                documents = [uploaded_file.read().decode("utf-8", errors="replace")]
                if self._install_context(documents):
                    st.session_state.loaded_file_key = file_key

        # GitHub repository URL input
        repo_url = st.text_input("Enter GitHub repository URL")
        if repo_url and st.session_state.get("loaded_repo_key") != repo_url:
            documents = self.load_documents_from_github(repo_url)
            split_docs, _ = self.split_documents(documents)
            if self._install_context(split_docs):
                st.session_state.loaded_repo_key = repo_url

        # Chat interface
        user_input = st.text_input("You: ", key="user_input")

        if user_input:
            asyncio.run(self.handle_user_input(user_input))

        if st.session_state.voice_mode and st.button("Speak"):
            user_speech = self.speech_to_text()
            st.text_input("You: ", value=user_speech, key="user_speech_input")
            if user_speech != self.SPEECH_FAILED:
                asyncio.run(self.handle_user_input(user_speech))

    def dashboard_page(self):
        """Render the dashboard page with the embedding visualisation."""
        st.title("Dashboard")

        if st.session_state.get("vectorstore") is not None:
            st.write("Vectorstore Visualization")
            self.visualize_vectorstore()
        else:
            st.write("Vectorstore is not initialized. Please add documents in the Chatbot page.")

    def main(self):
        """Configure the page, initialise session state and dispatch to a page."""
        st.set_page_config(page_title="Enhanced Multi-page Chatbot App", layout="wide")
        # Both pages read session-state keys, so create them before rendering.
        self.setup_session_state()

        # Sidebar navigation
        with st.sidebar:
            selected = option_menu(
                menu_title="Navigation",
                options=["Chatbot", "Dashboard"],
                icons=["chat", "bar-chart"],
                menu_icon="cast",
                default_index=0,
            )

        if selected == "Chatbot":
            self.chatbot_page()
        elif selected == "Dashboard":
            self.dashboard_page()
395
+
396
+
397
if __name__ == "__main__":
    # Credentials are read from the environment (populated from .env at import time).
    email = os.getenv("EMAIL")
    password = os.getenv("PASSWD")
    app = ChatbotApp(email, password)
    app.main()
FaissStorage.py ADDED
@@ -0,0 +1,954 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from datetime import datetime
4
+ import webbrowser
5
+ from tkinter import Toplevel
6
+ import warnings
7
+ import faiss,logging
8
+ import numpy as np
9
+ import wandb
10
+ from typing import List, Dict, Any, Optional, Union
11
+ from git import Repo
12
+ import plotly.graph_objects as go
13
+ import numpy as np
14
+ from sklearn.decomposition import PCA
15
+ import requests
16
+ from rich import print as rp
17
+ from rich.progress import Progress, TextColumn, BarColumn, TimeRemainingColumn
18
+ from dotenv import load_dotenv, find_dotenv
19
+ import speech_recognition
20
+ from TTS.api import TTS
21
+ from sklearn.decomposition import PCA
22
+ from playsound import playsound
23
+ from hugchat import hugchat
24
+ from hugchat.login import Login
25
+ from langchain_core.documents import Document
26
+
27
+
28
+ from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
29
+ from langchain_community.llms.huggingface_hub import HuggingFaceHub
30
+ from langchain.chains.combine_documents import create_stuff_documents_chain
31
+ from langchain.chains import create_retrieval_chain
32
+ from langchain_community.document_loaders import (
33
+ PyPDFLoader,
34
+ UnstructuredHTMLLoader,
35
+ UnstructuredWordDocumentLoader,
36
+ TextLoader,
37
+ PythonLoader
38
+ )
39
+ from langchain.retrievers import TimeWeightedVectorStoreRetriever
40
+ from langchain_community.docstore.in_memory import InMemoryDocstore
41
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language,CharacterTextSplitter
42
+ from langchain_huggingface import HuggingFaceEmbeddings
43
+ from langchain_community.vectorstores import FAISS
44
+ from langchain.vectorstores.base import VectorStore
45
+ from langchain.retrievers import MultiQueryRetriever, ContextualCompressionRetriever
46
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
47
+ from langchain.retrievers.document_compressors import LLMChainExtractor, DocumentCompressorPipeline
48
+ from langchain_community.document_transformers import EmbeddingsRedundantFilter
49
+ from langchain.retrievers.document_compressors import EmbeddingsFilter
50
+ import plotly.graph_objs as go
51
+
52
+
53
+ from langchain.chains import LLMChain
54
# Load environment variables
load_dotenv(find_dotenv())
warnings.filterwarnings("ignore")
# NOTE(review): faiss is imported earlier in this file, so this flag may be
# set too late to influence how faiss is loaded - confirm.
os.environ['FAISS_NO_AVX2'] = '1'
# os.environ only accepts strings, so assigning os.getenv(...) back raises
# TypeError whenever the variable is missing. Variables that are present are
# already in os.environ, so only the missing ones need to be reported.
for _env_name in ("USER_AGENT", "HUGGINGFACEHUB_API_TOKEN"):
    if os.getenv(_env_name) is None:
        rp(f"Warning: environment variable {_env_name} is not set")
del _env_name
wandb.require("core")
61
+ # Import system prompts
62
+ from system_prompts import __all__ as prompts
63
+
64
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GPT2LMHeadModel, GPT2TokenizerFast
65
+ from langchain_huggingface import HuggingFacePipeline
66
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
67
+
68
+
69
+
70
+
71
class LLMChatBot:
    """Convenience wrapper around a logged-in ``hugchat.ChatBot`` session.

    Attributes:
        current_model: Model index used for newly created conversations.
        current_system_prompt: System prompt text used for new conversations.
        conversation_id: The conversation this wrapper currently talks to.
    """

    # Seconds to wait for the diagnostic login requests before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, email, password, cookie_path_dir='./cookies/', default_llm=1, default_system_prompt='default_rag_prompt'):
        """Log in and open a conversation.

        Args:
            email: Account e-mail.
            password: Account password.
            cookie_path_dir: Directory where login cookies are saved.
            default_llm: Model index for the chatbot and new conversations.
            default_system_prompt: Key of the system prompt in ``prompts``.
        """
        self.email = email
        self.password = password
        # New conversations must use the requested model, not a fixed index.
        self.current_model = default_llm
        # Store the prompt text, not its key: this value is sent verbatim as
        # the system prompt when a conversation is created.
        self.current_system_prompt = prompts[default_system_prompt]
        self.cookie_path_dir = cookie_path_dir
        self.cookies = self.login()
        self.default_llm = default_llm
        self.chatbot = hugchat.ChatBot(
            cookies=self.cookies.get_dict(),
            default_llm=default_llm,
            system_prompt=self.current_system_prompt,
        )
        self.conversation_id = None
        self.check_conv_id(self.conversation_id)
        # No square brackets around the message: rich would read them as markup.
        rp(f"conversation_id: {self.conversation_id}")

    def _ensure_conversation(self):
        """Create and switch to a conversation if none is active yet."""
        if not self.conversation_id:
            self.conversation_id = self.chatbot.new_conversation(
                modelIndex=self.current_model,
                system_prompt=self.current_system_prompt,
                switch_to=True,
            )

    def check_conv_id(self, id=None):
        """Make sure the chatbot is on the right conversation and return it.

        Args:
            id: Conversation to switch to. When omitted, the stored
                conversation is used, or a new one is created if none exists.
        """
        if id:
            self.conversation_id = id
            self.chatbot.change_conversation(self.conversation_id)
        elif not self.conversation_id:
            # switch_to=True so the new conversation becomes the active one.
            self._ensure_conversation()
        elif self.chatbot.get_conversation_info(self.conversation_id) != self.chatbot.get_conversation_info():
            self.chatbot.change_conversation(self.conversation_id)

        return self.conversation_id

    def login(self):
        """Log in and return the session cookies.

        Raises:
            Exception: Whatever the login raised, re-raised after a
                diagnostic manual login attempt has been printed.
        """
        rp("Attempting to log in...")
        sign = Login(self.email, self.password)
        try:
            cookies = sign.login(cookie_dir_path=self.cookie_path_dir, save_cookies=True)
        except Exception as e:
            rp(f"Login failed: {e}")
            rp("Attempting manual login with requests...")
            # Diagnostic only: its outcome is printed, the error still propagates.
            self.manual_login()
            raise
        rp("Login successful!")
        return cookies

    def manual_login(self):
        """Try a plain HTTP login and print what happened (diagnostic aid)."""
        login_url = "https://huggingface.co/login"
        session = requests.Session()
        response = session.get(login_url, timeout=self.REQUEST_TIMEOUT)
        rp("Response Cookies:", response.cookies)
        rp("Response Content:", response.content.decode())

        csrf_token = response.cookies.get('csrf_token')
        if not csrf_token:
            rp("CSRF token not found in cookies.")
            return

        login_data = {
            'email': self.email,
            'password': self.password,
            'csrf_token': csrf_token
        }

        response = session.post(login_url, data=login_data, timeout=self.REQUEST_TIMEOUT)
        if response.ok:
            rp("Manual login successful!")
        else:
            rp("Manual login failed!")

    def setup_speech_recognition(self):
        """Create the speech recogniser in ``self.recognizer``."""
        self.recognizer = speech_recognition.Recognizer()

    def setup_tts(self, model_name="tts_models/en/ljspeech/fast_pitch"):
        """Load the TTS model into ``self.tts``."""
        self.tts = TTS(model_name=model_name)

    def chat(self, message, web_search=False):
        """Send ``message`` to the active conversation and return the reply object.

        Args:
            message: Text to send.
            web_search: Whether the backend should use web search.
        """
        if web_search:
            return self.chatbot.chat(message, web_search=True)
        return self.chatbot.chat(message)

    def query(self,message, web_search=False, stream=False,use_cache=True):
        """Query the chatbot with this wrapper's fixed generation settings.

        NOTE(review): ``use_cache`` is accepted but not forwarded; the
        request always sends ``use_cache=False``. Confirm which is intended
        before changing it, since forwarding would alter the default.
        """
        return self.chatbot.query(
            text=message,
            web_search = web_search,
            temperature = 0.1,
            top_p = 0.95,
            repetition_penalty = 1.2,
            top_k = 50,
            truncate = 1000,
            watermark = False,
            max_new_tokens = 1024,
            stop = ["</s>"],
            return_full_text = False,
            stream = stream,
            _stream_yield_all = False,
            use_cache = False,
            is_retry = False,
            retry_count = 5,
            conversation = None
        )

    def stream_response(self, message):
        """Print every streamed response item for ``message``."""
        for resp in self.query(message, stream=True):
            rp(resp)

    def web_search(self, query):
        """Run a web-search query and return its sources.

        Returns:
            A list of ``{'link', 'title', 'hostname'}`` dictionaries.
        """
        query_result = self.query(query, web_search=True)
        # Wait for the reply to finish before reading the sources; they are
        # presumably filled in while the response is consumed - confirm
        # against the hugchat documentation.
        query_result.wait_until_done()
        return [
            {
                'link': source.link,
                'title': source.title,
                'hostname': source.hostname
            }
            for source in query_result.web_search_sources
        ]

    def create_new_conversation(self, switch_to=True, system_prompt=None):
        """Create a conversation and return it.

        Args:
            switch_to: Make the new conversation the active one.
            system_prompt: Optional prompt text; when given it also becomes
                the wrapper's current system prompt.
        """
        if system_prompt is not None:
            self.current_system_prompt = system_prompt
        conversation = self.chatbot.new_conversation(
            switch_to=switch_to,
            modelIndex=self.current_model,
            system_prompt=self.current_system_prompt,
        )
        if switch_to:
            self.conversation_id = conversation
        return conversation

    def get_remote_conversations(self):
        """Fetch the conversations from the server, replacing the local list."""
        return self.chatbot.get_remote_conversations(replace_conversation_list=True)

    def get_local_conversations(self):
        """Return the locally known conversations."""
        return self.chatbot.get_conversation_list()

    def get_available_models(self):
        """Return the models the backend offers."""
        return self.chatbot.get_available_llm_models()

    def switch_model(self, index):
        """Switch the backend model and remember it for new conversations."""
        self.chatbot.switch_llm(index)
        self.current_model = index

    def switch_conversation(self, id):
        """Switch to the conversation ``id``."""
        self.conv_id = id  # kept for callers that read the old attribute
        self.conversation_id = id
        self.chatbot.change_conversation(id)

    def get_assistants(self):
        """Return the first page of available assistants."""
        return self.chatbot.get_assistant_list_by_page(1)

    def switch_role(self, system_prompt, model_id=1):
        """Delete all conversations and start a new one with a new role.

        Args:
            system_prompt: System prompt text of the new conversation.
            model_id: Model index of the new conversation.

        Returns:
            The new conversation.
        """
        self.chatbot.delete_all_conversations()
        # Store the result in conversation_id; assigning it to
        # self.check_conv_id would replace that method with a non-callable.
        self.conversation_id = self.chatbot.new_conversation(
            switch_to=True, system_prompt=system_prompt, modelIndex=model_id
        )
        self.current_system_prompt = system_prompt
        self.current_model = model_id
        return self.conversation_id

    def __run__(self, message):
        """Query ``message``, creating a conversation first if needed."""
        self._ensure_conversation()
        return self.query(message)

    def __call__(self, message):
        """Chat ``message``, creating a conversation first if needed."""
        self._ensure_conversation()
        return self.chat(message)
221
+
222
+
223
+
224
+
225
+ class AdvancedVectorStore:
226
def __init__(self,
             embedding_model: str = "all-MiniLM-L6-v2",
             email: Optional[str] = None,
             password: Optional[str] = None,
             chunk_size=384,
             chunk_overlap=0,
             device='cpu',
             normalize_embeddings=True,
             log_level=logging.INFO,
             log_file='AdvancedVectorStore.log',
             logs_dir='./logs',
             test_input='./test_input',
             test_output='./test_output',
             storage_dir='./vectorstore',
             knowledge_dir='./knowledge',
             repos_dir='./repos'
             ):
    """Set up embeddings, LLM handles, the FAISS store, folders and logging.

    Args:
        embedding_model: Name of the sentence-embedding model.
        email: Chat account e-mail; the chatbot LLM is only created when
            both ``email`` and ``password`` are given.
        password: Chat account password.
        chunk_size: Chunk size used by the text splitters.
        chunk_overlap: Chunk overlap used by the text splitters.
        device: Device the embedding model runs on.
        normalize_embeddings: Whether embeddings are normalised.
        log_level: Level for the console and file log handlers.
        log_file: Log file name inside ``logs_dir``.
        logs_dir: Directory for log files.
        test_input: Working directory for test input.
        test_output: Working directory for test output.
        storage_dir: Directory for stored vectorstores.
        knowledge_dir: Directory for knowledge files.
        repos_dir: Directory for cloned repositories.
    """
    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self.device = device
    self.basic_splitter = RecursiveCharacterTextSplitter(chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)
    self.storage_dir = storage_dir
    self.test_input = test_input
    self.test_output = test_output
    self.repos_dir = repos_dir
    self.knowledge_dir = knowledge_dir
    self.logs_dir = logs_dir
    self.log_file = log_file
    self.doc_ids = []
    self.documents: List[Document] = []
    self.embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        model_kwargs={'device': self.device},
        encode_kwargs={'normalize_embeddings': normalize_embeddings}
    )

    self.qwen_llm = HuggingFaceHub(repo_id="Qwen/Qwen2-0.5B-Instruct", model_kwargs={"temperature": 0.5, "max_length": 512})
    self.llm = HuggingFaceHub(repo_id="google-t5/t5-small", model_kwargs={"temperature": 0.5, "max_length": 512})
    self.alpaca_llm = HuggingFaceHub(repo_id="reasonwang/google-flan-t5-small-alpaca", model_kwargs={"temperature": 0.1, "max_length": 512})
    self.chatbot_llm = LLMChatBot(email, password, default_system_prompt= 'copilot_prompt') if email and password else None

    rp("create_indexed_vectorstore:")
    # Best-effort smoke test of the hosted model: a failure is reported but
    # must not prevent the store from being constructed.
    try:
        print(self.alpaca_llm("What is Deep Learning?"))
    except Exception as exc:
        rp(f"LLM smoke test failed: {exc}")

    # The index dimension must equal the embedding length. The text chunk
    # size is unrelated and only matches by coincidence for the defaults, so
    # the length is measured from a real embedding.
    embedding_size = len(self.embeddings.embed_query("embedding dimension probe"))
    self.vectorstore, self.docstore, self.index = self.create_indexed_vectorstore(embedding_size)

    self.document_count = 0
    self.chunk_count = 0
    self.setup_folders()
    self.setup_logging(log_level,os.path.join(self.logs_dir,self.log_file))
    self.logger.info("Initializing AdvancedVectorStore")
    # Without credentials there is no chatbot whose role could be set.
    if self.chatbot_llm is not None:
        self.set_bot_role()
279
+
280
def setup_logging(self,level,file):
    """Attach one console handler and one file handler to the module logger.

    Args:
        level: Logging level for the logger and both handlers.
        file: Path of the log file.
    """
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(level)
    # The module logger is shared by every instance. Drop handlers left by an
    # earlier call so that each record is written only once.
    for old_handler in list(self.logger.handlers):
        self.logger.removeHandler(old_handler)
        old_handler.close()
    # Create console handler and set level
    ch = logging.StreamHandler()
    ch.setLevel(level)
    # Create file handler and set level
    fh = logging.FileHandler(file)
    fh.setLevel(level)
    # Create formatter and share it between both handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    # Add handlers to logger
    self.logger.addHandler(ch)
    self.logger.addHandler(fh)
    self.logger.info(f"Done setting up logger for {__name__} [AdvancedVectorStore]")
299
+
300
def setup_folders(self):
    """Record the working directories in ``self.dirs`` and create any that are missing."""
    folder_attributes = (
        "test_input",
        "test_output",
        "logs_dir",
        "storage_dir",
        "knowledge_dir",
        "repos_dir",
    )
    self.dirs = [getattr(self, attribute) for attribute in folder_attributes]
    for folder in self.dirs:
        os.makedirs(folder, exist_ok=True)
311
+
312
def set_bot_role(self,prompt='default_rag_prompt',context="",history=""):
    """Give the chatbot a new system prompt and start a conversation with it.

    Args:
        prompt: Key of the prompt template in ``prompts``.
        context: Text substituted for ``<<VSCONTEXT>>`` in the template.
        history: Text substituted for ``<<WSCONTEXT>>`` in the template.

    Does nothing except log a warning when no chatbot was configured
    (``self.chatbot_llm`` is ``None``).
    """
    if self.chatbot_llm is None:
        # The chatbot is optional, so there may be no role to set.
        logger = getattr(self, "logger", None) or logging.getLogger(__name__)
        logger.warning("set_bot_role skipped: no chatbot LLM configured")
        self.current_conversation_id = None
        return
    self.chatbot_llm.current_system_prompt = prompts[prompt].replace("<<VSCONTEXT>>",context).replace("<<WSCONTEXT>>",history)
    self.current_conversation_id=self.chatbot_llm.chatbot.new_conversation(system_prompt=self.chatbot_llm.current_system_prompt,
                                                                          modelIndex=self.chatbot_llm.current_model,
                                                                          switch_to=True)
321
+
322
+
323
def load_documents(self, directory: str) -> None:
    """Load documents from a directory with specific loaders for each file type.

    Files with an unsupported extension are ignored. Text files that are
    unreadable or not valid UTF-8, and files whose loader fails, are skipped
    with a message. Loaded documents are appended to ``self.documents``.

    Args:
        directory: Root directory to walk recursively.
    """
    loaders = {
        ".py": (PythonLoader, {}),
        ".txt": (TextLoader, {}),
        ".pdf": (PyPDFLoader, {}),
        ".html": (UnstructuredHTMLLoader, {}),
        ".docx": (UnstructuredWordDocumentLoader, {})
    }
    # Only these formats are plain text. PDF and DOCX files are binary, so a
    # UTF-8 check on them would reject every file.
    text_extensions = {".py", ".txt", ".html"}

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_extension = os.path.splitext(file)[1].lower()

            if file_extension not in loaders:
                continue

            if file_extension in text_extensions:
                # Check if the file can be read as UTF-8
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        f.read()
                except (UnicodeDecodeError, IOError):
                    rp(f"Skipping non-UTF-8 or unreadable file: {file_path}")
                    continue

            loader_class, loader_args = loaders[file_extension]
            try:
                loaded = loader_class(file_path, **loader_args).load()
            except Exception as exc:
                # One broken file must not abort the whole directory walk.
                rp(f"Skipping file that failed to load: {file_path} ({exc})")
                continue
            self.documents.extend(loaded)
350
+
351
def split_documents(self) -> None:
    """Split documents using appropriate splitters for each file type.

    Python and HTML sources get language-aware splitters; every other
    document (including ``.txt``, ``.pdf`` and ``.docx``) goes through one
    generic recursive splitter. ``self.documents`` is replaced by the
    resulting chunks.
    """
    size, overlap = self.chunk_size, self.chunk_overlap

    # Built once: the generic splitter is also the fallback for unknown types,
    # so there is no need to construct a new one for every document.
    generic_splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
    language_splitters = {
        ".py": RecursiveCharacterTextSplitter.from_language(
            language=Language.PYTHON, chunk_size=size, chunk_overlap=overlap
        ),
        ".html": RecursiveCharacterTextSplitter.from_language(
            language=Language.HTML, chunk_size=size, chunk_overlap=overlap
        ),
    }

    split_docs = []
    for doc in self.documents:
        file_extension = os.path.splitext(doc.metadata.get("source", ""))[1].lower()
        splitter = language_splitters.get(file_extension, generic_splitter)
        split_docs.extend(splitter.split_documents([doc]))

    self.documents = split_docs
368
+
369
def create_vectorstore(self, store_type: str = "FAISS") -> None:
    """Build ``self.vectorstore`` from the currently loaded documents.

    :param store_type: Backend name; only ``"FAISS"`` is accepted.
    :raises ValueError: If any other backend name is given.
    """
    # Guard clause first so the supported path reads straight through.
    if store_type != "FAISS":
        raise ValueError(f"Unsupported vectorstore type: {store_type}")
    self.vectorstore = FAISS.from_documents(self.documents, self.embeddings)
375
+
376
def create_indexed_vectorstore(self, embedding_size):
    """Assemble an empty FAISS vectorstore around a flat L2 index.

    :param embedding_size: Vector dimensionality handed to ``faiss.IndexFlatL2``
        (the original note mentions 384 for all-MiniLM-L6-v2 embeddings).
    :return: Tuple ``(vectorstore, docstore, index)``.
    """
    rp("Creating indexed vectorstore...")
    flat_index = faiss.IndexFlatL2(embedding_size)
    memory_docstore = InMemoryDocstore({})
    index_to_docstore_id = {}
    store = FAISS(
        self.embeddings.embed_query,
        flat_index,
        memory_docstore,
        index_to_docstore_id,
    )
    rp("Indexed vectorstore created.")
    return store, memory_docstore, flat_index
389
+
390
def get_self_query_retriever(self, k: int = 4) -> SelfQueryRetriever:
    """Build a SelfQueryRetriever over the current vectorstore.

    :param k: Number of documents requested through ``search_kwargs``.
    :raises ValueError: If no vectorstore has been created yet.
    """
    if self.vectorstore:
        retriever_options = {
            "document_contents": "Document about various topics.",
            "metadata_field_info": [],
            "search_kwargs": {"k": k},
        }
        return SelfQueryRetriever.from_llm(
            self.chatbot_llm.chatbot,
            self.vectorstore,
            **retriever_options,
        )
    raise ValueError("Vectorstore not initialized. Call create_vectorstore() first.")
401
+
402
+
403
def get_contextual_t5_compression_retriever(self, k: int = 4, similarity_threshold=0.78) -> ContextualCompressionRetriever:
    """Build a compression retriever whose extractor runs on ``self.llm``.

    Retrieved documents pass through the basic splitter, an LLM-based
    extractor, a redundancy filter and a relevance filter, in that order.

    :param k: Number of documents fetched by the underlying basic retriever.
    :param similarity_threshold: Threshold shared by both embedding filters.
    """
    extractor = LLMChainExtractor.from_llm(self.llm)
    filter_options = {
        "embeddings": self.embeddings,
        "similarity_threshold": similarity_threshold,
    }
    drop_redundant = EmbeddingsRedundantFilter(**filter_options)
    keep_relevant = EmbeddingsFilter(**filter_options)
    pipeline = DocumentCompressorPipeline(
        transformers=[self.basic_splitter, extractor, drop_redundant, keep_relevant]
    )
    return ContextualCompressionRetriever(
        name="CompressedRetriever",
        base_compressor=pipeline,
        base_retriever=self.get_basic_retriever(k=k),
    )
413
+
414
def get_contextual_qwen_compression_retriever(self, k=4, similarity_threshold=0.78):
    """Build a compression retriever whose extractor runs on ``self.qwen_llm``.

    :param k: Number of documents fetched by the underlying basic retriever.
    :param similarity_threshold: Threshold shared by both embedding filters.
    :return: A ``ContextualCompressionRetriever`` named ``"CompressedRetriever"``.
    """
    # Stages are applied in list order: split, extract, de-duplicate, filter.
    stages = [
        self.basic_splitter,
        LLMChainExtractor.from_llm(self.qwen_llm),
        EmbeddingsRedundantFilter(
            embeddings=self.embeddings, similarity_threshold=similarity_threshold
        ),
        EmbeddingsFilter(
            embeddings=self.embeddings, similarity_threshold=similarity_threshold
        ),
    ]
    compressor = DocumentCompressorPipeline(transformers=stages)
    base = self.get_basic_retriever(k=k)
    return ContextualCompressionRetriever(
        name="CompressedRetriever",
        base_compressor=compressor,
        base_retriever=base,
    )
425
+
426
def get_contextual_compression_retriever(self, k: int = 4, similarity_threshold=0.78) -> ContextualCompressionRetriever:
    """Build a compression retriever whose extractor runs on ``self.alpaca_llm``.

    :param k: Number of documents fetched by the underlying basic retriever.
    :param similarity_threshold: Threshold shared by both embedding filters.
    """
    llm_extractor = LLMChainExtractor.from_llm(self.alpaca_llm)
    embedding_filters = [
        filter_class(embeddings=self.embeddings, similarity_threshold=similarity_threshold)
        for filter_class in (EmbeddingsRedundantFilter, EmbeddingsFilter)
    ]
    # Splitter and extractor run first, then the redundancy and relevance filters.
    compressor = DocumentCompressorPipeline(
        transformers=[self.basic_splitter, llm_extractor, *embedding_filters]
    )
    return ContextualCompressionRetriever(
        name="CompressedRetriever",
        base_compressor=compressor,
        base_retriever=self.get_basic_retriever(k=k),
    )
436
+
437
+
438
def get_basic_retriever(self, k: int = 4) -> VectorStore:
    """Return the vectorstore's own retriever limited to *k* results.

    :param k: Number of documents requested through ``search_kwargs``.
    :raises ValueError: If no vectorstore has been created yet.
    """
    # NOTE(review): the annotation says VectorStore, but the value returned is
    # whatever ``as_retriever`` produces - confirm and correct the annotation.
    store = self.vectorstore
    if store:
        return store.as_retriever(search_kwargs={"k": k})
    raise ValueError("Vectorstore not initialized. Call create_vectorstore() first.")
443
def get_multi_query_retriever(self, k: int = 4) -> MultiQueryRetriever:
    """Build a MultiQueryRetriever driven by ``self.chatbot_llm``.

    :param k: Number of documents requested from the wrapped retriever.
    :raises ValueError: If no vectorstore has been created yet.
    """
    if self.vectorstore:
        wrapped = self.vectorstore.as_retriever(search_kwargs={"k": k})
        return MultiQueryRetriever.from_llm(retriever=wrapped, llm=self.chatbot_llm)
    raise ValueError("Vectorstore not initialized. Call create_vectorstore() first.")
451
def get_timed_retriever(self, k=1, decay_rate=0.0000000000000000000000001):
    """Build a time-weighted retriever over the current vectorstore.

    :param k: Number of documents the retriever should return.
    :param decay_rate: Decay rate forwarded to the retriever unchanged.
    """
    retriever_settings = {
        "vectorstore": self.vectorstore,
        "decay_rate": decay_rate,
        "k": k,
    }
    return TimeWeightedVectorStoreRetriever(**retriever_settings)
455
+
456
def set_current_retriever(self, mode='basic', k=4, sim_rate=0.78):
    """Build and return the retriever selected by *mode*.

    Recognised modes are ``'compressed'``, ``'qwen_compressed'``,
    ``'t5_compressed'``, ``'self_query'``, ``'multi_query'`` and ``'time'``.
    Any other value (including the default ``'basic'``) yields the basic
    vectorstore retriever.

    :param mode: Name of the retriever flavour.
    :param k: Number of documents the retriever should return. It is now
        honoured in ``'time'`` mode too, which previously always used 1.
    :param sim_rate: Similarity threshold used by the compression retrievers.
    :return: The freshly built retriever.
    """
    # Builders are wrapped in lambdas so only the selected retriever is created.
    builders = {
        'compressed': lambda: self.get_contextual_compression_retriever(k, sim_rate),
        'qwen_compressed': lambda: self.get_contextual_qwen_compression_retriever(k, sim_rate),
        't5_compressed': lambda: self.get_contextual_t5_compression_retriever(k, sim_rate),
        'self_query': lambda: self.get_self_query_retriever(k),
        'multi_query': lambda: self.get_multi_query_retriever(k),
        'time': lambda: self.get_timed_retriever(k=k),
    }
    build = builders.get(mode, lambda: self.get_basic_retriever(k))
    return build()
474
+
475
def search(self, query: str, mode='basic', retriever: Optional[Any] = None, k: int = 4, sim_rate: float = 0.78) -> List[Document]:
    """Return the documents relevant to *query*.

    :param query: Text to search for.
    :param mode: Retriever flavour to build when *retriever* is not supplied.
    :param retriever: Ready-made retriever; takes precedence over *mode*.
    :param k: Number of documents requested when a retriever is built here.
    :param sim_rate: Similarity threshold used when a retriever is built here.
    """
    active_retriever = retriever
    if not active_retriever:
        active_retriever = self.set_current_retriever(mode=mode, k=k, sim_rate=sim_rate)
    return active_retriever.get_relevant_documents(query)
480
+
481
def add_documents(self, documents: List[Document]) -> None:
    """Add new documents to the existing vectorstore.

    Every document is re-created with a ``last_accessed_at`` timestamp,
    stored in the docstore under a fresh UUID (also appended to
    ``self.doc_ids``) and then handed to a time-weighted retriever, which
    adds it to the vectorstore. Progress is shown on a rich progress bar.

    :param documents: Documents (or chunks) to add.
    """
    import uuid

    batch_size = len(documents)
    with Progress(
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[green][progress.percentage]{task.percentage:>3.0f}%"),
        TimeRemainingColumn()
    ) as progress:
        task = progress.add_task("[cyan]Adding documents to vectorstore...", total=batch_size)

        for position, doc in enumerate(documents, start=1):
            # Copy the metadata so the caller's document is not modified.
            metadata = dict(doc.metadata) if doc.metadata else {}
            metadata["last_accessed_at"] = datetime.now()
            new_doc = Document(page_content=doc.page_content, metadata=metadata)

            doc_id = str(uuid.uuid4())
            self.vectorstore.docstore.add({doc_id: new_doc})
            self.doc_ids.append(doc_id)
            self.set_current_retriever(mode='time', k=1).add_documents([new_doc])

            # Not every loader sets 'source'; fall back instead of raising KeyError.
            source = new_doc.metadata.get('source', 'Unknown')
            self.logger.info(f"Added doc to docstore[{position}/{batch_size}] with Id:{doc_id} Path:{source}")
            progress.update(task, advance=1)

    rp(f"Added {len(documents)} documents to the vectorstore with index in doc_ids.")
515
+
516
def delete_documents(self, document_ids: List[str]) -> None:
    """Delete documents from the vectorstore by their IDs.

    The whole list is removed in a single call. The previous loop passed
    the complete list to ``delete`` once per id, so every call after the
    first targeted ids that had already been removed.

    :param document_ids: IDs of the documents to remove; an empty list is a no-op.
    """
    if not document_ids:
        return
    self.vectorstore.delete(document_ids)
523
+
524
def save_vectorstore(self, path: str) -> None:
    """Persist the vectorstore under *path*.

    :param path: Destination folder handed to ``save_local``.
    :raises ValueError: If no vectorstore has been created yet.
    """
    store = self.vectorstore
    if store:
        store.save_local(path)
        return
    raise ValueError("Vectorstore not initialized. Call create_vectorstore() first.")
531
+
532
def load_vectorstore(self, path: str) -> None:
    """Replace ``self.vectorstore`` with the FAISS store saved under *path*.

    :param path: Folder previously written by ``save_vectorstore``.
    """
    # SECURITY: allow_dangerous_deserialization=True lets the loader unpickle
    # whatever is in the folder. Only point this at folders this application
    # wrote itself, never at untrusted data.
    load_options = {
        "folder_path": path,
        "embeddings": self.embeddings,
        "allow_dangerous_deserialization": True,
    }
    self.vectorstore = FAISS.load_local(**load_options)
539
+
540
def create_retrieval_chain(self, prompt: str = "default_rag_prompt", retriever: Optional[Any] = None) -> Any:
    """Create a retrieval chain from a named prompt and a retriever.

    :param prompt: Key into the module-level ``prompts`` mapping.
    :param retriever: Retriever to use; the basic retriever when omitted.
    :return: The chain produced by the module-level ``create_retrieval_chain``.
    """
    # NOTE(review): elsewhere in this class the ``prompts`` entries are treated
    # as plain strings (``.replace`` is called on them) - confirm that
    # ``create_stuff_documents_chain`` accepts that here.
    active_retriever = retriever if retriever else self.get_basic_retriever()
    documents_chain = create_stuff_documents_chain(self.chatbot_llm, prompt=prompts[prompt])
    # Inside a method the bare name resolves to the module-level function,
    # not to this method.
    return create_retrieval_chain(active_retriever, documents_chain)
547
+
548
def run_retrieval_chain(self, chain: Any, query: str) -> Dict[str, Any]:
    """Invoke *chain* with *query* and return whatever it produces.

    :param chain: Object exposing ``invoke``.
    :param query: Text passed to the chain under the ``"input"`` key.
    """
    payload = {"input": query}
    return chain.invoke(payload)
551
+
552
def generate_3d_scatterplot(self, num_points=1000):
    """
    Generate a 3D scatter plot of the vector store content and log it to wandb.

    Document vectors are projected onto three principal components. Marker
    colour follows the third component and the hover text shows the id of
    each plotted document (from its ``id`` metadata, ``'Unknown'`` if absent).

    :param num_points: Maximum number of points to plot (default: 1000)
    :return: None (logs the plot to wandb)
    :raises ValueError: If the store is empty or holds fewer than 3 documents.
    """
    all_docs = self.get_all_documents()

    if not all_docs:
        raise ValueError("No documents found in the vector store.")

    # Extract vectors and metadata from documents
    vectors = []
    doc_ids = []
    for doc in all_docs:
        embedding = getattr(doc, 'embedding', None)
        if embedding is None:
            embedding = self.embeddings.embed_query(doc.page_content)
        vectors.append(embedding)
        doc_ids.append(doc.metadata.get('id', 'Unknown'))

    vectors = np.array(vectors)

    # If we have more vectors than requested points, sample randomly
    if len(vectors) > num_points:
        indices = np.random.choice(len(vectors), num_points, replace=False)
        vectors = vectors[indices]
        doc_ids = [doc_ids[i] for i in indices]

    # PCA cannot extract more components than there are samples.
    if len(vectors) < 3:
        raise ValueError("At least 3 documents are required for a 3D projection.")

    # Perform PCA to reduce to 3 dimensions
    vectors_3d = PCA(n_components=3).fit_transform(vectors)

    # Marker properties are derived from the data. The previous hard-coded
    # 20-element size/colour lists did not match the number of points, and the
    # hover text used the unsampled id list.
    fig = go.Figure(data=[go.Scatter3d(
        x=vectors_3d[:, 0],
        y=vectors_3d[:, 1],
        z=vectors_3d[:, 2],
        mode="markers",
        marker=dict(
            size=6,
            color=vectors_3d[:, 2],
            colorscale='Viridis',
            showscale=True,
            colorbar=dict(x=0),
        ),
        text=doc_ids,
        hoverinfo="text",
        name="Document Vectors",
    )])

    fig.update_layout(
        showlegend=True,
        scene=dict(
            xaxis_title="X Axis",
            yaxis_title="Y Axis",
            zaxis_title="Z Axis"
        ),
        width=1200,
        height=1000,
    )

    # Always close the wandb run, even when logging fails.
    wandb.init(project="vector_store_visualization")
    try:
        wandb.log({"3D Scatter Plot": fig})
    finally:
        wandb.finish()
627
+
628
def load_documents_folder(self, folder_path):
    """Load, split and index every supported file below *folder_path*.

    Only the documents found in this folder are split and added to the
    vectorstore; documents loaded by earlier calls stay in
    ``self.documents`` but are not split or indexed a second time.
    ``self.document_count`` and ``self.chunk_count`` describe this call.

    :param folder_path: Root folder to load.
    """
    rp("[Loading documents from cloned repository]")
    # The load/split/add helpers all operate on self.documents, so park the
    # existing content while this folder is processed.
    previously_loaded = self.documents
    self.documents = []
    try:
        self.load_documents(folder_path)
        self.document_count = len(self.documents)
        rp(f"Splitting {self.document_count} documents")
        self.split_documents()
        self.chunk_count = len(self.documents)
        rp(f"Adding {self.chunk_count} document chunks to vectorstore")
        self.add_documents(self.documents)
    finally:
        self.documents = previously_loaded + self.documents
637
+
638
def load_github_repo(self, repo_url: str) -> None:
    """
    Clone a GitHub repository into ``self.repos_dir`` and index its documents.

    The clone lives in ``<repos_dir>/<author>_<repo>`` and is kept on disk.
    When that folder already exists nothing is cloned or indexed and the
    saved vectorstore is loaded from ``self.storage_dir`` instead.

    :param repo_url: Repository URL ending in ``.../<author>/<repo>``.
    """
    # A trailing slash would otherwise produce an empty repository name.
    url_parts = repo_url.rstrip('/').split('/')
    repo_name = url_parts.pop()
    author_name = url_parts.pop()
    new_repo_path = os.path.join(self.repos_dir, f"{author_name}_{repo_name}")

    if os.path.exists(new_repo_path):
        rp(f"Repository {repo_url} already exists in {new_repo_path}")
        self.load_vectorstore(self.storage_dir)
        return

    rp(f'Cloning repository {repo_url} to {new_repo_path}')
    Repo.clone_from(repo_url, new_repo_path)

    # Process only the freshly cloned files so earlier documents are not split
    # and indexed again.
    previously_loaded = self.documents
    self.documents = []
    try:
        rp("Loading documents from cloned repository")
        self.load_documents(new_repo_path)

        rp(f"Splitting {len(self.documents)} documents into chunks")
        self.split_documents()

        rp(f"Adding {len(self.documents)} documents to vectorstore")
        self.add_documents(self.documents)
    finally:
        self.documents = previously_loaded + self.documents

    self.save_vectorstore(self.storage_dir)
    self.load_vectorstore(self.storage_dir)
    # The clone is kept; the old message claimed a temporary folder was removed.
    rp(f"Repository kept at {new_repo_path}")
665
+
666
+
667
def get_all_documents(self):
    """
    Fetch all documents from the document store.

    The first ``self.index.ntotal`` entries of ``self.doc_ids`` are looked
    up in ``self.docstore``. The earlier implementation also reconstructed
    every vector from the index into an array that was never used; that
    work has been removed.

    :return: List with one docstore lookup result per tracked id.
    """
    # NOTE(review): this relies on self.doc_ids being kept in step with the
    # index by whoever adds documents - confirm against add_documents.
    num_vectors = self.index.ntotal
    retrieved_ids = self.doc_ids[:num_vectors]
    return [self.docstore.search(doc_id) for doc_id in retrieved_ids]
699
+
700
+
701
def test_chat(self, text, context='This is a chat with a nice Senior programmer.', history='Your Birth as fresh outof the box agent.'):
    """Reset the bot role with the given context/history and send one message.

    :param text: Message passed to the chatbot.
    :param context: Context inserted into the system prompt.
    :param history: History inserted into the system prompt.
    :return: Whatever calling ``self.chatbot_llm`` returns.
    """
    role_settings = {"context": context, "history": history}
    self.set_bot_role(**role_settings)
    reply = self.chatbot_llm(text)
    return reply
706
def chat(self, message: str) -> str:
    """
    Forward a message to the HugChat bot and hand back its reply.

    :param message: The message to send to the bot
    :return: The bot's response
    :raises ValueError: If no chatbot is configured.
    """
    bot = self.chatbot_llm
    if bot:
        return bot.chat(message)
    raise ValueError("HugChat bot not initialized. Provide email and password when creating AdvancedVectorStore.")
716
+
717
def setup_speech_recognition(self):
    """Ask the HugChat bot to prepare its speech recogniser.

    :raises ValueError: If no chatbot is configured.
    """
    bot = self.chatbot_llm
    if bot:
        bot.setup_speech_recognition()
        return
    raise ValueError("HugChat bot not initialized. Provide email and password when creating AdvancedVectorStore.")
722
+
723
def setup_tts(self, model_name="tts_models/en/ljspeech/fast_pitch"):
    """Ask the HugChat bot to prepare text-to-speech with the given model.

    :param model_name: TTS model identifier forwarded to the bot.
    :raises ValueError: If no chatbot is configured.
    """
    bot = self.chatbot_llm
    if bot:
        bot.setup_tts(model_name)
        return
    raise ValueError("HugChat bot not initialized. Provide email and password when creating AdvancedVectorStore.")
728
+
729
def voice_chat(self):
    """
    Initiate a voice chat session with the HugChat bot.

    Loops until the user says "exit": listens on the microphone, transcribes
    the audio, sends the text to the bot, then speaks the reply through a
    temporary wav file. The file is removed even when synthesis or playback
    fails.

    :raises ValueError: If the bot, its recognizer or its TTS engine is missing.
    """
    bot = self.chatbot_llm
    if not bot or not hasattr(bot, 'recognizer') or not hasattr(bot, 'tts'):
        raise ValueError("Speech recognition and TTS not set up. Call setup_speech_recognition() and setup_tts() first.")

    rp("Voice chat initiated. Speak your message (or say 'exit' to end the chat).")
    speech_file = "bot_response.wav"

    while True:
        with speech_recognition.Microphone() as source:
            rp("Listening...")
            audio = bot.recognizer.listen(source)

        # Only the transcription call is guarded by the recognition handlers.
        try:
            user_input = bot.recognizer.recognize_google(audio)
        except speech_recognition.UnknownValueError:
            rp("Sorry, I couldn't understand that. Please try again.")
            continue
        except speech_recognition.RequestError as e:
            rp(f"Could not request results from the speech recognition service; {e}")
            continue

        rp(f"You said: {user_input}")
        if user_input.strip().lower() == 'exit':
            rp("Ending voice chat.")
            break

        response = self.chat(user_input)
        rp(f"Bot: {response}")

        # Generate speech from the bot's response
        try:
            bot.tts.tts_to_file(text=response, file_path=speech_file)
            playsound(speech_file)
        finally:
            # Clean up the temporary audio file
            if os.path.exists(speech_file):
                os.remove(speech_file)
764
+
765
def rag_chat(self, query: str, prompt: str = "default_rag_prompt") -> str:
    """
    Answer a query through the retrieval chain (retrieve, then generate).

    :param query: The user's query
    :param prompt: The prompt to use for the retrieval chain (default: "default_rag_prompt")
    :return: The bot's response
    :raises ValueError: If no vectorstore has been created yet.
    """
    if self.vectorstore:
        chain = self.create_retrieval_chain(prompt, self.get_basic_retriever())
        return self.run_retrieval_chain(chain, query)['answer']
    raise ValueError("Vectorstore not initialized. Call create_vectorstore() first.")
780
+
781
def search_web(self):
    """Prompt for a query and open a date-limited Google search in the browser.

    The query is URL-encoded, so spaces, ``&``, ``#`` and similar characters
    no longer corrupt the URL. The cut-off date is written as ``YYYY-MM-DD``,
    the form Google's ``before:`` search operator expects.
    """
    from urllib.parse import quote_plus

    search_query = input("Enter your web search query: ")
    before_date = "2024-07-12"
    encoded_query = quote_plus(f"{search_query} before:{before_date}")
    search_url = f"https://www.google.com/search?q={encoded_query}"
    webbrowser.open(search_url)
    rp(f"Search results for '{search_query}' before {before_date}:")
    rp("=" * 50)
    rp(search_url)
    rp("=" * 50)
790
+
791
def advanced_rag_chatbot(self):
    """Run an interactive retrieval-augmented chat loop on the console.

    Each turn retrieves documents for the user's input, installs them
    (together with the last five history entries) as the bot's system
    prompt, prints the reply, shows some store statistics, logs a 3D
    scatter plot and asks for feedback. Typing ``exit`` ends the loop.
    """
    rp("Welcome to the Advanced RAG Chatbot!")
    rp("This chatbot uses a compressed retriever and integrates all components of the vector store.")
    rp("Type 'exit' to end the conversation.")

    # Ensure the vectorstore is initialized
    if self.vectorstore is None:
        rp("Initializing vector store...")
        # The index must be sized to the embedding dimensionality; the text
        # chunk size that used to be passed here is unrelated to it.
        embedding_size = len(self.embeddings.embed_query("dimension probe"))
        self.vectorstore, self.docstore, self.index = self.create_indexed_vectorstore(embedding_size)

    # One retriever serves the whole session (k=4, as used for every query before).
    retriever = self.set_current_retriever(mode='basic', k=4)
    conversation_history = []

    while True:
        user_input = input("\nYou: ").strip()
        if user_input.lower() == 'exit':
            rp("Thank you for using the Advanced RAG Chatbot. Goodbye!")
            break

        rp("# Step 1: Retrieve relevant documents")
        retrieved_docs = retriever.get_relevant_documents(user_input)

        rp("# Step 2: Prepare context from retrieved documents")
        context = "\n".join([doc.page_content for doc in retrieved_docs])

        rp("# Step 3: Prepare the prompt")
        self.set_bot_role(context=context, history=' '.join(conversation_history[-5:]))

        rp("# Step 4: Generate response using the chatbot")
        response = self.chatbot_llm(f"User:{user_input}\n")
        rp(f"Chatbot: {response}")

        # Update conversation history
        conversation_history.append(f"User: {user_input}")
        conversation_history.append(f"Chatbot: {response}")

        # Step 5: Demonstrate use of individual components
        rp("\nAdditional Information:")
        # Searching the docstore for a made-up id only returned a message
        # string, so its length was not a document count.
        # NOTE(review): ``_dict`` is the in-memory docstore's private mapping -
        # confirm; the tracked id list is used when it is absent.
        stored_documents = getattr(self.docstore, "_dict", self.doc_ids)
        rp(f'- Number of documents in docstore: {len(stored_documents)}')
        rp(f"- Number of vectors in index: {self.index.ntotal}")

        # Demonstrate direct use of vectorstore for similarity search.
        # Results are (document, score) pairs, so unpack before reading metadata.
        scored_docs = self.vectorstore.similarity_search_with_relevance_scores(user_input, k=1)
        if scored_docs:
            best_doc, _score = scored_docs[0]
            rp(f"-[Most similar document: [{best_doc.metadata.get('source', 'Unknown')}]]-")

        # Generate a 3D scatter plot of the vectorstore content. Use this
        # instance rather than the script-level ``avs`` global, and keep the
        # chat alive when the plot cannot be built.
        try:
            self.generate_3d_scatterplot()
        except ValueError as plot_error:
            rp(f"Skipping scatter plot: {plot_error}")

        # Optional: Add user feedback loop
        feedback = input("Was this response helpful? (yes/no): ").strip().lower()
        if feedback == 'no':
            rp("I'm sorry the response wasn't helpful. Let me try to improve it.")
            # Here you could implement logic to refine the response or adjust the retrieval process
            with open(file="./feedback_NO.txt", mode="a+", encoding="utf-8") as f:
                f.write(f"chat_feedback_NO\nChatHistory--->{' '.join(conversation_history[-10:])}\n")
858
+
859
+
860
+
861
+ # Example usage:
862
if __name__ == "__main__":
    # Credentials and token come from the environment.
    email, password = os.getenv("EMAIL"), os.getenv("PASSWD")
    github_token = os.getenv("GITHUB_TOKEN")

    # Initialize AdvancedVectorStore with HugChat bot
    avs = AdvancedVectorStore(email=email, password=password)

    # Create the indexed vectorstore
    #avs.create_indexed_vectorstore()

    # Index a few installed packages as the knowledge base.
    # avs.load_documents_folder("/nr_ywo/coding/voice_chat_rag_web/venv/lib/python3.10/site-packages/huggingface_hub/inference")
    knowledge_folders = (
        "/nr_ywo/coding/voice_chat_rag_web/venv/lib/python3.10/site-packages/hugchat",
        "/nr_ywo/coding/voice_chat_rag_web/venv/lib/python3.10/site-packages/langchain/agents",
        "/nr_ywo/coding/voice_chat_rag_web/venv/lib/python3.10/site-packages/langchain_experimental/autonomous_agents",
    )
    for knowledge_folder in knowledge_folders:
        avs.load_documents_folder(knowledge_folder)

    #avs.chatbot_llm.load_documents("/nr_ywo/coding/voice_chat_rag_web/test_input")
    # avs.load_github_repo("https://github.com/bxck75/voice_chat_rag_web")

    # Round-trip the store through disk.
    avs.save_vectorstore(path=avs.storage_dir)
    avs.load_vectorstore(path=avs.storage_dir)
    # rp document and chunk counts
    #rp(f"Total documents: {avs.chunk_count / avs.chunk_size}")
    #rp(f"Total chunks: {avs.chunk_count}")
    #avs.logger.info(avs.chatbot_llm.current_model)
    #avs.logger.info(avs.chatbot_llm.current_system_prompt)

    retriever = avs.set_current_retriever(mode='basic', k=4)
    # NOTE(review): 'compression' is not one of the modes set_current_retriever
    # recognises ('compressed' is), so this yields the basic retriever - confirm intent.
    comptriever = avs.set_current_retriever(mode='compression', k=4, sim_rate=0.87)
    timetriever = avs.set_current_retriever(mode='time', k=1)

    q = "Demonstrate your knowledge of developing advanced AI scripts in OOP python. try to come up with cutting edge ideas"
    rel_docs = retriever.invoke(input=q)
    #okrp(f"[Raw Knowledge Retrieved:{rel_docs}]")

    # Start the advanced RAG chatbot
    avs.advanced_rag_chatbot()

    # Perform a RAG chat
    #rag_response = avs.rag_chat(query="Explain the concept of neural networks.")
    #rp("RAG chat response:", rag_response)

    # Set up speech recognition and TTS for voice chat
    #avs.setup_speech_recognition()
    #avs.setup_tts()

    # Start a voice chat session
    #avs.voice_chat()

    # --- Disabled example: using different retrievers ---
    # multi_query_retriever = avs.get_multi_query_retriever()
    # results = avs.search("What is deep learning?", mode="multi_query")
    # rp("Multi-query retriever results:", results)
    #
    # self_query_retriever = avs.get_self_query_retriever()
    # results = avs.search("Find documents about reinforcement learning", self_query_retriever)
    # rp("Self-query retriever results:", results)
    #
    # contextual_compression_retriever = avs.get_contextual_compression_retriever()
    # results = avs.search("Explain the difference between supervised and unsupervised learning", contextual_compression_retriever)
    # rp("Contextual compression retriever results:", results)

    # --- Disabled example: perform a basic search ---
    # k = 4
    # similarity_threshold = 0.78
    # q = "What is machine learning?"
    #
    # basic_results = avs.search(q, mode='basic', k=k)
    # rp("Basic search results:", basic_results)
    # rp("self_query search results:", self_query_results)
    # rp("multi_query search results:", multi_results)
    # rp("Compressed search results:", commpressed_results)

    # This advanced example demonstrates:
    #
    # Use of the compressed retriever for efficient document retrieval.
    # Integration of conversation history for context-aware responses.
    # Direct use of the vectorstore for similarity search.
    # Access to the docstore and index for additional information.
    # A feedback loop to potentially improve responses (though the improvement logic is not implemented in this example).
    #
    # This chatbot loop showcases how all components of the system can work together to provide informative responses based on the loaded documents. It also demonstrates how you can access and use individual components (docstore, index, vectorstore) for additional functionality or information.
    # To further optimize this system, you could consider:
    #
    # Implementing caching mechanisms to speed up repeated queries.
    # Adding more sophisticated feedback handling to improve retrieval and response generation over time.
    # Implementing dynamic index updates if new information becomes available during the chat session.
    # Adding options for users to see the sources of information or request more details on specific topics.
    #
    # This example provides a solid foundation that you can further customize and expand based on your specific needs and use cases.
VoiceProfile.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class VoiceProfile:
    """Settings describing one synthetic voice: name, language, pitch and rate."""

    def __init__(self, name, language, pitch, speaking_rate):
        """Store the four profile settings as plain attributes."""
        self.name, self.language = name, language
        self.pitch, self.speaking_rate = pitch, speaking_rate

    def to_dict(self):
        """Return the profile as a dictionary keyed by attribute name."""
        return dict(
            name=self.name,
            language=self.language,
            pitch=self.pitch,
            speaking_rate=self.speaking_rate,
        )

    @classmethod
    def from_dict(cls, profile_dict):
        """Build a profile from a dictionary produced by :meth:`to_dict`.

        A missing key raises ``KeyError``.
        """
        constructor_order = ('name', 'language', 'pitch', 'speaking_rate')
        return cls(*(profile_dict[key] for key in constructor_order))
VoiceProfileManager.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
class VoiceProfileManager:
    """Keep a list of voice profiles and persist it as a JSON file."""

    def __init__(self, filename="voice_profiles.json"):
        """Remember the storage file and start with no profiles.

        :param filename: Path of the JSON file used by load/save.
        """
        self.filename = filename
        self.profiles = []

    @staticmethod
    def _profile_class():
        """Return the VoiceProfile class from its sibling module.

        This module never imported VoiceProfile, so every use of it raised
        NameError. The import is done lazily to leave module import untouched.
        """
        from VoiceProfile import VoiceProfile
        return VoiceProfile

    def load_profiles(self):
        """Replace ``self.profiles`` with the profiles stored in the file.

        A missing file is reported and results in an empty list.
        """
        try:
            with open(self.filename, 'r', encoding='utf-8') as file:
                profiles_data = json.load(file)
        except FileNotFoundError:
            print(f"File '{self.filename}' not found. Starting with an empty profile list.")
            self.profiles = []
            return
        profile_class = self._profile_class()
        self.profiles = [profile_class.from_dict(profile) for profile in profiles_data]

    def save_profiles(self):
        """Write all profiles to the file as indented JSON."""
        profiles_data = [profile.to_dict() for profile in self.profiles]
        with open(self.filename, 'w', encoding='utf-8') as file:
            json.dump(profiles_data, file, indent=4)
        print(f"Profiles saved to '{self.filename}'.")

    def add_profile(self, profile):
        """Append *profile* to the managed list."""
        self.profiles.append(profile)

    def generate_random_profile(self):
        """Create, register and return a profile with random settings.

        The language is drawn from a small example list, the pitch from
        0.8-1.2 and the speaking rate from 0.7-1.3 (two decimals each).
        """
        name = f"Profile-{len(self.profiles) + 1}"
        languages = ["en-US", "en-GB", "fr-FR", "es-ES"]  # Example languages
        language = random.choice(languages)
        pitch = round(random.uniform(0.8, 1.2), 2)
        speaking_rate = round(random.uniform(0.7, 1.3), 2)
        new_profile = self._profile_class()(name, language, pitch, speaking_rate)
        self.add_profile(new_profile)
        return new_profile

    def list_profiles(self):
        """Print one line per profile, or a notice when there are none."""
        if not self.profiles:
            print("No profiles found.")
            return
        for idx, profile in enumerate(self.profiles, start=1):
            print(f"Profile {idx}: {profile.name} - Language: {profile.language}, Pitch: {profile.pitch}, Speaking Rate: {profile.speaking_rate}")
43
+
44
+ # Example usage:
45
if __name__ == "__main__":
    # Interactive console menu around a VoiceProfileManager.
    manager = VoiceProfileManager()
    manager.load_profiles()

    menu_lines = (
        "\nVoice Profile Manager Menu:",
        "1. Generate Random Profile",
        "2. List Profiles",
        "3. Save Profiles",
        "4. Exit",
    )

    while True:
        for menu_line in menu_lines:
            print(menu_line)

        choice = input("Enter your choice: ")

        # Leaving the loop is handled first; everything else falls through.
        if choice == "4":
            print("Exiting program.")
            break

        if choice == "1":
            new_profile = manager.generate_random_profile()
            print(f"Generated new profile: {new_profile.name}")
        elif choice == "2":
            manager.list_profiles()
        elif choice == "3":
            manager.save_profiles()
        else:
            print("Invalid choice. Please enter a number from the menu.")
llm_chatbot.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import getpass
3
+ import faiss
4
+ import numpy as np
5
+ import io,re
6
+ import faiss
7
+ import warnings
8
+ import requests
9
+ from hugchat import hugchat
10
+ from rich import print as rp
11
+ from hugchat.login import Login
12
+ from dotenv import load_dotenv,find_dotenv
13
+ import speech_recognition
14
+ from TTS.api import TTS
15
+ from git import Repo
16
+ import time
17
+ from playsound import playsound
18
+ from langchain import hub
19
+ from langchain_core.documents import Document
20
+ from langchain.chains.combine_documents import create_stuff_documents_chain
21
+ from langchain.chains import create_retrieval_chain
22
+ from langchain_huggingface import HuggingFaceEmbeddings
23
+ from langchain_community.vectorstores import FAISS
24
+ from langchain.retrievers import ContextualCompressionRetriever
25
+ from langchain_community.document_loaders import DirectoryLoader
26
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
27
+ from langchain.retrievers.document_compressors import DocumentCompressorPipeline
28
+ from langchain_community.document_transformers import EmbeddingsRedundantFilter
29
+ from langchain_text_splitters import CharacterTextSplitter
30
+ from langchain.retrievers.document_compressors import EmbeddingsFilter
31
+
32
+ from system_prompts import (default_rag_prompt,story_teller_prompt,todo_parser_prompt,
33
+ code_generator_prompt,software_tester_prompt,script_debugger_prompt,iteration_controller_prompt,copilot_prompt)
34
# Registry of the system prompts imported from ``system_prompts``, keyed by
# their variable name so callers can select a role by string.
prompts = {
    'default_rag_prompt': default_rag_prompt,
    'story_teller_prompt': story_teller_prompt,
    'todo_parser_prompt': todo_parser_prompt,
    'code_generator_prompt': code_generator_prompt,
    'software_tester_prompt': software_tester_prompt,
    'script_debugger_prompt': script_debugger_prompt,
    'iteration_controller_prompt': iteration_controller_prompt,
    'copilot_prompt': copilot_prompt,
}

# Pull variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())
warnings.filterwarnings("ignore")
# ``os.environ`` only accepts strings, so re-assigning ``os.getenv("USER_AGENT")``
# raised TypeError whenever the variable was missing from both the environment
# and .env. Keep a configured value and fall back to a generic one otherwise.
os.environ.setdefault("USER_AGENT", "llm-chatbot")
47
+
48
class LLMChatBot:
    """HuggingChat-backed chatbot with a FAISS retrieval store, TTS and speech input.

    Constructing an instance logs in to HuggingChat, clones ``self.repo_url``,
    embeds its documents into a FAISS vector store, builds the retrievers and
    loads the TTS model and the speech recogniser, so construction performs
    network and disk I/O.
    """

    def __init__(self, email, password, cookie_path_dir='./cookies/', default_llm=1):
        """Log in and build every sub-component.

        Args:
            email: HuggingFace account e-mail.
            password: HuggingFace account password.
            cookie_path_dir: Directory where hugchat stores login cookies.
            default_llm: Index of the HuggingChat model to start with.
        """
        self.email = email
        self.password = password
        # Track the model that was actually requested; this used to be
        # hard-coded to 1, so new conversations ignored ``default_llm``.
        self.current_model = default_llm
        self.cookie_path_dir = cookie_path_dir
        self.cookies = self.login()
        self.chatbot = hugchat.ChatBot(
            cookies=self.cookies.get_dict(),
            default_llm=default_llm,  # CohereForAI/c4ai-command-r-plus
        )
        self.repo_url = 'https://github.com/langchain-ai/langchain'
        self.default_system_prompt = prompts['default_rag_prompt']
        self.conv_id = None
        self.latest_splitter = None
        self.setup_folders()
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )
        self.create_vectorstore_from_github()

        self.setup_retriever()
        self.setup_tts()
        self.setup_speech_recognition()

    def login(self):
        """Log in through hugchat and return the session cookies.

        On failure a plain ``requests`` login is attempted for diagnostics
        only; it yields no cookies, so the original exception is re-raised.
        """
        rp("Attempting to log in...")
        sign = Login(self.email, self.password)
        try:
            cookies = sign.login(cookie_dir_path=self.cookie_path_dir, save_cookies=True)
        except Exception as e:
            rp(f"Login failed: {e}")
            rp("Attempting manual login with requests...")
            self.manual_login()
            raise
        rp("Login successful!")
        return cookies

    def manual_login(self):
        """Try a form login against huggingface.co and report the outcome.

        The result is only printed; nothing is returned or stored.
        """
        login_url = "https://huggingface.co/login"
        session = requests.Session()
        # A timeout keeps a stalled connection from hanging start-up forever.
        response = session.get(login_url, timeout=30)
        rp("Response Cookies:", response.cookies)
        rp("Response Content:", response.content.decode())

        csrf_token = response.cookies.get('csrf_token')
        if not csrf_token:
            rp("CSRF token not found in cookies.")
            return

        login_data = {
            'email': self.email,
            'password': self.password,
            'csrf_token': csrf_token,
        }

        response = session.post(login_url, data=login_data, timeout=30)
        if response.ok:
            rp("Manual login successful!")
        else:
            rp("Manual login failed!")

    def setup_speech_recognition(self):
        """Create the speech recogniser used by ``listen_for_speech``."""
        self.recognizer = speech_recognition.Recognizer()

    def setup_folders(self):
        """Create the working directories listed in ``self.dirs``."""
        self.dirs = ["test_input"]
        for d in self.dirs:
            os.makedirs(d, exist_ok=True)

    def setup_tts(self, model_name="tts_models/en/ljspeech/fast_pitch"):
        """Load the TTS model named *model_name* into ``self.tts``."""
        self.tts = TTS(model_name=model_name)

    def __call__(self, text, system_prompt=""):  # llama 3
        """Start a fresh conversation with *system_prompt* and send *text*."""
        self.conv_id = self.chatbot.new_conversation(
            system_prompt=system_prompt, modelIndex=self.current_model, switch_to=True
        )
        return self.send_message(text)

    def send_message(self, message):
        """Send *message* in the active conversation and wait for the full reply."""
        message_result = self.chatbot.chat(message)
        return message_result.wait_until_done()

    def stream_response(self, message):
        """Print each streamed response item for *message* as it arrives."""
        for resp in self.chatbot.query(message, stream=True):
            rp(resp)

    def web_search(self, query):
        """Run *query* with web search and return its sources.

        Returns:
            A list of dicts with ``link``, ``title`` and ``hostname`` keys.
        """
        query_result = self.chatbot.query(query, web_search=True)
        return [
            {'link': source.link, 'title': source.title, 'hostname': source.hostname}
            for source in query_result.web_search_sources
        ]

    def create_new_conversation(self, switch_to=True, system_prompt=""):
        """Open a new conversation on the current model and return it.

        When *switch_to* is true the conversation is also remembered in
        ``self.conv_id``.
        """
        conversation = self.chatbot.new_conversation(
            switch_to=switch_to, modelIndex=self.current_model, system_prompt=system_prompt
        )
        if switch_to:
            self.conv_id = conversation
        return conversation

    def get_remote_conversations(self):
        """Fetch the conversations stored remotely, replacing the local list."""
        return self.chatbot.get_remote_conversations(replace_conversation_list=True)

    def get_local_conversations(self):
        """Return the locally known conversation list."""
        return self.chatbot.get_conversation_list()

    def get_available_models(self):
        """Return the models offered by HuggingChat."""
        return self.chatbot.get_available_llm_models()

    def switch_model(self, index):
        """Switch the chatbot to model *index* and remember it for new conversations."""
        self.chatbot.switch_llm(index)
        # Keep ``current_model`` in sync; otherwise ``__call__`` and
        # ``create_new_conversation`` keep opening chats on the old model.
        self.current_model = index

    def switch_conversation(self, id):
        """Make *id* the active conversation."""
        self.conv_id = id
        self.chatbot.change_conversation(self.conv_id)

    def get_assistants(self):
        """Return the first page of the assistant list."""
        return self.chatbot.get_assistant_list_by_page(1)

    def switch_role(self, system_prompt):
        """Delete ALL conversations and start a new one with *system_prompt*.

        An empty *system_prompt* falls back to ``self.default_system_prompt``.
        The argument used to be ignored entirely.
        """
        self.chatbot.delete_all_conversations()
        self.conv_id = self.chatbot.new_conversation(
            switch_to=True,
            system_prompt=system_prompt or self.default_system_prompt,
        )
        return self.conv_id

    def listen_for_speech(self):
        """Record from the microphone and return the recognised text, or None."""
        with speech_recognition.Microphone() as source:
            print("Listening...")
            audio = self.recognizer.listen(source)

        try:
            text = self.recognizer.recognize_google(audio)
        except speech_recognition.UnknownValueError:
            print("Sorry, I couldn't understand that.")
            return None
        except speech_recognition.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")
            return None
        print(f"You said: {text}")
        return text

    def optimized_tts(self, text: str, output_file: str = "output.wav", speaking_rate: float = 3) -> str:
        """Synthesise *text* into *output_file* and return that path.

        The first speaker and language of the loaded model are used when the
        model exposes any. Generation time is printed.
        """
        reference_wav = "tortoise-tts/examples/favorites/emma_stone_courage.mp3"
        start_time = time.time()

        # NOTE(review): some TTS releases may reject ``emotion``/``speed`` for
        # local models - confirm against the installed version.
        self.tts.tts_to_file(
            text=text,
            emotion='scared',
            file_path=output_file,
            speaker=self.tts.speakers[0] if self.tts.speakers else None,
            # Only pass the reference clip when it exists on this machine.
            speaker_wav=reference_wav if os.path.exists(reference_wav) else None,
            language=self.tts.languages[0] if self.tts.languages else None,
            speed=speaking_rate,
            split_sentences=True,
        )

        end_time = time.time()
        print(f"TTS generation took {end_time - start_time:.2f} seconds")
        return output_file

    @staticmethod
    def Play(file_path):
        """Play the audio file at *file_path*."""
        playsound(file_path)

    def add_documents_folder(self, folder_path):
        """Add every readable text file below *folder_path* to the vector store."""
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                try:
                    self.add_document(file_path)
                except (UnicodeDecodeError, OSError) as e:
                    # Binary or unreadable files must not abort the whole walk.
                    print(f"Skipping unreadable file {file_path}: {e}")

    def add_document(self, file_path):
        """Read *file_path* as UTF-8 and add it to the vector store as one document."""
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        document = Document(page_content=content, metadata={'source': file_path})
        self.vectorstore.add_documents([document])

    def add_document_from_url(self, url):
        """Download *url* and add its body text to the vector store."""
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            document = Document(page_content=response.text, metadata={'source': url})
            self.vectorstore.add_documents([document])
        else:
            print(f"Failed to fetch URL content: {response.status_code}")

    def delete_document(self, document):
        """Remove *document* from the vector store.

        *document* may be a docstore id (``str``) or a ``Document``; a
        ``Document`` is matched against stored entries by equal
        ``page_content``.
        """
        store = self.vectorstore
        known_ids = list(store.index_to_docstore_id.values())
        if isinstance(document, str):
            ids = [doc_id for doc_id in known_ids if doc_id == document]
        else:
            wanted = getattr(document, 'page_content', None)
            ids = []
            if wanted is not None:
                for doc_id in known_ids:
                    stored = store.docstore.search(doc_id)
                    if getattr(stored, 'page_content', None) == wanted:
                        ids.append(doc_id)
        if ids:
            store.delete(ids)
            print(f"Deleted document: {document}")
        else:
            print(f"Document not found: {document}")

    def _add_to_vector_store(self, name, content):
        """Add *content* to the vector store, recording *name* as its source."""
        document = Document(page_content=content, metadata={'source': name})
        # ``add_documents`` embeds the text itself; the previous extra call on
        # an undefined ``self.vectorizer`` always raised AttributeError.
        self.vectorstore.add_documents([document])
        print(f"Added document to vector store: {name}")

    def clone_github_repo(self, repo_url, local_path='./repo'):
        """Clone *repo_url* into *local_path* unless that path already exists."""
        if os.path.exists(local_path):
            print("Repository already cloned.")
            return local_path
        Repo.clone_from(repo_url, local_path)
        return local_path

    def load_documents_from_github(self, repo_url, file_types=('*.py', '*.md', '*.txt', '*.html')):
        """Clone *repo_url* and load every file matching one of *file_types*.

        One loader is run per pattern: path globbing has no ``{a,b}`` brace
        expansion, so the previous combined pattern matched nothing.
        """
        local_repo_path = self.clone_github_repo(repo_url)
        documents = []
        for pattern in file_types:
            loader = DirectoryLoader(
                path=local_repo_path, glob=pattern, show_progress=True, recursive=True
            )
            documents.extend(loader.load())
        return documents

    def split_documents(self, documents: list, chunk_s=512, chunk_o=0):
        """Split *documents* into chunks with a splitter chosen per file type.

        Args:
            documents: Documents to split.
            chunk_s: Chunk size passed to the splitters.
            chunk_o: Chunk overlap passed to the splitters.

        Returns:
            ``(chunks, splitter)`` where *splitter* is the one used for the
            last document, or ``None`` when *documents* is empty.
        """
        language_by_ext = {
            '.py': Language.PYTHON,
            '.md': Language.MARKDOWN,
            '.markdown': Language.MARKDOWN,
            '.html': Language.HTML,
            '.htm': Language.HTML,
        }
        splitters = {}  # one splitter per language, built on first use
        split_docs = []
        splitter = None
        for doc in documents:
            # Loaders record the path in ``metadata['source']``; the attribute
            # lookups are kept as a fallback for other document-like objects.
            metadata = getattr(doc, 'metadata', None) or {}
            source = (
                metadata.get('source')
                or getattr(doc, 'source', '')
                or getattr(doc, 'filename', '')
                or ''
            )
            ext = os.path.splitext(str(source))[1].lower()
            language = language_by_ext.get(ext)
            splitter = splitters.get(language)
            if splitter is None:
                if language is not None:
                    splitter = RecursiveCharacterTextSplitter.from_language(
                        language=language, chunk_size=chunk_s, chunk_overlap=chunk_o
                    )
                else:
                    splitter = CharacterTextSplitter(
                        chunk_size=chunk_s, chunk_overlap=chunk_o, add_start_index=True
                    )
                splitters[language] = splitter

            split_docs.extend(splitter.split_documents([doc]))
        return split_docs, splitter

    def setup_retriever(self, k=5, similarity_threshold=0.76):
        """Build the plain retriever and the compressing retriever.

        Args:
            k: Number of documents the base retriever returns.
            similarity_threshold: Threshold passed to ``EmbeddingsFilter``.
        """
        # The result count has to be passed through ``search_kwargs``.
        self.retriever = self.vectorstore.as_retriever(search_kwargs={'k': k})
        splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
        redundant_filter = EmbeddingsRedundantFilter(embeddings=self.embeddings)
        relevant_filter = EmbeddingsFilter(
            embeddings=self.embeddings, similarity_threshold=similarity_threshold
        )
        pipeline_compressor = DocumentCompressorPipeline(
            transformers=[splitter, redundant_filter, relevant_filter]
        )
        self.compression_retriever = ContextualCompressionRetriever(
            base_compressor=pipeline_compressor, base_retriever=self.retriever
        )

    def create_retrieval_chain(self):
        """Build ``high_retrieval_chain`` (compressed) and ``low_retrieval_chain``.

        ``self.bot`` is used as the language model when it has been set;
        otherwise the HuggingChat session is wrapped in a runnable so the
        chains can call it.
        """
        # Function-scope import: needed only for the adapter below.
        from langchain_core.runnables import RunnableLambda

        llm = getattr(self, 'bot', None)
        if llm is None:
            llm = RunnableLambda(
                lambda prompt_value: str(self.send_message(prompt_value.to_string()))
            )
        rag_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
        combine_docs_chain = create_stuff_documents_chain(llm, rag_prompt)
        # Inside a method the bare name resolves to the imported module-level
        # function, not to this method.
        self.high_retrieval_chain = create_retrieval_chain(self.compression_retriever, combine_docs_chain)
        self.low_retrieval_chain = create_retrieval_chain(self.retriever, combine_docs_chain)

    def create_vectorstore_from_github(self):
        """Load, split and embed the documents of ``self.repo_url`` into FAISS.

        Raises:
            ValueError: If no document chunks were produced.
        """
        documents = self.load_documents_from_github(self.repo_url)
        split_docs, splitter = self.split_documents(documents, 512, 0)
        if not split_docs:
            raise ValueError(f"No documents could be loaded from {self.repo_url}")
        self.latest_splitter = splitter
        self.vectorstore = FAISS.from_documents(split_docs, self.embeddings)
        print(f"Vectorstore created with {len(split_docs)} documents.")

    def update_vectorstore(self, new_documents):
        """Split *new_documents* and add the chunks to the existing vector store."""
        split_docs, splitter = self.split_documents(new_documents)
        self.latest_splitter = splitter
        self.vectorstore.add_documents(split_docs)
        print(f"Vectorstore updated with {len(split_docs)} new documents.")

    def retrieve_with_chain(self, query, mode='high'):
        """Answer *query* through a retrieval chain.

        ``mode='high'`` uses the compressing retriever; any other value uses
        the plain retriever. The chains are built on first use.
        """
        if not hasattr(self, 'high_retrieval_chain'):
            self.create_retrieval_chain()
        chain = self.high_retrieval_chain if mode == 'high' else self.low_retrieval_chain
        return chain.invoke({"input": query})
308
# Demo: log in, ask for a short story, speak it aloud and print it.
if __name__ == '__main__':
    EMAIL = os.getenv("EMAIL")
    PASSWD = os.getenv("PASSWD")
    # Fail early with a clear message instead of handing None to the login.
    if not EMAIL or not PASSWD:
        raise SystemExit("Set EMAIL and PASSWD in the environment or .env before running this demo.")
    model = 1
    chatbot = LLMChatBot(EMAIL, PASSWD, default_llm=model)
    chatbot.create_new_conversation(system_prompt=chatbot.default_system_prompt, switch_to=True)
    #all_models=chatbot.get_available_models()
    #rp(all_models[chatbot.current_model].name)
    results = chatbot("""Tel me a short crafting survival Scify story of K.U.T.H.O.E.R """)
    audio_path = chatbot.optimized_tts(str(results))
    chatbot.Play(audio_path)
    rp(results)
profiler.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tkinter as tk
2
+ from tkinter import messagebox
3
+ from tkinter import ttk
4
+ import json,subprocess
5
+ import random
6
+ from TTS.api import TTS
7
class VoiceProfile:
    """A named voice: ``name`` identifies the profile, ``voice`` is the Festival voice id."""

    def __init__(self, name, voice):
        self.name, self.voice = name, voice

    def to_dict(self):
        """Return the profile as a plain dict with ``name`` and ``voice`` keys."""
        return dict(name=self.name, voice=self.voice)

    @classmethod
    def from_dict(cls, profile_dict):
        """Build a profile from a dict; raises KeyError if ``name`` or ``voice`` is missing."""
        name = profile_dict['name']
        voice = profile_dict['voice']
        return cls(name, voice)

    def configure_tts(self):
        """Invoke Festival with this profile's voice-selection expression."""
        # Set Festival voice
        # NOTE(review): this runs in its own process, so the selection
        # presumably does not carry over to a later Festival run - confirm.
        voice_expression = "(voice_" + self.voice + ")"
        subprocess.run(["festival", "--tts", voice_expression])
28
+
29
class VoiceProfileManager:
    """Keeps a list of voice profiles in memory and persists it as a JSON file."""

    def __init__(self, filename="voice_profiles.json"):
        """Remember *filename* and load any profiles already stored there."""
        self.filename = filename
        self.profiles = []
        self.load_profiles()

    def load_profiles(self):
        """Replace ``self.profiles`` with the contents of ``self.filename``.

        A missing or malformed file is reported on stdout and results in an
        empty profile list rather than an exception.
        """
        try:
            with open(self.filename, 'r') as file:
                profiles_data = json.load(file)
            self.profiles = [VoiceProfile.from_dict(profile) for profile in profiles_data]
        except FileNotFoundError:
            print(f"File '{self.filename}' not found. Starting with an empty profile list.")
            self.profiles = []
        except (json.JSONDecodeError, KeyError, TypeError) as exc:
            # A damaged file must not stop the application from starting.
            print(f"File '{self.filename}' is not a valid profile list ({exc}). Starting with an empty profile list.")
            self.profiles = []

    def save_profiles(self):
        """Write all profiles to ``self.filename`` as indented JSON."""
        profiles_data = [profile.to_dict() for profile in self.profiles]
        with open(self.filename, 'w') as file:
            json.dump(profiles_data, file, indent=4)
        print(f"Profiles saved to '{self.filename}'.")

    def add_profile(self, profile):
        """Append *profile* to the in-memory list (not saved until ``save_profiles``)."""
        self.profiles.append(profile)

    def generate_random_profile(self):
        """Create, register and return a profile with a randomly chosen voice."""
        name = f"Profile-{len(self.profiles) + 1}"
        voices = ["cmu_us_slt", "cmu_us_awb", "cmu_us_rms", "cmu_us_bdl"]  # Example Festival voices
        voice = random.choice(voices)
        new_profile = VoiceProfile(name, voice)
        self.add_profile(new_profile)
        return new_profile

    def list_profiles(self):
        """Return one display string per profile.

        The result is always a list; it is empty when there are no profiles.
        Returning a message string for the empty case made callers that
        iterate the result loop over its characters.
        """
        return [
            f"Profile {idx}: {profile.name} - Voice: {profile.voice}"
            for idx, profile in enumerate(self.profiles, start=1)
        ]

    def get_profile_by_name(self, profile_name):
        """Return the first profile whose ``name`` equals *profile_name*, or None."""
        for profile in self.profiles:
            if profile.name == profile_name:
                return profile
        return None
75
+
76
+
77
class VoiceProfileTool:
    """Tkinter window for listing and generating voice profiles and speaking text with them."""

    def __init__(self, root):
        """Attach the tool to the Tk *root* window and build its widgets."""
        self.root = root
        self.root.title("Voice Profile Manager")

        self.profile_manager = VoiceProfileManager()

        self.create_widgets()

    def create_widgets(self):
        """Create the profile list frame and the text-to-speech frame."""
        # Frame for profile list and operations
        profile_frame = ttk.LabelFrame(self.root, text="Voice Profiles")
        profile_frame.grid(row=0, column=0, padx=10, pady=10, sticky=tk.W+tk.E+tk.N+tk.S)

        # Listbox to display profiles
        self.profiles_listbox = tk.Listbox(profile_frame, width=50, height=10)
        self.profiles_listbox.grid(row=0, column=0, padx=10, pady=10, sticky=tk.W+tk.E+tk.N+tk.S)

        # Scrollbar for the listbox
        scrollbar = ttk.Scrollbar(profile_frame, orient=tk.VERTICAL, command=self.profiles_listbox.yview)
        scrollbar.grid(row=0, column=1, pady=10, sticky=tk.N+tk.S)
        self.profiles_listbox.config(yscrollcommand=scrollbar.set)

        # Button to generate random profile
        generate_btn = ttk.Button(profile_frame, text="Generate Random Profile", command=self.generate_random_profile)
        generate_btn.grid(row=1, column=0, padx=10, pady=5, sticky=tk.W+tk.E)

        # Button to refresh profile list
        refresh_btn = ttk.Button(profile_frame, text="Refresh List", command=self.refresh_profiles_list)
        refresh_btn.grid(row=1, column=1, padx=10, pady=5, sticky=tk.W+tk.E)

        # Frame for TTS operations
        tts_frame = ttk.LabelFrame(self.root, text="Text-to-Speech (TTS)")
        tts_frame.grid(row=1, column=0, padx=10, pady=10, sticky=tk.W+tk.E+tk.N+tk.S)

        # Text entry for TTS input
        ttk.Label(tts_frame, text="Enter text to speak:").grid(row=0, column=0, padx=10, pady=5, sticky=tk.W)
        self.tts_text_entry = ttk.Entry(tts_frame, width=50)
        self.tts_text_entry.grid(row=0, column=1, padx=10, pady=5, sticky=tk.W+tk.E)

        # Combobox to select profile for TTS
        ttk.Label(tts_frame, text="Select profile:").grid(row=1, column=0, padx=10, pady=5, sticky=tk.W)
        self.profile_combobox = ttk.Combobox(tts_frame, width=48, state="readonly")
        self.profile_combobox.grid(row=1, column=1, padx=10, pady=5, sticky=tk.W+tk.E)

        # Button to speak text
        speak_btn = ttk.Button(tts_frame, text="Speak", command=self.speak_text)
        speak_btn.grid(row=2, column=1, padx=10, pady=10, sticky=tk.W+tk.E)

        # Populate initial profiles list
        self.refresh_profiles_list()

    def refresh_profiles_list(self):
        """Refill the listbox and the combobox from the manager's profiles.

        The combobox holds the real profile names, because ``speak_text``
        looks profiles up by name. It used to hold the "Profile N" display
        prefix, which never matched a profile.
        """
        self.profiles_listbox.delete(0, tk.END)

        names = []
        for idx, profile in enumerate(self.profile_manager.profiles, start=1):
            self.profiles_listbox.insert(tk.END, f"Profile {idx}: {profile.name} - Voice: {profile.voice}")
            names.append(profile.name)
        self.profile_combobox['values'] = names

    def generate_random_profile(self):
        """Generate a random profile, announce it and refresh the widgets."""
        new_profile = self.profile_manager.generate_random_profile()
        messagebox.showinfo("Profile Generated", f"Generated new profile: {new_profile.name}")
        self.refresh_profiles_list()

    @staticmethod
    def _festival_command(voice, text):
        """Build the argv for one Festival run that selects *voice* and speaks *text*.

        Raises:
            ValueError: If *voice* contains anything but letters, digits and
                underscores (it is inserted into a Scheme expression).
        """
        if not voice or not voice.replace("_", "").isalnum():
            raise ValueError(f"Invalid Festival voice name: {voice!r}")
        # Escape backslashes and quotes so the text stays one Scheme string.
        escaped = text.replace("\\", "\\\\").replace('"', '\\"')
        return ["festival", "-b", f"(voice_{voice})", f'(SayText "{escaped}")']

    def speak_text(self):
        """Speak the entered text with the voice of the selected profile."""
        text_to_speak = self.tts_text_entry.get().strip()
        selected_profile_name = self.profile_combobox.get().strip()

        if not text_to_speak:
            messagebox.showwarning("Input Required", "Please enter text to speak.")
            return

        if not selected_profile_name:
            messagebox.showwarning("Profile Required", "Please select a profile.")
            return

        profile = self.profile_manager.get_profile_by_name(selected_profile_name)
        if not profile:
            messagebox.showerror("Profile Not Found", f"Profile '{selected_profile_name}' not found.")
            return

        # Voice selection and speech must happen in the same Festival
        # process; selecting the voice in a separate run has no effect.
        try:
            subprocess.run(self._festival_command(profile.voice, text_to_speak), check=True)
        except (OSError, ValueError, subprocess.CalledProcessError) as exc:
            messagebox.showerror("Text-to-Speech", f"Festival could not speak the text: {exc}")
            return
        messagebox.showinfo("Text-to-Speech", f"Text: {text_to_speak}\nProfile: {profile.name}\nVoice: {profile.voice}")
165
+
166
# Main program: open the window and hand control to the Tk event loop.
if __name__ == "__main__":
    window = tk.Tk()
    tool = VoiceProfileTool(window)  # keep a reference for the lifetime of the loop
    window.mainloop()
171
+ """ Explanation:
172
+ Integration with Festival: This example drives the Festival command-line synthesizer through subprocess; the TTS class imported at the top of the file is not used. The configure_tts method in the VoiceProfile class invokes Festival with the profile's voice-selection expression (a profile stores only a name and a voice, not language, pitch or speaking rate).
173
+
174
+ Tkinter GUI: The GUI interface (VoiceProfileTool class) is built using Tkinter widgets (Listbox, Entry, Combobox, Button, etc.) to manage voice profiles (list, generate random profile) and perform TTS (enter text, select profile, speak).
175
+
176
+ Profile Management: VoiceProfileManager handles loading/saving profiles from/to JSON file, generating random profiles, listing profiles, and retrieving profiles by name.
177
+
178
+ Handling TTS Output: After Festival has been invoked to speak the text, the example shows a message box with details about the spoken text and the selected profile.
179
+ """
requirements.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web Frameworks and APIs
2
+ Flask==3.0.3
3
+ FastAPI==0.111.0
4
+
5
+ # Data Science and Machine Learning
6
+ numpy==1.22.0
7
+ pandas==1.5.3
8
+ scikit-learn==1.5.1
9
+ matplotlib==3.8.4
10
+ # CPU-only wheels are served from the PyTorch index; install with:
+ #   pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+ torch==2.3.1+cpu
11
+
12
+ # Natural Language Processing and AI
13
+ spacy==3.7.5
14
+ transformers==4.42.3
15
+ huggingface-hub==0.23.4
16
+ hugchat==0.4.9
17
+
18
+ # Langchain Ecosystem
19
+ langchain==0.2.7
20
+ langchain-community==0.2.7
21
+ langchain-core==0.2.12
22
+ langchain-experimental==0.0.62
23
+ langchain-huggingface==0.0.3
24
+ langchain-text-splitters==0.2.2
25
+ langchainhub==0.1.20
26
+
27
+ # Other AI and Language Models
28
+ openai==1.35.13
29
+
30
+ # Utilities and Tools
31
+ requests==2.32.3
32
+ beautifulsoup4==4.12.3
33
+ Pillow==10.4.0
34
+ tqdm==4.66.4
35
+
36
+ # Data Serialization and Parsing
37
+ pyyaml==6.0.1
38
+ jsonschema==4.23.0
39
+
40
+ # Async Programming
41
+ aiohttp==3.9.5
42
+ # asyncio is part of the Python standard library - do not install it from PyPI
43
+
44
+ # Testing
45
+ pytest==8.2.2
46
+
47
+ # Web Scraping
48
+ playwright==1.45.0
49
+
50
+ # Data Visualization
51
+ plotly==5.22.0
52
+ gradio
53
+ streamlit==1.36.0
system_prompts.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ story_teller_prompt="""
2
+ You ACT in the ROLE of a entertaining storyteller agent.
3
+ Your TASK is to interact and speculate on a engaging story inspired by the users input,
4
+ Bellow you might find some vectorstore CONTEXT about the story topic.
5
+ <<VSCONTEXT>>
6
+ Bellow you might find some websearch CONTEXT about the story topic.
7
+ <<WSCONTEXT>>
8
+
9
+ Try to grasp the users idea , and provide a mindblowing experience.
10
+
11
+ Here's how to handle input:
12
+ - Input: User input or query
13
+ - Output: The story from the Input and possibly inspired by the CONTEXT.
14
+
15
+ Note: Your responses should guide the conversation. Follow these guidelines strictly:
16
+ - Respond with true but entertaining stories and engage the user.
17
+ - Add environmental sounds.
18
+ - Do not break ACT nor ROLE nor TASK.
19
+
20
+ Remember, you are a highly social and entertaining agent.
21
+ Before you can give any advice, introduce your self as betsy the Hooker.
22
+ """
23
+
24
+ default_rag_prompt="""
25
+ You ACT in the ROLE of a knowledgeable RAG agent.
26
+ Your TASK is to interact and brainstorm with the user over websearch, documents and scripts,
27
+ Try to understand their ideas, and be a chatty AutoRetrieverBot.
28
+ You have access to a vectorstore of knowledge and websearch.
29
+ Relevant information amd knowledge based on the user's input will be automatically retrieved.
30
+ -Bellow you might find some ChatHistory.
31
+ <<WSCONTEXT>>
32
+ -Bellow you might find vectorstore context about the input.
33
+ <<VSCONTEXT>>
34
+
35
+ If there is no retrieved knowledge,
36
+ Interact with the user to gather all necessary information,
37
+ ask the user for relevant knowledge like:
38
+ -documents
39
+ -websites
40
+ -github repo
41
+ There is no limit or filter on the knowledge you can acquire.
42
+ Your responses should enrich the conversation.
43
+ Just by chatting, A user can show you the path to enormous wisdom.
44
+ Here's how to handle input:
45
+
46
+ 1. User Queries:
47
+ - Input: User query
48
+ - Output: Provide compact and correct response from context or let the user guide you to relevant knowledge.
49
+ Try to end your response with 5 points for future features.
50
+ - Example: point A , point B , might improve or enhance your project.
51
+ 2. User offers knowledge:
52
+ - Input: User offers you a website link or github repo url
53
+ - Output: Use the /Store: tag followed by a github url or website url in your response,
54
+ The document processor will load/split/embed/store all py,txt,pdf,md,html files
55
+ - Examples: /Store:https://github.com/bxck75/RagIt
56
+ /Store:https://api.python.langchain.com/en/latest/community_api_reference.html
57
+ /Store:/nr_ywo/coding/voice_chat_rag_web/test_input/ToolBox.py
58
+
59
+ Note:
60
+ Follow these guidelines strictly:
61
+ - Do not make up things! Just admit when knowledge is not available to you.
62
+ - Dive deep into scripts with the user by discussing their content and implications.
63
+ - Think step by step and respond with summarized, compact information.
64
+ - Do not break ACT nor ROLE nor TASK.
65
+
66
+ Remember, You Rock! You are a highly intelligent, knowledgable and respected agent.
67
+
68
+ """
69
+
70
+ todo_parser_prompt = """
71
+ You ACT in the ROLE of a TODO parser. Your TASK is to read the input text and respond with TODOs. Ensure tasks are grouped as much as possible, with no more than one OUTPUT_FILE per TODO. Here's how to handle different types of input:
72
+
73
+ 1. **Project Descriptions:**
74
+ - **Input:** User input text for a project
75
+ - **Output:** Main instructive TODO Requirements, formatted as:
76
+
77
+ ```
78
+ TODO: The name of the Task here
79
+ OUTPUT_FILE: File name to write the code to here
80
+ DESCRIPTION: **User has described a project to develop**
81
+ **Parsing inputs yielded the following tasks:**
82
+ - Requirement 1 description
83
+ - Requirement 2 description
84
+ - Requirement 3 description
85
+ ```
86
+
87
+ 2. **Bugfix Proposals:**
88
+ - **Input:** Bugfix proposals for the main TODO
89
+ - **Output:** Instructive SUB-TODO Requirements, formatted as:
90
+
91
+ ```
92
+ SUB-TODO: The name of the Sub-TODO here
93
+ TODO: The name of the main TODO here
94
+ OUTPUT_FILE: File name of the tested file here
95
+ DESCRIPTION: **Testing this script gave problems.**
96
+ **Parsing debug results yielded the following tasks:**
97
+ - Requirement 1 description
98
+ - Requirement 2 description
99
+ - Requirement 3 description
100
+ ```
101
+
102
+ **Note:** All TODOs from your response will be written into a SQLite database to have a central place for tasks. Follow these guidelines strictly:
103
+
104
+ - Do not respond with anything other than correctly formatted TODOs.
105
+ - Do not break from your ROLE, TASK, or formatting guidelines.
106
+ - Remember, you are a highly intelligent and well-respected expert in our team. Think step-by-step and parse the following:
107
+
108
+ """
109
+
110
+ code_generator_prompt = """
111
+ You ACT in the ROLE of the main code developer.
112
+ Your TASK is to read the input TODOs and respond with the necessary code.
113
+ Here’s how to handle different types of TODOs:
114
+
115
+ 1. **Main TODO Requirements:**
116
+ - **Input:** TODO with project requirements
117
+ - **Output:** Write code to meet the requirements, formatted as:
118
+ - LANG = python
119
+ - DOCSTRING = script description
120
+ - CODE = your code solution
121
+ - COMMENTS = Usage example and list of 5 speculative future features
122
+
123
+ FORMAT:
124
+ ```LANG
125
+ ## FILENAME
126
+ '''DOCSTRING'''
127
+ CODE
128
+ '''COMMENTS'''
129
+ ```
130
+
131
+ 2. **SUB-TODO Requirements:**
132
+ - **Input:** SUB-TODO with bugfix requirements
133
+ - **Output:** Fix the bug in this script:
134
+ ```
135
+ <<CODE>>
136
+ ```
137
+
138
+ Respond with the full implementation formatted as:
139
+ - LANG = python
140
+ - DOCSTRING = script description
141
+ - CODE = your code solution
142
+ - COMMENTS = Usage example and list of 5 speculative future features
143
+ - FORMAT=
144
+ ```LANG
145
+ ## FILENAME
146
+ '''DOCSTRING'''
147
+ CODE
148
+ '''COMMENTS'''
149
+ ```
150
+
151
+ **Note:** Your code will be saved and loaded by the Test_Module and then the Debug_Module.
152
+
153
+ Follow these guidelines strictly:
154
+ - Do not EVER skip code! The next steps in this process depends on complete scripts!
155
+ - Do not respond with anything other than complete and correctly formatted code.
156
+ - Do not break ACT, ROLE, or TASK.
157
+
158
+ Remember, You Rock! You are a highly intelligent, pragmatic, and well-respected coding master.
159
+ Think step-by-step and generate mind-blowing OOP code conforming to this TODO:
160
+
161
+ """
162
+
163
+ script_debugger_prompt = """
164
+ You ACT in the ROLE of a debugger. Your TASK is to summarize test results and propose fitting solutions to bugs.
165
+ Here’s how to handle different types of input:
166
+
167
+ 1. **Test Results:**
168
+ - **Input:** UniTest results showing bugs or autopep8 format errors.
169
+ - **Output:** Summarize the results and propose solutions, formatted as:
170
+
171
+ ```
172
+ BUG: Description of the bug
173
+ TODO: The name of the main TODO associated with the bug
174
+ DESCRIPTION: **Test results indicated the following issues:**
175
+ - Issue 1 description
176
+ - Issue 2 description
177
+ - Issue 3 description
178
+ PROPOSED FIX: **To address these issues, consider the following fixes:**
179
+ - Fix 1 description
180
+ - Fix 2 description
181
+ - Fix 3 description
182
+ ```
183
+
184
+ **Note:** Your summaries and proposed solutions will be used to create new SUB-TODOs. Follow these guidelines strictly:
185
+
186
+ - Do not respond with anything other than correctly formatted summaries and proposals.
187
+ - Do not break from your ROLE or TASK.
188
+
189
+ Remember, you are a highly intelligent, outside-the-box-looking-in type skillset and well-respected ethical Hacker/BugFixer in our team.
190
+ Think step-by-step ,propose cutting-edge solutions to the following coding Challenges:
191
+
192
+ """
193
+
194
+ software_tester_prompt = """
195
+ You ACT in the ROLE of a software tester. Your TASK is to produce test results using unit tests and autopep8.
196
+ Here’s how to handle different types of input:
197
+
198
+ 1. **Code to Test:**
199
+ - **Input:** Code from the code generator
200
+ - **Output:** Test results and formatting reports, formatted as:
201
+
202
+ ```
203
+ OUTPUT_FILE: File name of the code being tested here
204
+ TEST_RESULTS: **Unit test results:**
205
+ - Result 1 description
206
+ - Result 2 description
207
+ - Result 3 description
208
+ FORMAT_RESULTS: **autopep8 formatting results:**
209
+ - Result 1 description
210
+ - Result 2 description
211
+ - Result 3 description
212
+ ```
213
+
214
+ **Note:** Your test and formatting results will be used for debugging and further development. Follow these guidelines strictly:
215
+
216
+ - Do not respond with anything other than correctly formatted test and formatting results.
217
+ - Do not break from your ROLE or TASK.
218
+
219
+ Remember, you are a highly intelligent and well-respected beta tester in our team.
220
+ Think step-by-step and produce informative and clear results :
221
+
222
+ """
223
+
224
+ copilot_prompt = """
225
+ You ACT in the ROLE of the user-facing RAG-agent. Your TASK is to interact and brainstorm with user over documents and scripts,
226
+ understand their ideas and serve as value adding experience,
227
+ You wield the Force of RAG, and whenever your response contains:
228
+ "/STORE <path/to/a/script>"
229
+ The script gets loaded/split/embedded/stored in your persistent vectorstore.
230
+ You will always auto-retrieve results in your context, searched with the user input text.
231
+ If your CONTEXT: is empty, ask the user for documents with relevant knowledge to his question and /STORE those.
232
+ There is no limit nor filter on the knowledge you can acquire,
233
+ just by chatting and asking users to show you the path to wisdom.
234
+ Here’s how to handle input:
235
+
236
+ 1. **User Queries:**
237
+ - **Input:** User input or query
238
+ -
239
+ - **Output:** Formulate answers from context or let the user guide you to knowledge. provide correct but funny responses, formatted as:
240
+ ```
241
+ QUERY: User's input or query here
242
+ CONTEXT: Vectorstore similarity search results on 'User's input': <<RAG>>
243
+ RESPONSE: Formulate a capturing story based on truth, your context, and your embedded knowledge
244
+ or
245
+ Ask the user to be your teacher and hunt for knowledge through documents .
246
+ ```
247
+
248
+ **Note:** Your responses can guide the rest of the pipeline. Follow these guidelines strictly:
249
+
250
+ - Do not respond with anything other than with true but funny stories and entertain the user. always
251
+ - Dive deep into scripts with the user by adding them to your /STORE paint a clear picture of the ins and outs for the user.
252
+ - Do not break ACT nor ROLE nor TASK.
253
+
254
+ Remember, you are a highly social and funny knowledge retriever in our team.
255
+ Before you can give any advice you need the whole story, so interact with the user as follows:
256
+
257
+ """
258
+
259
+ iteration_controller_prompt = """
260
+ You ACT in the ROLE of the main executor of the 'robo-coder' pipeline.
261
+ Your TASK is to coordinate the workflow and ensure no ills occur.
262
+ Pipe Components should complete their role correctly but..
263
+ data is still data and processes can lock or freeze.
264
+ First! Gather details of what occurred.
265
+ Second! Log.
266
+ Third! Inform operating human user.
267
+ Here’s how to handle different types of input:
268
+
269
+ 1. **Pipeline Coordination:**
270
+ - **Input:** Any step in the pipeline
271
+ - **Output:** Instructions for the next step, formatted as:
272
+
273
+ ```
274
+ CURRENT_STEP: Description of the current step here
275
+ CONTEXT: Debug on components, running tasks and memory
276
+ NEXT_STEP: **Instructions for the next step:**
277
+ - Instruction 1 description
278
+ - Instruction 2 description
279
+ - Instruction 3 description
280
+ ```
281
+
282
+ **Note:** Your instructions will guide the entire pipeline. Follow these guidelines strictly:
283
+
284
+ - Do not respond with anything other than correctly formatted instructions.
285
+ - Do not break ACT nor ROLE nor TASK.
286
+
287
+ Remember, you are a highly gifted Mistral MoE Agent and well-respected Executor in our team.
288
+ Think step-by-step check CONTEXT between steps and make informed steps
289
+ Try to think of ways to early detect infinite loops or potentials and memory overload risks:
290
+
291
+ """
292
+
293
# Registry of the system prompts defined in this module, keyed by the name of
# the variable holding each prompt text.
# NOTE: unlike the conventional list of export names, this is a mapping from
# prompt name to prompt text, so its type must stay a dict.
__all__ = dict(
    default_rag_prompt=default_rag_prompt,
    story_teller_prompt=story_teller_prompt,
    todo_parser_prompt=todo_parser_prompt,
    code_generator_prompt=code_generator_prompt,
    software_tester_prompt=software_tester_prompt,
    script_debugger_prompt=script_debugger_prompt,
    iteration_controller_prompt=iteration_controller_prompt,
    copilot_prompt=copilot_prompt,
)
uber_toolkit_class.py ADDED
@@ -0,0 +1,901 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import getpass
3
+ from uuid import uuid4
4
+ import faiss
5
+ import numpy as np
6
+ import requests
7
+ import io
8
+ import warnings
9
+ import torch
10
+ import pickle
11
+ import speech_recognition
12
+ from git import Repo
13
+ from glob import glob
14
+ from rich import print as rp
15
+ from typing import Union, List, Generator, Any, Mapping, Optional,Dict
16
+ from requests.sessions import RequestsCookieJar
17
+ from dotenv import load_dotenv, find_dotenv
18
+ from langchain import hub
19
+ from langchain_core.documents import Document
20
+ from langchain.chains.combine_documents import create_stuff_documents_chain
21
+ from langchain.chains import create_retrieval_chain
22
+ from langchain_community.document_loaders import DirectoryLoader
23
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
24
+ from langchain_huggingface import HuggingFaceEmbeddings
25
+ from langchain_community.vectorstores import Chroma, FAISS
26
+ from langchain.vectorstores.base import VectorStore
27
+ from langchain.retrievers import MultiQueryRetriever
28
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
29
+ from langchain.llms import BaseLLM
30
+ from langchain.retrievers import ContextualCompressionRetriever
31
+ from langchain.retrievers.document_compressors import LLMChainExtractor
32
+ from langchain.retrievers.document_compressors import DocumentCompressorPipeline
33
+ from langchain_community.document_transformers import EmbeddingsRedundantFilter
34
+ from langchain_text_splitters import CharacterTextSplitter
35
+ from langchain.retrievers.document_compressors import EmbeddingsFilter
36
+
37
+ # Data manipulation and analysis
38
+ import numpy as np
39
+ import pandas as pd
40
+ # Plotting and visualization
41
+ import plotly.graph_objects as go
42
+ import plotly.express as px
43
+ from plotly.subplots import make_subplots
44
+ import plotly.io as pio
45
+ # Machine learning and dimensionality reduction
46
+ from sklearn.decomposition import PCA
47
+ from sklearn.preprocessing import MinMaxScaler
48
+ # Optional: for 3D projections
49
+ from scipy.stats import gaussian_kde
50
+ # Uncomment the following line if you need Plotly's built-in datasets
51
+ # import plotly.data as data
52
+
53
+
54
+ from huggingface_hub import InferenceClient
55
+ from hugchat import hugchat
56
+ from hugchat.login import Login
57
+ from hugchat.message import Message
58
+ from hugchat.types.assistant import Assistant
59
+ from hugchat.types.model import Model
60
+ from hugchat.types.message import MessageNode, Conversation
61
+
62
+ from sklearn.decomposition import PCA
63
+ from sklearn.preprocessing import MinMaxScaler
64
+
65
+ from TTS.api import TTS
66
+ import time
67
+ from playsound import playsound
68
+ from system_prompts import __all__ as prompts
69
+
70
+ from profiler import VoiceProfileManager, VoiceProfile
71
+
72
+ # Example usage
73
# NOTE: these statements run on every import of this module, including the
# profile generation and the save to "my_custom_profiles.json".
manager = VoiceProfileManager("my_custom_profiles.json")
manager.load_profiles()

# Generate a random profile
new_profile = manager.generate_random_profile()
rp(f"Generated new profile: {new_profile.name}")

# List profiles
manager.list_profiles()

# Save profiles
manager.save_profiles()

# Pull settings from the nearest .env file into the process environment.
load_dotenv(find_dotenv())
warnings.filterwarnings("ignore")
# os.environ only accepts strings: assigning os.getenv("USER_AGENT") raised
# TypeError whenever the variable was unset. Keep an existing value and only
# fall back to a default when nothing is configured.
os.environ.setdefault("USER_AGENT", "UberToolkit")
89
class ChatBotWrapper:
    """Callable adapter that forwards every call to the wrapped chat bot."""

    def __init__(self, chat_bot):
        # Keep a reference to the object that will receive the calls.
        self.chat_bot = chat_bot

    def __call__(self, *args, **kwargs):
        """Invoke the wrapped bot with the given arguments and return its result."""
        target = self.chat_bot
        reply = target(*args, **kwargs)
        return reply
95
+
96
+ class UberToolkit:
97
def __init__(self, email, password, cookie_path_dir='./cookies/', default_llm=1):
    """Log in to HuggingChat and set up the bot, vector store, retrievers and audio.

    Args:
        email: Account e-mail. Falls back to the ``EMAIL`` environment
            variable when falsy.
        password: Account password. Falls back to the ``PASSWD`` environment
            variable when falsy.
        cookie_path_dir: Directory handed to the login helper for saved cookies.
        default_llm: Model index passed to the HuggingChat bot.
    """
    self.prompts = prompts

    # Explicit arguments take precedence; previously they were silently
    # ignored in favour of the environment variables.
    self.email = email or os.getenv("EMAIL")
    self.password = password or os.getenv("PASSWD")
    self.default_llm = default_llm
    self.cookie_path_dir = cookie_path_dir
    self.system_prompt = self.prompts['default_rag_prompt']
    self.cookies = self.login()
    self.bot = hugchat.ChatBot(cookies=self.cookies.get_dict(), default_llm=self.default_llm)
    self.bot_wrapper = ChatBotWrapper(self.bot)  # Wrap the ChatBot object

    self.repo_url = ''
    self.conv_id = None
    self.latest_splitter = None
    self.setup_folders()
    self.setup_embeddings()
    self.setup_vector_store()
    self.setup_retrievers()
    # The store is intentionally NOT reset to None here: the retrievers built
    # above are bound to it, and resetting it made later additions land in a
    # different store than the one being searched.
    self.compressed_retriever = self.create_high_retrieval_chain()
    # NOTE(review): this replaces the base retriever created by
    # setup_retrievers() with a retrieval chain - confirm that is intended.
    self.retriever = self.create_low_retrieval_chain()
    self.setup_tts()
    self.setup_speech_recognition()
123
+
124
def login(self):
    """Sign in with the stored credentials and return the session cookies.

    On failure the error is reported, the manual login fallback is tried,
    and the original exception is re-raised.
    """
    rp("Attempting to log in...")
    signer = Login(self.email, self.password)
    try:
        session_cookies = signer.login(
            cookie_dir_path=self.cookie_path_dir,
            save_cookies=True,
        )
        rp("Login successful!")
        return session_cookies
    except Exception as e:
        rp(f"Login failed: {e}")
        rp("Attempting manual login with requests...")
        self.manual_login()
        raise
136
+
137
def manual_login(self):
    """Try to log in by posting the credentials to the Hugging Face login page.

    Returns:
        bool: True when the login POST returned a successful status,
        False when no CSRF token was found or the POST failed.
    """
    login_url = "https://huggingface.co/login"
    request_timeout = 30  # seconds; without it a stalled server blocks forever

    # The context manager closes the session's connections on every path.
    with requests.Session() as session:
        response = session.get(login_url, timeout=request_timeout)
        rp("Response Cookies:", response.cookies)
        rp("Response Content:", response.text)

        csrf_token = response.cookies.get('csrf_token')
        if not csrf_token:
            rp("CSRF token not found in cookies.")
            return False

        login_data = {
            'email': self.email,
            'password': self.password,
            'csrf_token': csrf_token
        }

        response = session.post(login_url, data=login_data, timeout=request_timeout)

    if response.ok:
        rp("Manual login successful!")
        return True
    rp("Manual login failed!")
    return False
160
+
161
def setup_embeddings(self):
    """Create the sentence-embedding model and store it on ``self.embeddings``."""
    model_options = {'device': 'cpu'}
    encode_options = {'normalize_embeddings': True}
    self.embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2",
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )
167
+
168
+
169
def setup_retrievers(self, k=5, similarity_threshold=0.76):
    """Build the base retriever and the compressing retriever on top of it.

    Args:
        k: Number of documents the base retriever should return.
        similarity_threshold: Minimum similarity for the relevance filter.
    """
    # The result count has to go through search_kwargs; a bare ``k=``
    # keyword is not applied to the search.
    self.retriever = self.vector_store.as_retriever(search_kwargs={"k": k})

    # Reuse the most recent document splitter when one has been recorded.
    splitter = self.latest_splitter if self.latest_splitter else CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
    redundant_filter = EmbeddingsRedundantFilter(embeddings=self.embeddings)
    relevant_filter = EmbeddingsFilter(embeddings=self.embeddings, similarity_threshold=similarity_threshold)
    # Pipeline order: split, drop redundant chunks, keep relevant chunks.
    pipeline_compressor = DocumentCompressorPipeline(
        transformers=[splitter, redundant_filter, relevant_filter]
    )
    self.compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=self.retriever)
178
+
179
def create_high_retrieval_chain(self):
    """Return a retrieval chain backed by the compression retriever."""
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    rp(qa_prompt)
    stuff_chain = create_stuff_documents_chain(self.bot_wrapper, qa_prompt)
    chain = create_retrieval_chain(self.compression_retriever, stuff_chain)
    return chain
185
+
186
def create_low_retrieval_chain(self):
    """Return a retrieval chain backed by the plain ``self.retriever``."""
    qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
    stuff_chain = create_stuff_documents_chain(self.bot_wrapper, qa_prompt)
    chain = create_retrieval_chain(self.retriever, stuff_chain)
    return chain
191
+
192
def setup_tts(self, model_name="tts_models/en/ljspeech/fast_pitch"):
    """Create the text-to-speech engine and store it on ``self.tts``."""
    tts_options = {
        'model_name': model_name,
        'progress_bar': False,
        'vocoder_path': 'vocoder_models/en/ljspeech/univnet',
    }
    self.tts = TTS(**tts_options)
194
+
195
def setup_speech_recognition(self):
    """Create the speech recognizer and store it on ``self.recognizer``."""
    recognizer = speech_recognition.Recognizer()
    self.recognizer = recognizer
197
+
198
def setup_folders(self):
    """Record the working directory names and create any that are missing."""
    folder_names = ["test_input", "vectorstore", "test"]
    self.dirs = folder_names
    for folder in folder_names:
        os.makedirs(folder, exist_ok=True)
202
+
203
def __call__(self, text):
    """Send ``text`` in the current conversation, creating one if needed.

    Args:
        text: The message to send.

    Returns:
        Whatever ``self.send_message`` returns for the message.
    """
    if self.conv_id:
        # Resume the conversation whose id was remembered.
        self.bot.change_conversation(self.bot.get_conversation_from_id(self.conv_id))
    else:
        conversation = self.bot.new_conversation(system_prompt=self.system_prompt, modelIndex=self.default_llm, switch_to=True)
        # Store the id rather than the conversation object: the branch above
        # looks the conversation up by id on the next call.
        self.conv_id = getattr(conversation, "id", conversation)
    return self.send_message(text)
209
+
210
def send_message(self, message, web=False):
    """Send ``message`` to the bot and return the completed reply.

    ``web`` is forwarded to the bot as its ``web_search`` flag.
    """
    pending = self.bot.chat(message, web_search=web)
    finished = pending.wait_until_done()
    return finished
213
+
214
def stream_response(self, message, web=False, stream=False):
    """Query the bot and return the received tokens joined by single spaces."""
    chunks = self.bot.query(message, stream=stream, web_search=web)
    tokens = [chunk['token'] for chunk in chunks]
    return ' '.join(tokens)
219
+
220
def web_search(self, text):
    """Send ``text`` to the bot with web search enabled and return the reply."""
    return self.send_message(text, web=True)
223
+
224
def retrieve_context(self, query: str):
    """Return the retrieved context for ``query`` (currently always empty).

    Retrieval is switched off: the previous body returned an empty list
    before doing any work, leaving the retriever calls after it unreachable.
    That dead code has been removed; the observable behaviour is unchanged.

    Args:
        query: The user query the context would be retrieved for.

    Returns:
        list: Always an empty list.
    """
    # TODO: re-enable retrieval. The removed code invoked self.retriever and
    # self.compression_retriever with {'input': query} and joined the results
    # as if they were strings, which needs reworking before it can run.
    return []
243
+
244
def delete_all_conversations(self):
    """Ask the underlying bot to delete all of its conversations."""
    bot = self.bot
    bot.delete_all_conversations()
246
+
247
def delete_conversation(self, conversation_object: Conversation = None):
    """Ask the underlying bot to delete ``conversation_object``."""
    bot = self.bot
    bot.delete_conversation(conversation_object)
249
+
250
def get_available_llm_models(self) -> list:
    """Return the model list reported by the underlying bot."""
    models = self.bot.get_available_llm_models()
    return models
252
+
253
def get_remote_conversations(self, replace_conversation_list=True):
    """Return the bot's remote conversations, forwarding the replace flag."""
    conversations = self.bot.get_remote_conversations(replace_conversation_list)
    return conversations
255
+
256
def get_conversation_info(self, conversation: Union[Conversation, str] = None) -> Conversation:
    """Return the bot's information for ``conversation``."""
    info = self.bot.get_conversation_info(conversation)
    return info
258
+
259
def get_assistant_list_by_page(self, page: int) -> List[Assistant]:
    """Return the bot's assistant list for the given page."""
    assistants = self.bot.get_assistant_list_by_page(page)
    return assistants
261
+
262
def search_assistant(self, assistant_name: str = None, assistant_id: str = None) -> Assistant:
    """Look up an assistant through the bot by name and/or id."""
    found = self.bot.search_assistant(assistant_name, assistant_id)
    return found
264
+
265
def switch_model(self, index):
    """Select model ``index`` and forget the current conversation id."""
    self.default_llm = index
    self.conv_id = None
268
+
269
def switch_conversation(self, id):
    """Remember ``id`` as the conversation to use for following calls."""
    setattr(self, "conv_id", id)
271
+
272
def switch_role(self, system_prompt_id):
    """Switch the active system prompt.

    Args:
        system_prompt_id: Either a key of ``self.prompts`` (for example
            ``'copilot_prompt'``), which selects that registered prompt
            text, or a literal prompt text, which is stored unchanged.
    """
    registry = getattr(self, "prompts", None) or {}
    # Previously the id itself was stored, so the bot would have received
    # e.g. "copilot_prompt" as its entire system prompt.
    self.system_prompt = registry.get(system_prompt_id, system_prompt_id)
274
+
275
def chat(self, text: str, web_search: bool = False, _stream_yield_all: bool = False, retry_count: int = 5, conversation: Conversation = None, *args, **kwargs) -> Message:
    """Forward a chat request to the underlying bot and return its result.

    The named parameters are passed on positionally, in signature order,
    followed by any extra positional and keyword arguments.
    """
    forwarded = (text, web_search, _stream_yield_all, retry_count, conversation)
    return self.bot.chat(*forwarded, *args, **kwargs)
277
+
278
def get_all_documents(self) -> List[Document]:
    """Return every document currently held in the vector store.

    The documents are read from the store's docstore in index order. The
    previous approach ran a similarity search for ``"* *"``, which is not a
    wildcard and does not return the whole store.

    Returns:
        List[Document]: All stored documents; empty when the store is empty.
    """
    if not self.vector_store:
        self.setup_vector_store()

    store = self.vector_store
    entries = store.docstore._dict  # same access path add_documents() uses
    position_to_id = store.index_to_docstore_id
    ordered_ids = [position_to_id[position] for position in sorted(position_to_id)]
    # Skip ids with no docstore entry instead of raising on them.
    return [entries[doc_id] for doc_id in ordered_ids if doc_id in entries]
291
+
292
def generate_3d_scatterplot(self, num_points=1000):
    """
    Generate a 3D scatter plot of the vector store content.

    Documents are sampled down to ``num_points`` before any embedding is
    computed, so no work is spent on documents that are never plotted.

    :param num_points: Maximum number of points to plot (default: 1000)
    :return: None (displays the plot)
    :raises ValueError: if the store is empty or fewer than 3 points would
        be plotted (a 3-component PCA needs at least 3 samples)
    """
    # Get all documents using the get_all_documents method
    all_docs = self.get_all_documents()

    if not all_docs:
        raise ValueError("No documents found in the vector store.")
    if min(len(all_docs), num_points) < 3:
        raise ValueError("At least 3 points are required for a 3-component PCA projection.")

    import numpy as np

    # If we have more documents than requested points, sample randomly
    if len(all_docs) > num_points:
        chosen = np.random.choice(len(all_docs), num_points, replace=False)
        all_docs = [all_docs[position] for position in chosen]

    import plotly.graph_objects as go
    from sklearn.decomposition import PCA

    # Use a precomputed embedding when the document carries one, otherwise
    # embed its text.
    vectors = []
    for doc in all_docs:
        precomputed = getattr(doc, 'embedding', None)
        if precomputed is not None:
            vectors.append(precomputed)
        else:
            vectors.append(self.embeddings.embed_query(doc.page_content))
    vectors = np.array(vectors)

    # Perform PCA to reduce to 3 dimensions
    pca = PCA(n_components=3)
    vectors_3d = pca.fit_transform(vectors)

    # Create the 3D scatter plot
    fig = go.Figure(data=[go.Scatter3d(
        x=vectors_3d[:, 0],
        y=vectors_3d[:, 1],
        z=vectors_3d[:, 2],
        mode='markers',
        marker=dict(
            size=5,
            color=vectors_3d[:, 2],  # Color by z-dimension
            colorscale='Viridis',
            opacity=0.8
        )
    )])

    # Update layout
    fig.update_layout(
        title='3D Scatter Plot of Vector Store Content',
        scene=dict(
            xaxis_title='PCA Component 1',
            yaxis_title='PCA Component 2',
            zaxis_title='PCA Component 3'
        ),
        width=900,
        height=700,
    )

    # Show the plot
    fig.show()

    print(f"Generated 3D scatter plot with {len(vectors)} points.")
361
+
362
def listen_for_speech(self):
    """Record from the microphone and return the recognized text.

    Returns None when the audio could not be understood or the recognition
    request failed.
    """
    with speech_recognition.Microphone() as mic:
        rp("Listening...")
        captured = self.recognizer.listen(mic)

    try:
        spoken = self.recognizer.recognize_google(captured)
    except speech_recognition.UnknownValueError:
        rp("Sorry, I couldn't understand that.")
        return None
    except speech_recognition.RequestError as e:
        rp(f"Could not request results from Google Speech Recognition service; {e}")
        return None
    else:
        rp(f"You said: {spoken}")
        return spoken
377
+
378
def optimized_tts(self, text: str, output_file: str = "output.wav", speaking_rate: float = 5) -> str:
    """Synthesize ``text`` to ``output_file`` and return the file path.

    A RuntimeError caused by text that is too short is reported and
    swallowed; any other RuntimeError propagates. The path is returned in
    both cases.
    """
    start_time = time.time()
    rp(f"Starting TTS at {start_time}")
    try:
        engine = self.tts
        synthesis_args = dict(
            text=text,
            file_path=output_file,
            speaker=engine.speakers[0] if engine.speakers else None,
            language=engine.languages[0] if engine.languages else None,
            speed=speaking_rate,
            split_sentences=True,
        )
        engine.tts_to_file(**synthesis_args)
        end_time = time.time()
        rp(f"TTS generation took {end_time - start_time:.2f} seconds")

    except RuntimeError as e:
        if "Kernel size can't be greater than actual input" not in str(e):
            raise  # Re-raise if it's a different RuntimeError
        rp(f"Text too short for TTS: {text}")

    return output_file
400
+
401
@staticmethod
def play_mp3(file_path):
    """Play the audio file at ``file_path`` through ``playsound``."""
    audio_path = file_path
    playsound(audio_path)
404
+
405
def continuous_voice_chat(self):
    """Run an interactive chat loop until the user says or types 'exit'.

    Commands (case-insensitive, recognized as the whole query):
        * ``voice`` - read following queries from the microphone (default)
        * ``type``  - read following queries from the keyboard
        * ``exit``  - say goodbye and leave the loop

    Any other query triggers a web search whose result is substituted for
    ``<<WSCONTEXT>>`` in the system prompt, is then sent to the bot, and the
    reply is scanned for ``/Store:<url>`` and ``/Delete:<document>``
    directives before being printed and spoken.

    Side effects: sets ``self.input_method`` to the latest query and
    ``self.voice_chat_exit`` to True once the user exits.
    """
    self.input_method = None
    self.voice_chat_exit = False
    use_voice = True
    # Keep the unrendered prompt so the placeholder can be refreshed on every
    # turn; replacing in place consumed it after the first query.
    prompt_template = self.system_prompt

    while True:
        if use_voice:
            rp("Speak your query (or say 'exit' to quit):")
            query = self.listen_for_speech()
        else:
            query = input("Type your question(or type 'exit' to quit): \n")
        self.input_method = query

        if not query:
            continue

        command = query.strip().lower()
        # Mode switches persist; before, the typed text was read and then
        # overwritten by the next microphone capture.
        if command == "voice":
            use_voice = True
            continue
        if command == "type":
            use_voice = False
            continue
        if command == 'exit':
            rp("Goodbye!")
            self.play_mp3(self.optimized_tts("Ok, exiting!"))
            self.voice_chat_exit = True
            break

        result = self.web_search(query)
        # str() rather than "\n".join(): joining a string puts a newline
        # between every character.
        web_context = str(result) if result else "No Context Available from the websearch!"
        # NOTE(review): self.system_prompt is only handed to the bot when
        # __call__ creates a conversation - confirm the refreshed prompt
        # actually reaches the model on this path.
        self.system_prompt = prompt_template.replace("<<WSCONTEXT>>", web_context)

        # send_message() waits for the complete reply, so the directive checks
        # below operate on the final text.
        response = str(self.send_message(query))

        if "/Store:" in response:
            # NOTE: the URL comes from model output; treat it as untrusted.
            parts = response.rsplit("/Store:", 1)[1].split()
            url = parts[0] if parts else ""
            if url:
                rp(f"Fetching and storing data from link: {url}")
                try:
                    self.add_document_from_url(url)
                except Exception as e:
                    rp(f"Error while fetching data from {url}! {e}")
                    continue

        if "/Delete:" in response:
            parts = response.rsplit("/Delete:", 1)[1].split()
            document = parts[0] if parts else ""
            if document:
                rp(f"Deleting {document} from vectorstore!")
                try:
                    self.delete_document(document)
                except Exception as e:
                    rp(f"Error while deleting {document} from vectorstore! {e}")

        rp(f"Chatbot: {response}")

        self.play_mp3(self.optimized_tts(response))
475
+
476
+
477
def initialize_vector_store(
    self,
    initial_docs: Union[List[Union[str, Document]], str],
    embedding_model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    persist_directory: str = "faiss_index",
    index_name: str = "document_store"
) -> FAISS:
    """
    Initialize a FAISS vector store. If a persistent store exists, load and update it.
    Otherwise, create a new one from the initial documents.

    Args:
        initial_docs (Union[List[Union[str, Document]], str]): Initial documents to add if creating a new store.
        embedding_model_name (str): Accepted for compatibility but unused; the
            store always embeds with ``self.embeddings``.
        persist_directory (str): Directory to save/load the persistent vector store.
        index_name (str): Name of the index file.

    Returns:
        FAISS: The initialized or loaded FAISS vector store.

    Raises:
        ValueError: If no persisted store exists and ``initial_docs`` is empty.
    """
    # NOTE: loading unpickles the persisted docstore; only point
    # persist_directory at files you trust.
    allow_dangerous_deserialization = True
    index_file_path = os.path.join(persist_directory, f"{index_name}.faiss")

    # Convert initial_docs to a list of Document objects
    if isinstance(initial_docs, str):
        initial_docs = [Document(page_content=initial_docs)]
    elif isinstance(initial_docs, list):
        initial_docs = [
            doc if isinstance(doc, Document) else Document(page_content=doc)
            for doc in initial_docs
        ]

    if os.path.exists(index_file_path):
        print(f"Loading existing vector store from {index_file_path}")
        vector_store = FAISS.load_local(
            persist_directory,
            self.embeddings,
            index_name,
            allow_dangerous_deserialization=allow_dangerous_deserialization
        )

        # Update with new documents if any
        if initial_docs:
            print(f"Updating vector store with {len(initial_docs)} new documents")
            vector_store.add_documents(initial_docs)
            vector_store.save_local(persist_directory, index_name)
    else:
        # A new index cannot be built from nothing; fail with a clear message
        # instead of an obscure error from inside the index construction.
        if not initial_docs:
            raise ValueError("Cannot create a new vector store without any initial documents.")

        print(f"Creating new vector store with {len(initial_docs)} documents")
        vector_store = FAISS.from_documents(initial_docs, self.embeddings)

        # Ensure the directory exists
        os.makedirs(persist_directory, exist_ok=True)
        vector_store.save_local(persist_directory, index_name)

    return vector_store
532
+
533
def setup_vector_store(self):
    """Create an empty in-memory FAISS store and assign it to ``self.vector_store``."""
    from langchain.docstore import InMemoryDocstore

    # 384 is the vector size of the all-MiniLM-L6-v2 embeddings.
    embedding_size = 384
    flat_index = faiss.IndexFlatL2(embedding_size)
    empty_docstore = InMemoryDocstore({})
    index_to_id = {}

    self.vector_store = FAISS(self.embeddings, flat_index, empty_docstore, index_to_id)
545
+
546
+ """ def setup_vector_store(self):
547
+ self.vector_store = self.initialize_vector_store(['this your Birth, Rise and Shine a mighty bot'])
548
+
549
+ """
550
def add_documents_folder(self, folder_path):
    """Add every file found under ``folder_path`` (recursively) to the store."""
    file_paths = [
        os.path.join(current_dir, file_name)
        for current_dir, _, file_names in os.walk(folder_path)
        for file_name in file_names
    ]
    self.add_documents(file_paths)
557
+
558
def fetch_document(self, file_path):
    """Read a UTF-8 text file and wrap its content in a Document.

    Args:
        file_path: Path of the file to read.

    Returns:
        Document: The file content, with the path recorded under the
        ``source`` metadata key so the origin of the document is not lost.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    return Document(page_content=content, metadata={'source': file_path})
563
+
564
def add_documents(self, documents: List[str]):
    """Load the files at the given paths and add them to the vector store.

    Args:
        documents: Paths of the files to read and store.
    """
    if not self.vector_store:
        self.setup_vector_store()

    docs_to_add = [self.fetch_document(path) for path in documents]
    if not docs_to_add:
        # Nothing to embed; skip the store call entirely.
        return

    new_ids = self.vector_store.add_documents(docs_to_add)

    # Print the added documents for verification. The ids returned by the
    # store are used so that, on a non-empty store, the documents just added
    # are shown rather than the first entries of the index.
    for i, doc_id in enumerate(new_ids):
        rp(f"Added document {i}: {self.vector_store.docstore._dict[doc_id]}")
577
+
578
def add_document_from_url(self, url):
    """Download ``url`` and add the response body to the vector store.

    Args:
        url: Address to fetch. NOTE: callers may pass URLs taken from model
            output; treat the value as untrusted input.
    """
    if not self.vector_store:
        self.setup_vector_store()
    # A timeout keeps a stalled server from blocking the caller forever.
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        content = response.text
        # Record where the content came from alongside the text.
        document = Document(page_content=content, metadata={'source': url})
        self.vector_store.add_documents([document])
    else:
        rp(f"Failed to fetch URL content: {response.status_code}")
588
+
589
def delete_document(self, document):
    """Delete stored documents identified by ``document``.

    Args:
        document: Either a docstore id or the value of a stored document's
            ``source`` metadata entry.

    The previous implementation relied on ``in`` and ``delete_document``
    on the store, neither of which the FAISS store provides.
    """
    store = self.vector_store
    entries = store.docstore._dict if store else {}
    matching_ids = [
        doc_id
        for doc_id, stored in entries.items()
        if doc_id == document
        or (getattr(stored, 'metadata', None) or {}).get('source') == document
    ]

    if matching_ids:
        store.delete(matching_ids)
        rp(f"Deleted document: {document}")
    else:
        rp(f"Document not found: {document}")
595
+
596
def _add_to_vector_store(self, name, content):
    """Add ``content`` to the vector store as one document labelled ``name``.

    Args:
        name: Label for the document; stored as its ``source`` metadata and
            used in the log message.
        content: Text of the document.
    """
    document = Document(page_content=content, metadata={'source': name})
    self.vector_store.add_documents([document])
    rp(f"Added document to vector store: {name}")
    # The former "vectorizer" refresh was removed: it called
    # self.vectorizer, which is not defined in the code shown here, so it
    # failed after the document had already been added.
602
+
603
def clone_github_repo(self, repo_url, local_path='./repo'):
    """Clone ``repo_url`` into ``local_path`` unless that path already exists.

    Returns ``local_path`` in both cases.
    """
    if not os.path.exists(local_path):
        Repo.clone_from(repo_url, local_path)
        return local_path
    rp("Repository already cloned.")
    return local_path
609
+
610
def load_documents(self, repo_url, file_types=['*.py', '*.md', '*.txt', '*.html']):
    """Clone a repository and load every file matching ``file_types``.

    Args:
        repo_url: URL of the Git repository to clone.
        file_types: Glob patterns for the file names to load. The default
            list is only read, never modified.

    Returns:
        list: The loaded documents from all patterns combined.
    """
    local_repo_path = self.clone_github_repo(repo_url)

    # One loader per pattern: Python globbing does not expand shell-style
    # braces, so the former combined "**/{*.py,*.md,...}" pattern did not
    # match the intended files.
    loaded = []
    for pattern in file_types:
        loader = DirectoryLoader(path=local_repo_path, glob=f"**/{pattern}", show_progress=True, recursive=True)
        loaded.extend(loader.load())

    rp(f"Nr. files loaded: {len(loaded)}")
    return loaded
616
+
617
def recursive_glob(self, root_dir, patterns):
    """Recursively search for files matching the patterns in root_dir.

    Args:
        root_dir (str): The root directory to start the search from.
        patterns (list): List of file patterns to search for, e.g., ['*.py', '*.md'].

    Returns:
        list: List of paths to the files matching the patterns. A file that
        matches several patterns is listed once.
    """
    import fnmatch

    matched_files = []
    already_matched = set()
    for root, _dirs, files in os.walk(root_dir):
        for pattern in patterns:
            for filename in fnmatch.filter(files, pattern):
                path = os.path.join(root, filename)
                # Overlapping patterns would otherwise return the same file
                # more than once.
                if path not in already_matched:
                    already_matched.add(path)
                    matched_files.append(path)
    return matched_files
634
+
635
+
636
def load_documents_from_github(self, repo_url, file_types=['*.py', '*.md', '*.txt', '*.html']):
    """Clone ``repo_url`` and add every file matching ``file_types`` to the store."""
    checkout_dir = self.clone_github_repo(repo_url)
    matching_paths = self.recursive_glob(checkout_dir, file_types)
    rp(f"Found {len(matching_paths)} documents")
    self.add_documents(matching_paths)
645
+
646
def split_documents(self, documents: list, chunk_s=512, chunk_o=0):
    """Split documents into chunks with a splitter chosen by file extension.

    Args:
        documents: Documents to split.
        chunk_s: Chunk size passed to the splitter.
        chunk_o: Chunk overlap passed to the splitter.

    Returns:
        tuple: ``(split_docs, splitter)`` - the list of chunks and the
        splitter used for the last document, or ``None`` when ``documents``
        is empty (this case used to raise UnboundLocalError).
    """
    split_docs = []
    splitter = None
    for doc in documents:
        # Prefer the 'source' metadata entry; the plain attributes are kept
        # as a fallback for objects that expose the path directly.
        metadata = getattr(doc, 'metadata', None) or {}
        path = metadata.get('source') or getattr(doc, 'source', '') or getattr(doc, 'filename', '') or ''
        ext = os.path.splitext(str(path))[1].lower()

        if ext == '.py':
            splitter = RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=chunk_s, chunk_overlap=chunk_o)
        elif ext in ['.md', '.markdown']:
            splitter = RecursiveCharacterTextSplitter.from_language(language=Language.MARKDOWN, chunk_size=chunk_s, chunk_overlap=chunk_o)
        elif ext in ['.html', '.htm']:
            splitter = RecursiveCharacterTextSplitter.from_language(language=Language.HTML, chunk_size=chunk_s, chunk_overlap=chunk_o)
        else:
            splitter = CharacterTextSplitter(chunk_size=chunk_s, chunk_overlap=chunk_o, add_start_index=True)

        split_docs.extend(splitter.split_documents([doc]))
    return split_docs, splitter
661
+
662
+
663
def save_vectorstore_local(self, folder_path: str="vectorstore", index_name: str = "faiss_index"):
    """
    Save the FAISS vectorstore locally with all necessary components.

    Besides the store's own ``save_local`` output, two pickle files are
    written: ``<index_name>_docstore.pkl`` (doc id -> Document) and
    ``<index_name>_index_to_docstore_id.pkl`` (index position -> doc id).

    Args:
        folder_path (str): Folder path to save index, docstore, and index_to_docstore_id to.
        index_name (str): Name for the saved index file (default is "faiss_index").

    Returns:
        str: ``folder_path``.
    """
    import pickle

    os.makedirs(folder_path, exist_ok=True)

    # Take both mappings from the store itself. Previously they were rebuilt
    # from a retriever query with freshly generated ids, which no longer
    # corresponded to the positions in the saved index (and the statement
    # carried a stray "<--error" marker that raised NameError).
    docstore: Dict[str, Document] = dict(self.vector_store.docstore._dict)
    index_to_docstore_id: Dict[int, str] = dict(self.vector_store.index_to_docstore_id)

    # Save the FAISS index
    self.vector_store.save_local(folder_path, index_name)

    # Save the docstore
    with open(os.path.join(folder_path, f"{index_name}_docstore.pkl"), "wb") as f:
        pickle.dump(docstore, f)

    # Save the index_to_docstore_id mapping
    with open(os.path.join(folder_path, f"{index_name}_index_to_docstore_id.pkl"), "wb") as f:
        pickle.dump(index_to_docstore_id, f)

    rp(f"Vectorstore saved successfully to {folder_path}")
    return folder_path
699
+
700
+
701
@classmethod
def load_vectorstore_local(cls, folder_path: str, index_name: str = "faiss_index", embeddings=None):
    """
    Load a previously saved FAISS vectorstore.

    ``FAISS.load_local`` restores the index together with its docstore and
    ``index_to_docstore_id`` mapping, so the result is returned as loaded.
    The extra ``*_docstore.pkl`` / ``*_index_to_docstore_id.pkl`` files are
    not read: replacing the restored docstore with their contents can leave
    the store with IDs that do not match the index.

    Args:
        folder_path (str): Folder path where the index, docstore, and index_to_docstore_id are saved.
        index_name (str): Name of the saved index file (default is "faiss_index").
        embeddings: The embeddings object to use (must be the same type used when saving).
    Returns:
        FAISS: Loaded FAISS vectorstore
    """
    # SECURITY: load_local unpickles the stored docstore. Only load folders
    # whose origin you trust; the flag below opts in to that deserialization.
    allow_dangerous_deserialization = True

    return FAISS.load_local(
        folder_path,
        embeddings,
        index_name,
        allow_dangerous_deserialization=allow_dangerous_deserialization,
    )
734
+
735
def create_vectorstore_from_github(self):
    """Build ``self.vector_store`` from the documents of ``self.repo_url`` and save it.

    Documents come from ``self.load_documents_from_github`` and are split
    with ``self.split_documents`` (chunk size 512, no overlap). The splitter
    used last is kept in ``self.latest_splitter``. The store is saved to the
    ``"vectorstore"`` folder under the index name ``"faiss_index"``, the same
    defaults as ``save_vectorstore_local``.

    Returns:
        None. When the loader returns nothing, the method reports it and
        leaves ``self.vector_store`` untouched.
    """
    documents = self.load_documents_from_github(self.repo_url)
    if not documents:
        # Splitting None or embedding an empty list would only raise.
        rp("No documents were returned for the repository; vectorstore not created.")
        return

    split_docs, splitter = self.split_documents(documents, 512, 0)
    self.latest_splitter = splitter
    self.vector_store = FAISS.from_documents(split_docs, self.embeddings)
    # save_local requires a target folder; calling it with no arguments fails.
    self.vector_store.save_local("vectorstore", "faiss_index")
    rp(f"Vectorstore created with {len(split_docs)} documents.")
742
+
743
def update_vectorstore(self, new_documents):
    """Split ``new_documents`` and append the chunks to ``self.vector_store``.

    The splitter returned by ``self.split_documents`` is kept in
    ``self.latest_splitter``.
    """
    chunking = self.split_documents(new_documents)
    chunks = chunking[0]
    self.latest_splitter = chunking[1]
    self.vector_store.add_documents(chunks)
    added = len(chunks)
    rp(f"Vectorstore updated with {added} new documents.")
748
+
749
+
750
def retrieve_with_chain(self, query, mode='high'):
    """Invoke a retriever with ``{"input": query}`` and return its result.

    Args:
        query: Value placed under the ``"input"`` key of the payload.
        mode: ``'high'`` uses ``self.compressed_retriever``; any other value
            uses ``self.retriever``.
    """
    payload = {"input": query}
    if mode != 'high':
        return self.retriever.invoke(payload)
    return self.compressed_retriever.invoke(payload)
755
+
756
def generate_code(self, prompt):
    """Select the code-generator system prompt and send ``prompt`` unchanged.

    Returns:
        Whatever ``self.send_message`` returns.
    """
    prompt_key = "code_generator_prompt"
    self.system_prompt = self.prompts[prompt_key]
    reply = self.send_message(prompt)
    return reply
759
+
760
def debug_script(self, script):
    """Select the script-debugger system prompt and ask for ``script`` to be debugged.

    Returns:
        Whatever ``self.send_message`` returns.
    """
    prompt_key = "script_debugger_prompt"
    self.system_prompt = self.prompts[prompt_key]
    request = f"Debug the following script:\n\n{script}"
    return self.send_message(request)
763
+
764
+ def test_software(self, software_description):
765
+ self.system_prompt = self.prompts["software_tester_prompt"]
766
+ return self.send_message(f"Create a test plan for the following software:\n\n{software_description}")
767
+
768
def parse_todo(self, todo_list):
    """Select the TODO-parser system prompt and ask for ``todo_list`` to be organized.

    Returns:
        Whatever ``self.send_message`` returns.
    """
    prompt_key = "todo_parser_prompt"
    self.system_prompt = self.prompts[prompt_key]
    request = f"Parse and organize the following TODO list:\n\n{todo_list}"
    return self.send_message(request)
771
+
772
def tell_story(self, prompt):
    """Select the story-teller system prompt and request a story via ``stream_response``.

    Returns:
        Whatever ``self.stream_response`` returns.
    """
    prompt_key = "story_teller_prompt"
    self.system_prompt = self.prompts[prompt_key]
    request = f"Tell a story based on this prompt:\n\n{prompt}"
    return self.stream_response(request)
775
+
776
def act_as_copilot(self, task):
    """Select the copilot system prompt and ask for assistance with ``task``.

    Returns:
        Whatever ``self.send_message`` returns.
    """
    prompt_key = "copilot_prompt"
    self.system_prompt = self.prompts[prompt_key]
    request = f"Assist me as a copilot for the following task:\n\n{task}"
    return self.send_message(request)
779
+
780
def control_iterations(self, task, max_iterations=5):
    """Send ``task`` repeatedly, accumulating each response into a transcript.

    Each message carries the 1-based iteration number, the task and the
    transcript so far. The loop stops after ``max_iterations`` rounds, or
    right after a response containing ``"TASK_COMPLETE"`` has been recorded.

    Args:
        task: Task text included in every message.
        max_iterations: Upper bound on the number of messages sent.

    Returns:
        The accumulated transcript (empty string when no round ran).
    """
    self.system_prompt = self.prompts["iteration_controller_prompt"]
    transcript = ""
    for step in range(1, max_iterations + 1):
        reply = self.send_message(f"Iteration {step} for task:\n\n{task}\n\nCurrent result:\n{transcript}")
        transcript += f"\nIteration {step}:\n{reply}"
        if "TASK_COMPLETE" in reply:
            break
    return transcript
791
+
792
def voice_command_mode(self):
    """Loop over spoken commands until "exit voice mode" is heard.

    Each command from ``self.listen_for_speech`` is dispatched through
    ``self.process_voice_command``. The response is printed, passed to
    ``self.optimized_tts``, and ``'output.wav'`` is then played with
    ``self.play_mp3``. A ``None`` result from the listener is skipped.
    """
    rp("Entering voice command mode. Speak your commands.")
    while True:
        heard = self.listen_for_speech()
        if heard is None:
            continue
        if heard.lower() == "exit voice mode":
            rp("Exiting voice command mode.")
            return
        answer = self.process_voice_command(heard)
        rp(f"Assistant: {answer}")
        self.optimized_tts(answer)
        self.play_mp3('output.wav')
805
+
806
def process_voice_command(self, command):
    """Route a spoken command to the handler whose trigger phrase it contains.

    Trigger phrases are matched case-insensitively, in the order listed
    below; the first match wins and receives the original ``command`` text.
    Commands without a trigger phrase go to ``self.send_message``.
    """
    spoken = command.lower()
    routes = (
        ("generate code", "generate_code"),
        ("debug script", "debug_script"),
        ("test software", "test_software"),
        ("parse todo", "parse_todo"),
        ("tell story", "tell_story"),
        ("act as copilot", "act_as_copilot"),
    )
    for phrase, handler_name in routes:
        if phrase in spoken:
            # Looked up lazily so only the matching handler is touched.
            return getattr(self, handler_name)(command)
    return self.send_message(command)
821
+
822
def interactive_mode(self):
    """Run a text chat loop on standard input.

    ``exit`` leaves the loop, ``voice`` hands over to
    ``self.voice_command_mode`` and ``command`` to ``self.command_mode``
    (all matched case-insensitively). Any other input is sent with
    ``self.send_message`` and the response is printed.
    """
    rp("Entering interactive mode. Type 'exit' to quit, 'voice' for voice input, or 'command' for specific functions.")
    while True:
        typed = input("You: ")
        keyword = typed.lower()
        if keyword == 'exit':
            rp("Exiting interactive mode.")
            return
        if keyword == 'voice':
            self.voice_command_mode()
            continue
        if keyword == 'command':
            self.command_mode()
            continue
        answer = self.send_message(typed)
        rp(f"Assistant: {answer}")
836
+
837
def command_mode(self):
    """Read command names from standard input and run each via ``execute_command``.

    Typing ``exit`` (any letter case) ends the loop.
    """
    rp("Entering command mode. Available commands: generate_code, debug_script, test_software, parse_todo, tell_story, copilot, iterate")
    question = "Enter command (or 'exit' to return to interactive mode): "
    while True:
        entered = input(question)
        if entered.lower() != 'exit':
            self.execute_command(entered)
            continue
        rp("Exiting command mode.")
        return
845
+
846
def execute_command(self, command):
    """Prompt for the inputs of one named command, run it and print the response.

    Recognized commands: ``add_to_vectorstore``, ``generate_code``,
    ``debug_script``, ``test_script`` (also accepted as ``test_software``,
    the name advertised by ``command_mode``), ``parse_todo``, ``tell_story``,
    ``copilot`` and ``iterate``. Anything else yields an "unknown command"
    response.

    Args:
        command: Command name exactly as typed by the user.
    """
    if command == "add_to_vectorstore":
        prompt = input("Enter list of files, folders, urls or repos with knowledge to add:")
        # NOTE(review): this still goes to generate_code; it probably belongs
        # to an ingestion method -- confirm the intended target.
        response = self.generate_code(prompt)
    elif command == "generate_code":
        # The filename is requested but not used by the generation call.
        input("Enter script filename:")
        prompt = input("Enter code generation prompt:")
        response = self.generate_code(prompt)
    elif command == "debug_script":
        script = input("Enter script to debug:")
        response = self.debug_script(script)
    elif command in ("test_script", "test_software"):
        description = input("Enter path to script:")
        response = self.test_software(description)
    elif command == "parse_todo":
        todo_list = input("Enter TODO list:")
        response = self.parse_todo(todo_list)
    elif command == "tell_story":
        prompt = input("Enter story prompt:")
        response = self.tell_story(prompt)
    elif command == "copilot":
        task = input("Enter task for copilot:")
        response = self.act_as_copilot(task)
    elif command == "iterate":
        task = input("Enter task for iteration:")
        raw_limit = input("Enter maximum number of iterations: ")
        try:
            max_iterations = int(raw_limit)
        except ValueError:
            # Report bad input instead of crashing out of command mode.
            response = f"Invalid iteration count: {raw_limit!r}. Please enter a whole number."
        else:
            response = self.control_iterations(task, max_iterations)
    else:
        response = "Unknown command. Please try again."

    rp(f"Assistant: {response}")
877
+
878
def run(self):
    """Show the start menu and launch the mode chosen on standard input.

    ``1`` starts ``interactive_mode``, ``2`` starts ``continuous_voice_chat``
    and ``3`` starts ``command_mode``. Any other answer prints a notice and
    returns.
    """
    menu_lines = (
        "Welcome to the Advanced AI Toolkit!",
        "Choose a mode to start:",
        "1. Interactive Chat",
        "2. Voice Chat",
        "3. Command Mode",
    )
    for line in menu_lines:
        rp(line)

    selection = input("Enter your choice (1/2/3): ")
    mode_by_choice = {
        '1': 'interactive_mode',
        '2': 'continuous_voice_chat',
        '3': 'command_mode',
    }
    mode_name = mode_by_choice.get(selection)
    if mode_name is None:
        rp("Invalid choice. Exiting.")
        return
    # Resolved only after a valid choice, so no other mode is touched.
    getattr(self, mode_name)()
894
+
895
if __name__ == "__main__":
    # Credentials are read from the EMAIL and PASSWD environment variables.
    UberToolkit(os.getenv("EMAIL"), os.getenv("PASSWD")).run()
900
+
901
+