K00B404 commited on
Commit
8d1e832
·
verified ·
1 Parent(s): e925bd7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +652 -0
app.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Here is one of the many custom scripts I have built.
2
+ # The cost to use it is exactly 0.
3
+ # It even runs with Llama 3.1 70B or 405B, and a few more models.
4
+
5
+ import streamlit as st
6
+ from llm_chatbot import LLMChatBot
7
+ from streamlit_option_menu import option_menu
8
+ import speech_recognition as sr
9
+ import pyttsx3
10
+ import os
11
+ import getpass
12
+ from uuid import uuid4
13
+ import faiss
14
+ import numpy as np
15
+ import requests
16
+ import io
17
+ import warnings
18
+ import torch
19
+ import pickle
20
+ import asyncio
21
+ import json
22
+ from git import Repo
23
+ from rich import print as rp
24
+ from typing import Union, List, Generator, Any, Mapping, Optional, Dict
25
+ from requests.sessions import RequestsCookieJar
26
+ from dotenv import load_dotenv, find_dotenv
27
+ from langchain import hub
28
+ from langchain_core.documents import Document
29
+ from langchain.chains.combine_documents import create_stuff_documents_chain
30
+ from langchain.chains import create_retrieval_chain
31
+ from langchain_community.document_loaders import DirectoryLoader
32
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
33
+ from langchain_huggingface import HuggingFaceEmbeddings
34
+ from langchain_community.vectorstores import Chroma, FAISS
35
+ from langchain.vectorstores.base import VectorStore
36
+ from langchain.retrievers import MultiQueryRetriever
37
+ from langchain.retrievers.self_query.base import SelfQueryRetriever
38
+ from langchain.llms import BaseLLM
39
+ from langchain.retrievers import ContextualCompressionRetriever
40
+ from langchain.retrievers.document_compressors import LLMChainExtractor
41
+ from langchain.retrievers.document_compressors import DocumentCompressorPipeline
42
+ from langchain_community.document_transformers import EmbeddingsRedundantFilter
43
+ from langchain_text_splitters import CharacterTextSplitter
44
+ from langchain.retrievers.document_compressors import EmbeddingsFilter
45
+ from langchain.memory.buffer import ConversationBufferMemory
46
+ from langchain.chains import StuffDocumentsChain, LLMChain, ConversationalRetrievalChain
47
+ from uber_toolkit_class import UberToolkit
48
+ from glob import glob
49
+ import numpy as np
50
+ import pandas as pd
51
+ import plotly.graph_objects as go
52
+ import plotly.express as px
53
+ from plotly.subplots import make_subplots
54
+ import plotly.io as pio
55
+ from sklearn.decomposition import PCA
56
+ from sklearn.preprocessing import MinMaxScaler
57
+ from langchain_core.documents import Document
58
+ from scipy.stats import gaussian_kde
59
+ from huggingface_hub import InferenceClient
60
+ from hugchat import hugchat
61
+ from hugchat.login import Login
62
+ from hugchat.message import Message
63
+ from hugchat.types.assistant import Assistant
64
+ from hugchat.types.model import Model
65
+ from hugchat.types.message import MessageNode, Conversation
66
+ from langchain_community.document_loaders import TextLoader
67
+ from TTS.api import TTS
68
+ import time
69
+ from playsound import playsound
70
+ from system_prompts import __all__ as prompts
71
+ from profiler import VoiceProfileManager, VoiceProfile
72
+
73
+ # Load environment variables
74
+ load_dotenv(find_dotenv())
75
+
76
class ChatbotApp:
    """Multi-page Streamlit app: a retrieval-augmented chatbot plus a dashboard.

    The "Chatbot" page builds a FAISS vector store from an uploaded file or a
    cloned Git repository and answers questions through a
    ``ConversationalRetrievalChain``. The "Dashboard" page plots a 3D PCA
    projection of the stored embeddings.

    State that must survive Streamlit reruns (vector store, conversation
    chain, chat history, voice-mode flag) is kept in ``st.session_state``.
    """

    def __init__(self, email, password, default_llm=1):
        """Store the chat-backend credentials and load the embedding model.

        Args:
            email: Login e-mail handed to ``LLMChatBot``.
            password: Login password handed to ``LLMChatBot``.
            default_llm: Model index handed to ``LLMChatBot``.
        """
        self.email = email
        self.password = password
        self.default_llm = default_llm
        self.embeddings = HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.vectorstore = None
        # Created lazily by setup_retriever / get_conversation_chain /
        # text_to_speech; initialised here so that reading them early gives
        # None instead of an AttributeError.
        self.retriever = None
        self.llm = None
        self.engine = None

    @staticmethod
    def _as_documents(items):
        """Wrap plain strings in ``Document``; pass ``Document`` objects through."""
        return [
            item if isinstance(item, Document) else Document(page_content=item)
            for item in items
        ]

    def _split_for_embedding(self, docs):
        """Split strings/Documents into 1000-character chunks with no overlap."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        return text_splitter.split_documents(self._as_documents(docs))

    def create_vectorstore_from_github(self, repo_url="YOUR_REPO_URL"):
        """Clone ``repo_url``, load every file and index it in ``self.vectorstore``.

        Args:
            repo_url: Repository to clone. The default is the placeholder the
                method used to hard-code; pass a real URL.
        """
        local_repo_path = self.clone_github_repo(repo_url)
        loader = DirectoryLoader(path=local_repo_path, glob="**/*", show_progress=True, recursive=True)
        # loader.load() already returns Document objects; they must not be
        # wrapped in Document(page_content=...) a second time.
        loaded_files = loader.load()
        split_documents = self._split_for_embedding(loaded_files)
        texts = [doc.page_content for doc in split_documents]
        print(f"Texts for embedding: {texts}")  # Debug print
        if not texts:
            print("No valid texts found for embedding. Check your repository content.")
            return
        self.vectorstore = FAISS.from_texts(texts, self.embeddings)

    def create_vectorstore(self, docs):
        """Build and return a FAISS store from ``docs`` without touching ``self``.

        Args:
            docs: Iterable of plain strings and/or ``Document`` objects.

        Returns:
            The new FAISS vector store.
        """
        split_documents = self._split_for_embedding(docs)
        texts = [doc.page_content for doc in split_documents]
        # Reuse the already-loaded embedding model instead of loading it again.
        return FAISS.from_texts(texts, self.embeddings)

    def setup_session_state(self):
        """Create every ``st.session_state`` key the pages read, if missing."""
        defaults = {
            'chat_history': [],
            'voice_mode': False,
            'vectorstore': None,
            'retriever': None,
            'compression_retriever': None,
            'conversation': None,
        }
        for key, value in defaults.items():
            if key not in st.session_state:
                st.session_state[key] = value

    def text_to_speech(self, text):
        """Speak ``text`` through pyttsx3, creating the engine on first use."""
        if getattr(self, 'engine', None) is None:
            self.engine = pyttsx3.init()
        self.engine.say(text)
        self.engine.runAndWait()

    def speech_to_text(self):
        """Record from the default microphone and transcribe it.

        Returns:
            The recognised text, or the literal string
            ``"Sorry, I didn't catch that."`` when recognition fails.
        """
        r = sr.Recognizer()
        with sr.Microphone() as source:
            st.write("Listening...")
            audio = r.listen(source)
        try:
            return r.recognize_google(audio)
        except (sr.UnknownValueError, sr.RequestError):
            # Unintelligible audio or an unreachable recognition service.
            return "Sorry, I didn't catch that."

    def setup_embeddings(self):
        """Return a new embeddings object configured like ``self.embeddings``."""
        return HuggingFaceEmbeddings(
            model_name="all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )

    def create_vector_store(self, docs):
        """Index ``docs`` in FAISS, store the result on ``self`` and return it.

        Args:
            docs: Iterable of plain strings and/or ``Document`` objects.

        Returns:
            The new vector store, or ``None`` when there was nothing to embed
            or FAISS raised. Callers assign the return value to
            ``st.session_state.vectorstore``, so it must not be dropped.
        """
        split_documents = self._split_for_embedding(docs)
        print(f"Split documents: {split_documents}")  # Debug print
        texts = [doc.page_content for doc in split_documents]
        print(f"Texts: {texts}")  # Debug print
        if not texts:
            print("No valid texts found for embedding. Check your repository content.")
            return None

        try:
            self.vectorstore = FAISS.from_texts(texts, self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {str(e)}")
            return None
        print("Vector store created successfully")
        return self.vectorstore

    def setup_retriever(self, k=5, similarity_threshold=0.76):
        """Build the plain and the compression retriever.

        The plain retriever goes to ``self.retriever``; the compression
        retriever (sentence splitter, redundancy filter, relevance filter)
        goes to ``st.session_state.compression_retriever``.

        Args:
            k: Number of documents the base retriever returns.
            similarity_threshold: Minimum similarity kept by the relevance filter.

        Raises:
            ValueError: If ``st.session_state.vectorstore`` is not set.
        """
        store = st.session_state.vectorstore
        if store is None:
            raise ValueError("Vectorstore is not initialized.")
        # The result count is passed through search_kwargs, not as a bare kwarg.
        self.retriever = store.as_retriever(search_kwargs={"k": k})
        splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
        redundant_filter = EmbeddingsRedundantFilter(embeddings=self.embeddings)
        relevant_filter = EmbeddingsFilter(embeddings=self.embeddings, similarity_threshold=similarity_threshold)
        pipeline_compressor = DocumentCompressorPipeline(
            transformers=[splitter, redundant_filter, relevant_filter]
        )
        st.session_state.compression_retriever = ContextualCompressionRetriever(
            base_compressor=pipeline_compressor, base_retriever=self.retriever
        )

    def create_retrieval_chain(self):
        """Build ``self.high_retrieval_chain`` and ``self.low_retrieval_chain``.

        Reads ``self.llm``, ``self.retriever`` and
        ``st.session_state.compression_retriever``, which this method does
        not create itself. Inside the body ``create_retrieval_chain`` resolves
        to the module-level LangChain function, not to this method.
        """
        rag_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
        combine_docs_chain = create_stuff_documents_chain(self.llm, rag_prompt)
        self.high_retrieval_chain = create_retrieval_chain(st.session_state.compression_retriever, combine_docs_chain)
        self.low_retrieval_chain = create_retrieval_chain(self.retriever, combine_docs_chain)

    def setup_tts(self, model_name="tts_models/en/ljspeech/fast_pitch"):
        """Load the TTS model into ``self.tts``."""
        # NOTE(review): vocoder_path is given a model name rather than a file
        # path - confirm against the installed TTS version.
        self.tts = TTS(model_name=model_name, progress_bar=False, vocoder_path='vocoder_models/en/ljspeech/univnet')

    def setup_speech_recognition(self):
        """Create the shared ``speech_recognition`` recognizer."""
        self.recognizer = sr.Recognizer()

    def setup_folders(self):
        """Create the working directories if they do not exist yet."""
        self.dirs = ["test_input", "vectorstore", "test"]
        for d in self.dirs:
            os.makedirs(d, exist_ok=True)

    def send_message(self, message, web=False):
        """Send ``message`` via ``self.llm.chat`` and wait for the full reply."""
        message_result = self.llm.chat(message, web_search=web)
        return message_result.wait_until_done()

    def stream_response(self, message, web=False, stream=False):
        """Collect the ``'token'`` of every item from ``self.llm.query``.

        Returns:
            The tokens joined with single spaces.
        """
        # NOTE(review): joining with ' ' doubles the whitespace if the tokens
        # already carry their own - confirm against LLMChatBot.query.
        responses = [resp['token'] for resp in self.llm.query(message, stream=stream, web_search=web)]
        return ' '.join(responses)

    def web_search(self, text):
        """Send ``text`` with web search enabled and return the reply."""
        return self.send_message(text, web=True)

    def retrieve_context(self, query: str):
        """Return the newline-joined content of all documents matching ``query``.

        Results of the plain retriever come first, followed by those of the
        compression retriever.
        """
        # Use the public retriever entry point; the private
        # _get_relevant_documents needs a run_manager argument.
        lowres = self.retriever.invoke(query)
        highres = st.session_state.compression_retriever.invoke(query)
        return "\n".join([doc.page_content for doc in lowres + highres])

    def get_conversation_chain(self):
        """Log in to the chat backend and build the conversational RAG chain.

        Uses the credentials and model index given to the constructor; when
        no e-mail or password was given, falls back to the ``EMAIL`` /
        ``PASSWD`` environment variables.

        Returns:
            A ``ConversationalRetrievalChain`` over
            ``st.session_state.vectorstore`` with buffer memory.

        Raises:
            ValueError: If ``st.session_state.vectorstore`` is not set.
        """
        store = st.session_state.vectorstore
        if store is None:
            raise ValueError("Vectorstore is not initialized.")
        email = self.email or os.getenv("EMAIL")
        password = self.password or os.getenv("PASSWD")
        self.llm = LLMChatBot(email, password, default_llm=self.default_llm)
        self.llm.create_new_conversation(system_prompt=self.llm.default_system_prompt, switch_to=True)
        memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=self.llm,
            retriever=store.as_retriever(),
            memory=memory
        )
        return conversation_chain

    async def handle_user_input(self, user_input):
        """Run one chat turn and render the whole history.

        In voice mode only the newest message of the turn is spoken, so
        earlier replies are not read aloud again on every turn.
        """
        conversation = st.session_state.get('conversation')
        if conversation is None:
            st.warning("Please upload a file or enter a repository URL first.")
            return
        response = conversation({'question': user_input})
        st.session_state.chat_history = response['chat_history']

        history = st.session_state.chat_history
        for i, message in enumerate(history):
            # Even positions are rendered as the human side, odd as the AI.
            speaker = "Human" if i % 2 == 0 else "AI"
            st.write(f"{speaker}: {message.content}")
        if st.session_state.voice_mode and history and len(history) % 2 == 0:
            # An even length means the last entry sits in an AI position.
            self.text_to_speech(history[-1].content)

    def clone_github_repo(self, repo_url, local_path='./repo'):
        """Clone ``repo_url`` into ``local_path`` unless that path already exists.

        Returns:
            ``local_path``.
        """
        # NOTE(review): an existing local_path is reused whatever repo_url
        # is, so a second, different URL returns the first clone.
        if os.path.exists(local_path):
            st.write("Repository already cloned.")
            return local_path
        Repo.clone_from(repo_url, local_path)
        return local_path

    @staticmethod
    def glob_recursive_multiple_extensions(base_dir, extensions):
        """Return every file under ``base_dir`` with one of ``extensions``.

        Args:
            base_dir: Directory to search recursively.
            extensions: Extensions without the leading dot, e.g. ``["py", "md"]``.

        Returns:
            List of matching paths, grouped in the order of ``extensions``.
        """
        all_files = []
        for ext in extensions:
            pattern = os.path.join(base_dir, '**', f'*.{ext}')
            all_files.extend(glob(pattern, recursive=True))
        return all_files

    def load_documents_from_github(self, repo_url, file_types=('*.py', '*.md', '*.txt', '*.html')):
        """Clone ``repo_url`` and load every file matching ``file_types``.

        Args:
            repo_url: Repository to clone.
            file_types: Glob patterns for the file names to load.

        Returns:
            The loaded ``Document`` objects.
        """
        local_repo_path = self.clone_github_repo(repo_url)
        loaded_files = []
        # pathlib globbing has no {a,b} brace expansion, so every pattern
        # gets its own recursive loader.
        for pattern in file_types:
            rp(pattern)
            loader = DirectoryLoader(path=local_repo_path, glob=pattern, show_progress=True, recursive=True, loader_cls=TextLoader)
            loaded_files.extend(loader.load())
        st.write(f"Nr. files loaded: {len(loaded_files)}")
        print(f"Loaded files: {len(loaded_files)}")  # Debug print
        # The loader already yields Document objects; re-wrapping them in
        # Document(page_content=...) is invalid.
        documents = list(loaded_files)
        print(f"Documents: {documents}")  # Debug print
        return documents

    def split_documents(self, documents, chunk_s=512, chunk_o=0):
        """Split each document with a splitter chosen by its file extension.

        The extension is taken from ``doc.metadata['source']``, falling back
        to ``source`` / ``filename`` attributes.

        Args:
            documents: ``Document`` objects to split.
            chunk_s: Chunk size passed to the splitter.
            chunk_o: Chunk overlap passed to the splitter.

        Returns:
            Tuple ``(split_docs, splitter)`` where ``splitter`` is the
            splitter used for the last document, or ``None`` when
            ``documents`` is empty.
        """
        languages = {
            '.py': Language.PYTHON,
            '.md': Language.MARKDOWN,
            '.markdown': Language.MARKDOWN,
            '.html': Language.HTML,
            '.htm': Language.HTML,
        }
        cache = {}  # one splitter per language, built on first use
        split_docs = []
        splitter = None
        for doc in documents:
            metadata = getattr(doc, 'metadata', None) or {}
            source = metadata.get('source') or getattr(doc, 'source', '') or getattr(doc, 'filename', '')
            ext = os.path.splitext(str(source))[1].lower()
            language = languages.get(ext)
            if language not in cache:
                if language is None:
                    cache[language] = RecursiveCharacterTextSplitter(chunk_size=chunk_s, chunk_overlap=chunk_o)
                else:
                    cache[language] = RecursiveCharacterTextSplitter.from_language(
                        language=language, chunk_size=chunk_s, chunk_overlap=chunk_o
                    )
            splitter = cache[language]
            split_docs.extend(splitter.split_documents([doc]))
        return split_docs, splitter

    def visualize_vectorstore(self):
        """Plot a 3D PCA projection of the stored embeddings with Plotly.

        Vectors are reconstructed from the FAISS index, and the matching
        documents are fetched from the docstore for the hover text.
        """
        store = st.session_state.vectorstore
        if store is None:
            st.write("Vectorstore is not initialized.")
            return
        total = store.index.ntotal
        if total < 3:
            # PCA with three components needs at least three samples.
            st.write("At least 3 documents are needed for a 3D projection.")
            return
        embeddings = np.asarray(store.index.reconstruct_n(0, total))
        documents = [store.docstore.search(store.index_to_docstore_id[i]) for i in range(total)]

        embeddings_3d = PCA(n_components=3).fit_transform(embeddings)
        embeddings_3d_normalized = MinMaxScaler().fit_transform(embeddings_3d)
        colors = embeddings_3d_normalized[:, 0]
        hover_text = [f"Document {i}:<br>{doc.page_content[:100]}..." for i, doc in enumerate(documents)]

        fig = go.Figure(data=[go.Scatter3d(
            x=embeddings_3d_normalized[:, 0],
            y=embeddings_3d_normalized[:, 1],
            z=embeddings_3d_normalized[:, 2],
            mode='markers',
            marker=dict(
                size=5,
                color=colors,
                colorscale='Viridis',
                opacity=0.8
            ),
            text=hover_text,
            hoverinfo='text'
        )])

        fig.update_layout(
            title="Interactive 3D Vectorstore Document Distribution",
            scene=dict(
                xaxis_title="PCA Component 1",
                yaxis_title="PCA Component 2",
                zaxis_title="PCA Component 3"
            ),
            width=800,
            height=600,
        )
        st.plotly_chart(fig)

    def _install_vectorstore(self, store):
        """Publish ``store`` in the session state and rebuild the conversation chain."""
        if store is None:
            st.write("No vector store could be created from the supplied content.")
            return
        st.session_state.vectorstore = store
        st.session_state.conversation = self.get_conversation_chain()

    def chatbot_page(self):
        """Render the chat page: context sources, text chat and voice chat."""
        st.title("Chatbot")
        # Toggle for voice mode
        st.session_state.voice_mode = st.toggle("Voice Mode")

        # File uploader for context injection
        uploaded_file = st.file_uploader("Choose a file for context injection")
        if uploaded_file is not None:
            documents = [uploaded_file.read().decode()]
            self._install_vectorstore(self.create_vector_store(documents))

        # GitHub repository URL input
        repo_url = st.text_input("Enter GitHub repository URL")
        if repo_url:
            documents = self.load_documents_from_github(repo_url)
            split_docs, _ = self.split_documents(documents)
            self._install_vectorstore(self.create_vector_store(split_docs))

        # Chat interface
        user_input = st.text_input("You: ", key="user_input")
        if user_input:
            asyncio.run(self.handle_user_input(user_input))

        if st.session_state.voice_mode:
            if st.button("Speak"):
                user_speech = self.speech_to_text()
                st.text_input("You: ", value=user_speech, key="user_speech_input")
                if user_speech != "Sorry, I didn't catch that.":
                    asyncio.run(self.handle_user_input(user_speech))

    def dashboard_page(self):
        """Render the dashboard page with the vector-store visualisation."""
        st.title("Dashboard")

        if st.session_state.vectorstore is not None:
            st.write("Vectorstore Visualization")
            self.visualize_vectorstore()
        else:
            st.write("Vectorstore is not initialized. Please add documents in the Chatbot page.")

    def main(self):
        """Configure the page, initialise session state and show the chosen page."""
        st.set_page_config(page_title="Enhanced Multi-page Chatbot App", layout="wide")
        # Both pages read session keys that do not exist on a fresh session.
        self.setup_session_state()

        # Sidebar navigation
        with st.sidebar:
            selected = option_menu(
                menu_title="Navigation",
                options=["Chatbot", "Dashboard"],
                icons=["chat", "bar-chart"],
                menu_icon="cast",
                default_index=0,
            )

        if selected == "Chatbot":
            self.chatbot_page()
        elif selected == "Dashboard":
            self.dashboard_page()
645
+
646
+
647
if __name__ == "__main__":
    # Script entry point: build the app with the credentials found in the
    # EMAIL / PASSWD environment variables and hand control to its main().
    ChatbotApp(os.getenv("EMAIL"), os.getenv("PASSWD")).main()
652
+ #https://www.linkedin.com/pulse/multi-type-ragollama31-405b-chatbot-boudewijn-kooy-t5lue/?trackingId=Q5pqCmYoQYGWkbViMWtqLQ%3D%3D