Update app.py
app.py
CHANGED
@@ -15,7 +15,7 @@ from langchain.vectorstores import Chroma
 
 
 
-@st.cache_resource
+@st.cache_resource(show_spinner=False)
 def load_model(model_name):
     logger.info("Loading model ..")
     start_time = time.time()
@@ -48,7 +48,7 @@ def load_model(model_name):
     return model, tokenizer
 
 
-@st.cache_resource
+@st.cache_resource(show_spinner=False)
 def load_db(device, local_embed=False, CHROMA_PATH = './ChromaDB'):
     """
     Load vector embeddings and Chroma database
@@ -64,12 +64,9 @@ def load_db(device, local_embed=False, CHROMA_PATH = './ChromaDB'):
         PATH_TO_EMBEDDING_FOLDER = ""
         # TODO : load only pytorch bin file
        embeddings = AutoModel.from_pretrained(PATH_TO_EMBEDDING_FOLDER, trust_remote_code=True)
-        embeddings = HuggingFaceBgeEmbeddings(model_name="
+        embeddings = HuggingFaceBgeEmbeddings(model_name=" ", model_kwargs={"trust_remote_code": True})
         logger.info('Loading embeddings locally.')
-
-        embed = embeddings.get_text_embedding("Hello World!")
-        print(len(embed))
-        print(embed[:5])
+
 
     else:
         embeddings = HuggingFaceBgeEmbeddings(model_name=embed_id, model_kwargs={"device": device}, encode_kwargs=encode_kwargs)
@@ -160,15 +157,15 @@ def llm_chain_with_context(model, model_name, query, context, template):
 
 def generate_response(query, model, template):
     start_time = time.time()
-    progress_text = "
+    progress_text = "Running Inference. Please wait."
     my_bar = st.progress(0, text=progress_text)
-    context = fetch_context(db, model, model_name, query, template)
     # fill those as appropriate
-    my_bar.progress(0.1, "Loading Database. Please wait.")
+    #my_bar.progress(0.1, "Loading Database. Please wait.")
 
-    my_bar.progress(0.3, "Loading Model. Please wait.")
+    #my_bar.progress(0.3, "Loading Model. Please wait.")
 
-    my_bar.progress(0.
+    my_bar.progress(0.1, "Running RAG. Please wait.")
+    context = fetch_context(db, model, model_name, query, template)
 
     my_bar.progress(0.7, "Generating Answer. Please wait.")
     response = llm_chain_with_context(model, model_name, query, context, template)
@@ -205,6 +202,12 @@ def set_as_background_img(png_file):
     st.markdown(background_img, unsafe_allow_html=True)
     return
 
+
+def stream_to_screen(response):
+    for word in response.split():
+        yield word + " "
+        time.sleep(0.05)
+
 
 if __name__=="__main__":
 
@@ -272,10 +275,17 @@ if __name__=="__main__":
 
     Question: {question}\n> Context:\n>>>\n{context}\n>>>\nRelevant parts"""}
 
-
+    # Loading and caching db and model
+    my_bar = st.progress(0, "Loading Database. Please wait.")
+    my_bar.progress(0.1, "Loading Embedding & Database. Please wait.")
     db = load_db(device)
+    my_bar.progress(0.7, "Loading Model. Please wait.")
     model, tokenizer = load_model(model_name)
+    my_bar.progress(1.0, "Done")
+    time.sleep(1)
+    my_bar.empty()
 
+
     response = False
     user_question = st.chat_input('What do you want to ask ..')
 
@@ -286,11 +296,12 @@
         st.write(user_question)
 
     if response:
-
-
+        with st.chat_message("AI", avatar="🏛️"):
+            # to empty response container after first pass
+            st.write(" ")
 
     response = generate_response(user_question, model, all_templates)
     with st.chat_message("AI", avatar="🏛️"):
-        st.write(response)
+        st.write(stream_to_screen(response))
 
 
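A note on the caching change: @st.cache_resource(show_spinner=False) keeps the heavy objects (model, tokenizer, Chroma handle) alive across Streamlit reruns, and show_spinner=False suppresses the built-in "Running load_model(...)" spinner so it does not compete with the manual progress bar added under __main__. A minimal sketch of the pattern, with illustrative names that are not from app.py:

import time
import streamlit as st

@st.cache_resource(show_spinner=False)
def load_expensive_resource(name):
    # Runs once per unique `name`; later reruns return the cached object.
    time.sleep(2)  # stand-in for model / database loading
    return {"name": name}

bar = st.progress(0, "Loading. Please wait.")
resource = load_expensive_resource("demo")  # near-instant after the first run
bar.progress(1.0, "Done")
time.sleep(1)
bar.empty()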
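The local-embeddings branch now goes through HuggingFaceBgeEmbeddings with trust_remote_code instead of the raw AutoModel, but the model_name in the diff is left blank, so it still needs a real path. A hedged sketch of how that branch could look; the folder path below is hypothetical, not the repo's actual path:

from langchain.embeddings import HuggingFaceBgeEmbeddings

# Hypothetical local folder holding the embedding weights.
PATH_TO_EMBEDDING_FOLDER = "./models/bge-base-en-v1.5"

# sentence-transformers accepts a local directory in place of a hub id,
# so the same wrapper serves both the local and the remote case.
embeddings = HuggingFaceBgeEmbeddings(
    model_name=PATH_TO_EMBEDDING_FOLDER,
    model_kwargs={"trust_remote_code": True},
)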
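The new stream_to_screen generator makes the answer appear word by word: recent Streamlit versions render a generator passed to st.write as a stream, and the dedicated st.write_stream API does the same thing explicitly. A self-contained sketch, assuming the response is a plain string:

import time
import streamlit as st

def stream_to_screen(response):
    # Yield one word at a time so Streamlit can paint the answer incrementally.
    for word in response.split():
        yield word + " "
        time.sleep(0.05)

with st.chat_message("AI", avatar="🏛️"):
    st.write_stream(stream_to_screen("This answer appears word by word."))

If the installed Streamlit predates generator support in st.write, st.write_stream as used above is the safer call.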