Spaces:

JustKiddo
/

IOTraining

Sleeping

App Files Community

JustKiddo committed on Dec 12, 2024

Commit

7f79d8b

verified ·

1 Parent(s): dc14176

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -52

app.py CHANGED Viewed

@@ -4,54 +4,79 @@ import torch
 from transformers import AutoTokenizer, AutoModel
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
-# Get the port from Heroku environment, default to 8501 for local development
-PORT = int(os.environ.get('PORT', 8501))
-class LazyLoadModel:
-    def __init__(self, model_name='intfloat/multilingual-e5-small'):
-        self.model_name = model_name
-        self._tokenizer = None
-        self._model = None
-    @property
-    def tokenizer(self):
-        if self._tokenizer is None:
-            print("Loading tokenizer...")
-            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        return self._tokenizer
-    @property
-    def model(self):
-        if self._model is None:
-            print("Loading model...")
-            # Use float16 to reduce memory and potentially speed up loading
-            self._model = AutoModel.from_pretrained(self.model_name, torch_dtype=torch.float16)
-        return self._model
 class VietnameseChatbot:
-    def __init__(self):
         """
-        Initialize the Vietnamese chatbot with lazy-loaded model
         """
-        self.model_loader = LazyLoadModel()
-        # Very minimal conversation data to reduce startup time
-        self.conversation_data = [
-            {"query": "Xin chào", "response": "Chào bạn!"},
-            {"query": "Bạn là ai?", "response": "Tôi là trợ lý AI."},
         ]
     def embed_text(self, text):
         """
         Generate embeddings for input text
         """
         try:
             # Tokenize and generate embeddings
-            inputs = self.model_loader.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
             with torch.no_grad():
-                model_output = self.model_loader.model(**inputs)
             # Mean pooling
             embeddings = self.mean_pooling(model_output, inputs['attention_mask'])
@@ -59,7 +84,7 @@ class VietnameseChatbot:
         except Exception as e:
             print(f"Embedding error: {e}")
             return None
     def mean_pooling(self, model_output, attention_mask):
         """
         Perform mean pooling on model output
@@ -67,7 +92,7 @@ class VietnameseChatbot:
         token_embeddings = model_output[0]
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
     def get_response(self, user_query):
         """
         Find the most similar response from conversation data
@@ -77,15 +102,10 @@ class VietnameseChatbot:
             query_embedding = self.embed_text(user_query)
             if query_embedding is None:
-                return "Xin lỗi, đã có lỗi xảy ra."
-            # Embed conversation data
-            conversation_embeddings = np.array([
-                self.embed_text(item['query'])[0] for item in self.conversation_data
-            ])
             # Calculate cosine similarities
-            similarities = cosine_similarity(query_embedding, conversation_embeddings)[0]
             # Find most similar response
             best_match_index = np.argmax(similarities)
@@ -94,26 +114,33 @@ class VietnameseChatbot:
             if similarities[best_match_index] > 0.5:
                 return self.conversation_data[best_match_index]['response']
-            return "Xin lỗi, tôi không hiểu câu hỏi của bạn."
         except Exception as e:
             print(f"Response generation error: {e}")
             return "Đã xảy ra lỗi. Xin vui lòng thử lại."
 def main():
-    # Server configuration to use Heroku-assigned port
-    if 'PORT' in os.environ:
-        #st.set_option('server.port', PORT)
-        print(f"Server starting on port {PORT}")
     st.title("🤖 Trợ Lý AI Tiếng Việt")
-    # Initialize chatbot
     chatbot = VietnameseChatbot()
     # Chat history in session state
     if 'messages' not in st.session_state:
         st.session_state.messages = []
     # Display chat messages
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
@@ -138,8 +165,5 @@ def main():
         # Add assistant message to chat history
         st.session_state.messages.append({"role": "assistant", "content": response})
-# Logging for Heroku diagnostics
-print("Chatbot application is initializing...")
 if __name__ == "__main__":
     main()

 from transformers import AutoTokenizer, AutoModel
 import numpy as np
 from sklearn.metrics.pairwise import cosine_similarity
+import json
 class VietnameseChatbot:
+    def __init__(self, model_name='intfloat/multilingual-e5-small'):
         """
+        Initialize the Vietnamese chatbot with pre-loaded model and conversation data
         """
+        # Load pre-trained model and tokenizer
+        print("Loading tokenizer...")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        print("Loading model...")
+        self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16)
+        # Load comprehensive conversation dataset
+        self.conversation_data = self._load_conversation_data()
+        # Pre-compute embeddings for faster response generation
+        print("Pre-computing conversation embeddings...")
+        self.conversation_embeddings = self._precompute_embeddings()
+    def _load_conversation_data(self):
+        """
+        Load a comprehensive conversation dataset
+        """
+        return [
+            # Greeting conversations
+            {"query": "Xin chào", "response": "Chào bạn! Tôi có thể giúp gì cho bạn?"},
+            {"query": "Hi", "response": "Xin chào! Tôi là trợ lý AI tiếng Việt."},
+            {"query": "Chào buổi sáng", "response": "Chào buổi sáng! Chúc bạn một ngày tốt lành."},
+            # Identity and purpose
+            {"query": "Bạn là ai?", "response": "Tôi là trợ lý AI được phát triển để hỗ trợ và trò chuyện bằng tiếng Việt."},
+            {"query": "Bạn từ đâu đến?", "response": "Tôi được phát triển bởi một nhóm kỹ sư AI, và tôn chỉ của tôi là hỗ trợ con người."},
+            # Small talk
+            {"query": "Bạn thích gì?", "response": "Tôi thích học hỏi và giúp đỡ mọi người. Mỗi cuộc trò chuyện là một cơ hội để tôi phát triển."},
+            {"query": "Bạn có thể làm gì?", "response": "Tôi có thể trò chuyện, trả lời câu hỏi, và hỗ trợ bạn trong nhiều tình huống khác nhau."},
+            # Weather and time
+            {"query": "Thời tiết hôm nay thế nào?", "response": "Xin lỗi, tôi không thể cung cấp thông tin thời tiết trực tiếp. Bạn có thể kiểm tra ứng dụng dự báo thời tiết."},
+            {"query": "Bây giờ là mấy giờ?", "response": "Tôi là trợ lý AI, nên không thể xem đồng hồ. Bạn có thể kiểm tra thiết bị của mình."},
+            # Assistance offers
+            {"query": "Tôi cần trợ giúp", "response": "Tôi sẵn sàng hỗ trợ bạn. Bạn cần giúp gì?"},
+            {"query": "Giúp tôi với cái gì đó", "response": "Vâng, tôi có thể hỗ trợ bạn. Hãy cho tôi biết chi tiết hơn."},
+            # Farewell
+            {"query": "Tạm biệt", "response": "Hẹn gặp lại! Chúc bạn một ngày tốt đẹp."},
+            {"query": "Bye", "response": "Tạm biệt! Rất vui được trò chuyện với bạn."},
         ]
+    def _precompute_embeddings(self):
+        """
+        Pre-compute embeddings for all conversation queries
+        """
+        embeddings = []
+        for item in self.conversation_data:
+            embedding = self.embed_text(item['query'])
+            if embedding is not None:
+                embeddings.append(embedding[0])
+        return np.array(embeddings)
     def embed_text(self, text):
         """
         Generate embeddings for input text
         """
         try:
             # Tokenize and generate embeddings
+            inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
             with torch.no_grad():
+                model_output = self.model(**inputs)
             # Mean pooling
             embeddings = self.mean_pooling(model_output, inputs['attention_mask'])
         except Exception as e:
             print(f"Embedding error: {e}")
             return None
     def mean_pooling(self, model_output, attention_mask):
         """
         Perform mean pooling on model output
         token_embeddings = model_output[0]
         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
     def get_response(self, user_query):
         """
         Find the most similar response from conversation data
             query_embedding = self.embed_text(user_query)
             if query_embedding is None:
+                return "Xin lỗi, đã có lỗi xảy ra khi phân tích câu hỏi của bạn."
             # Calculate cosine similarities
+            similarities = cosine_similarity(query_embedding, self.conversation_embeddings)[0]
             # Find most similar response
             best_match_index = np.argmax(similarities)
             if similarities[best_match_index] > 0.5:
                 return self.conversation_data[best_match_index]['response']
+            return "Xin lỗi, tôi chưa hiểu rõ câu hỏi của bạn. Bạn có thể diễn đạt lại được không?"
         except Exception as e:
             print(f"Response generation error: {e}")
             return "Đã xảy ra lỗi. Xin vui lòng thử lại."
 def main():
+    st.set_page_config(
+        page_title="Trợ Lý AI Tiếng Việt",
+        page_icon="🤖",
+    )
     st.title("🤖 Trợ Lý AI Tiếng Việt")
+    st.caption("Trò chuyện với trợ lý AI được phát triển bằng mô hình đa ngôn ngữ")
+    # Initialize chatbot (this will pre-load models and embeddings)
     chatbot = VietnameseChatbot()
     # Chat history in session state
     if 'messages' not in st.session_state:
         st.session_state.messages = []
+    # Sidebar for additional information
+    with st.sidebar:
+        st.header("Về Trợ Lý AI")
+        st.write("Đây là một trợ lý AI được phát triển để hỗ trợ trò chuyện bằng tiếng Việt.")
+        st.write("Mô hình sử dụng: intfloat/multilingual-e5-small")
     # Display chat messages
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
         # Add assistant message to chat history
         st.session_state.messages.append({"role": "assistant", "content": response})
 if __name__ == "__main__":
     main()