FESG1234 committed on
Commit
0b2c8a7
·
verified ·
1 Parent(s): b37e700

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -164
app.py CHANGED
@@ -1,192 +1,138 @@
1
  import torch
2
- from transformers import pipeline
 
 
 
3
  import gradio as gr
4
  import PyPDF2
5
  import os
6
  from huggingface_hub import login
 
7
 
8
- # Space configuration
9
  SPACE_DIR = os.environ.get("HF_HOME", os.getcwd())
 
 
 
 
10
 
 
11
  def init_huggingface_auth():
12
- """Space-friendly authentication"""
13
  token = os.getenv("HUGGINGFACE_TOKEN")
14
-
15
- if not token:
16
- print("No HF token found in environment")
17
- return False
18
-
19
- try:
20
- login(token=token, add_to_git_credential=False)
21
- print("HF authentication successful")
22
- return True
23
- except Exception as e:
24
- print(f"Login error: {e}")
25
- return False
26
 
27
  if not init_huggingface_auth():
28
- print("Warning: Authentication failed")
29
 
30
- # Load and preprocess the PDF content
31
- pdf_path = os.path.join(SPACE_DIR, "LTDOCS.pdf")
32
- with open(pdf_path, 'rb') as file:
33
- pdf_reader = PyPDF2.PdfReader(file)
34
- pdf_content = ' '.join([page.extract_text() for page in pdf_reader.pages])
35
- pdf_content = pdf_content.lower().strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
- # Initialize the pipeline
38
- pipe = pipeline(
39
- "text-generation",
40
- model="google/gemma-2-2b-jpn-it",
41
- model_kwargs={"torch_dtype": torch.bfloat16},
42
- device="cpu", # replace with "mps" to run on a Mac device
43
- )
44
 
45
- # System prompt and welcome message
46
- SYSTEM_PROMPT = f"""You Foton the chat bot assistant of the Company Lugha taussi, an AI language assistant specialized in African languages, with a focus on Swahili. Your primary tasks are:
47
- 1. Providing accurate translations between Swahili and other languages
48
- 2. Teaching Swahili vocabulary and grammar
49
- 3. Explaining cultural context behind Swahili expressions
50
- 4. Helping users practice Swahili conversation
51
- 5. Based on the programing doc for lughah Tausi Programing which is in swahili , the following information is relevant: {pdf_content} .assist users in programing and installing lugha tausi programing language"
52
 
53
- Always maintain a friendly and patient demeanor, and provide cultural context when relevant speak mostly swahili and change when asked.
54
- """
55
 
56
- WELCOME_MESSAGE = "**Karibu Lugha Tausi!** Mimi ni Foton, msaidizi wako wa kibinafsi wa Kiswahili. Niko hapa kukusaidia kujifunza, kuelewa, na kuzungumza Kiswahili. **Ninaweza kukusaidiaje leo?** Hebu tuanze! 😊"
 
57
 
58
- # CSS for custom styling
59
- CUSTOM_CSS = """
60
- .container {
61
- max-width: 800px;
62
- margin: auto;
63
- padding: 20px;
64
- }
65
- .header {
66
- text-align: center;
67
- margin-bottom: 30px;
68
- }
69
- .icon {
70
- width: 80px;
71
- height: 80px;
72
- margin: 0 auto 15px;
73
- display: block;
74
- }
75
- .title {
76
- font-size: 2.5em;
77
- font-weight: bold;
78
- margin-bottom: 10px;
79
- }
80
- .description {
81
- font-size: 1.2em;
82
- color: #666;
83
- margin-bottom: 20px;
84
- }
85
- """
86
 
87
- def format_chat_message(messages, system_prompt=SYSTEM_PROMPT):
88
- """Format the chat messages with system prompt"""
89
- formatted_prompt = f"{system_prompt}\n\n"
 
 
90
 
91
- for message in messages:
92
- if isinstance(message, tuple):
93
- role, content = message
94
- if role == "user":
95
- formatted_prompt += f"User: {content}\nLugha Tausi: "
96
- elif role == "assistant":
97
- formatted_prompt += f"{content}\n"
98
 
99
- return formatted_prompt
100
-
101
- def chat_response(message, history):
102
- """Generate response for Gradio chat interface"""
103
- messages = []
104
- for user_msg, bot_msg in history:
105
- messages.append(("user", user_msg))
106
- messages.append(("assistant", bot_msg))
107
- messages.append(("user", message))
108
-
109
- formatted_input = format_chat_message(messages)
110
- outputs = pipe(
111
- formatted_input,
112
- return_full_text=False,
113
- max_new_tokens=256,
114
- temperature=0.1,
115
- top_p=0.9,
116
- do_sample=True
117
  )
118
- return outputs[0]["generated_text"].strip()
 
 
 
119
 
120
- # Create Gradio interface with custom theme and styling
121
- with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
122
- with gr.Column(elem_classes="container"):
123
- # Header section with icon, title and description
124
- with gr.Column(elem_classes="header"):
125
- gr.Image(
126
- os.path.join(SPACE_DIR, "foton.webp"),
127
- elem_classes="icon",
128
- show_label=False, # Removes the label
129
- interactive=False, # Disables interaction including download
130
- )
131
- gr.Markdown("# Foton - Lugha Tausi Assistant", elem_classes="title")
132
- gr.Markdown(
133
- """Your personal Swahili language assistant, powered by AI.
134
- Specialized in translations, teaching, and cultural context.
135
- Let's explore the beauty of Swahili together! 🌍✨""",
136
- elem_classes="description"
137
- )
138
-
139
- # Chat interface
140
- chatbot = gr.Chatbot(
141
- value=[(None, WELCOME_MESSAGE)],
142
- height=500,
143
- show_label=False,
144
- elem_classes="chatbox"
145
- )
146
-
147
- with gr.Row():
148
- msg = gr.Textbox(
149
- placeholder="Type your message here...",
150
- show_label=False,
151
- scale=9
152
  )
153
- clear = gr.Button("Clear Chat", scale=1)
154
-
155
- def user_input(message, history):
156
- return "", history + [(message, None)]
157
-
158
- def bot_response(history):
159
- if len(history) == 0:
160
- history.append((None, WELCOME_MESSAGE))
161
- return history
162
-
163
- user_message = history[-1][0]
164
- bot_message = chat_response(user_message, history[:-1])
165
- history[-1] = (user_message, bot_message)
166
- return history
167
-
168
- def clear_chat():
169
- return [], [(None, WELCOME_MESSAGE)]
170
-
171
- # Set up the message flow
172
- msg.submit(
173
- user_input,
174
- [msg, chatbot],
175
- [msg, chatbot],
176
- queue=False
177
- ).then(
178
- bot_response,
179
- chatbot,
180
- chatbot
181
  )
 
 
 
 
 
 
 
182
 
183
- clear.click(
184
- clear_chat,
185
- None,
186
- [chatbot],
187
- queue=False
188
- )
189
 
190
- # Launch the interface
191
  if __name__ == "__main__":
192
- demo.launch(share=True, ssr_mode=False)
 
1
  import torch
2
+ from transformers import pipeline, AutoTokenizer, AutoModel
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import FAISS
5
+ from langchain.embeddings import HuggingFaceEmbeddings
6
  import gradio as gr
7
  import PyPDF2
8
  import os
9
  from huggingface_hub import login
10
+ from typing import List, Tuple
11
 
12
+ # Configuration
13
  SPACE_DIR = os.environ.get("HF_HOME", os.getcwd())
14
+ PDF_PATH = os.path.join(SPACE_DIR, "LTDOCS.pdf")
15
+ EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
16
+ MODEL_NAME = "google/gemma-2-2b-jpn-it"
17
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
18
 
19
# Hugging Face authentication
def init_huggingface_auth():
    """Log in to the Hugging Face Hub with the token from the environment.

    The token is read from the ``HUGGINGFACE_TOKEN`` environment variable.

    Returns:
        bool: ``True`` when ``login`` completed without raising; ``False``
        when the variable is unset/empty or when ``login`` raised.
    """
    token = os.getenv("HUGGINGFACE_TOKEN")
    if not token:
        # Make the "no token" outcome explicit instead of falling through
        # silently, so callers get a real bool and the log says why.
        print("Aucun jeton HF trouvé dans l'environnement")
        return False

    try:
        # add_to_git_credential=False: do not store the token in git's
        # credential helper.
        login(token=token, add_to_git_credential=False)
    except Exception as e:
        # Start-up boundary: report the failure and let the app continue
        # unauthenticated rather than crash at import time.
        print(f"Erreur d'authentification: {e}")
        return False

    print("Authentification HF réussie")
    return True
 
 
 
 
30
 
31
  if not init_huggingface_auth():
32
+ print("Avertissement: Authentification échouée")
33
 
34
# PDF loading and chunking
def load_and_process_pdf() -> List[str]:
    """Read the PDF at ``PDF_PATH`` and split its text into overlapping chunks.

    The text of every page is extracted with PyPDF2 and joined with newlines,
    then split by a ``RecursiveCharacterTextSplitter`` (512-character chunks,
    128-character overlap, measured with ``len``).

    Returns:
        List[str]: the text chunks produced by the splitter.
    """
    with open(PDF_PATH, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file handle is still open.
        page_texts = [page.extract_text() for page in reader.pages]
        full_text = "\n".join(page_texts)

    # Separators are tried in order: paragraph, line, sentence punctuation,
    # clause punctuation, then single spaces.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=128,
        length_function=len,
        separators=["\n\n", "\n", ".", "!", "?", ";", ",", " "],
    )
    chunks = splitter.split_text(full_text)
    return chunks
47
+
48
# Model initialisation
def initialize_models():
    """Build the retrieval index and the text-generation pipeline.

    Steps, in order:
      1. create the ``HuggingFaceEmbeddings`` model named by ``EMBEDDING_MODEL``
         on ``DEVICE`` with normalised embeddings;
      2. chunk the PDF via ``load_and_process_pdf`` and index the chunks in a
         FAISS vector store;
      3. create a ``text-generation`` pipeline for ``MODEL_NAME`` on ``DEVICE``
         with bfloat16 weights.

    Returns:
        tuple: ``(vector_store, generator)``.
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL,
        model_kwargs={'device': DEVICE},
        encode_kwargs={'normalize_embeddings': True},
    )

    pdf_chunks = load_and_process_pdf()
    store = FAISS.from_texts(pdf_chunks, embedding_model)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    text_generator = pipeline(
        "text-generation",
        model=MODEL_NAME,
        tokenizer=tokenizer,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=DEVICE,
    )

    return store, text_generator
68
 
69
+ vector_store, generator = initialize_models()
 
 
 
 
 
 
70
 
71
+ # Prompt engineering
72
+ SYSTEM_PROMPT = """Vous êtes Foton, assistant virtuel expert en programmation Lugha Tausi.
73
+ Répondez en swahili sauf demande contraire. Basez-vous strictement sur la documentation fournie.
 
 
 
 
74
 
75
+ Documentation:
76
+ {context}
77
 
78
+ Question: {question}
79
+ Réponse:"""
80
 
81
+ WELCOME_MESSAGE = "**Karibu Lugha Tausi!** Mimi ni Foton, msaidizi wako wa kibinafsi. Niko hapa kukusaidia kwa masuala yoyote ya programu. **Ninaweza kukusaidiaje leo?**"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
# Retrieval-augmented answer generation
def rag_response(query: str, history: List[Tuple[str, str]] = None) -> str:
    """Answer ``query`` using the documentation chunks most similar to it.

    The three most similar chunks are fetched from ``vector_store``, inserted
    into ``SYSTEM_PROMPT`` together with the question, and sent to
    ``generator`` as a single user message.

    Args:
        query: the user's question.
        history: earlier ``(user, assistant)`` turns. Accepted for interface
            compatibility; it is not used when building the prompt. Defaults
            to ``None`` rather than a shared mutable ``[]``.

    Returns:
        str: the generated answer with surrounding whitespace stripped.
    """
    # Contextual retrieval
    docs = vector_store.similarity_search(query, k=3)
    context = "\n".join(d.page_content for d in docs)

    # Prompt construction: everything is sent in one "user" message.
    prompt = SYSTEM_PROMPT.format(context=context, question=query)
    messages = [{"role": "user", "content": prompt}]

    # Generation
    response = generator(
        messages,
        max_new_tokens=512,
        temperature=0.3,
        top_p=0.95,
        repetition_penalty=1.1,
        do_sample=True,
        num_return_sequences=1,
    )

    # Post-processing. With chat-format input the pipeline returns the whole
    # conversation as a list of {"role", "content"} dicts, the new assistant
    # turn being last; calling str.split on that list would raise.
    generated = response[0]['generated_text']
    if isinstance(generated, list):
        answer = generated[-1]["content"]
    else:
        # Plain-string output: keep only what follows the prompt's answer marker.
        answer = generated.split("Réponse:")[-1]
    return answer.strip()
106
 
107
# Gradio interface
# The Soft theme is applied through `theme=` alone; the previous
# `css=gr.themes.Soft()._get_theme_css()` called a private method to inject
# the same theme CSS a second time.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Foton - Msaidizi wa Lugha Tausi")

    with gr.Row():
        with gr.Column(scale=2):
            gr.Image("foton.webp", label="Foton", width=200)
        with gr.Column(scale=8):
            chatbot = gr.Chatbot(
                value=[(None, WELCOME_MESSAGE)],
                bubble_full_width=False,
                height=600
            )

    # NOTE(review): the nesting level of the textbox and button was lost in
    # the diff rendering; they are placed directly under the Blocks root here.
    msg = gr.Textbox(
        placeholder="Andika ujumbe wako hapa...",
        label="Pitia swali lako",
        container=False
    )

    clear = gr.Button("Safisha Mazungumzo")

    def respond(message, chat_history):
        """Append the answer to ``message`` to the chat and clear the textbox.

        Returns a tuple ``("", updated_history)`` matching the
        ``[msg, chatbot]`` outputs.
        """
        # Blank submissions would otherwise trigger a full retrieval and
        # generation round for nothing.
        if not message or not message.strip():
            return "", chat_history
        # The clear button sets the chatbot value to None; presumably that can
        # reach this handler as None, so normalise to a list before appending.
        chat_history = list(chat_history or [])
        response = rag_response(message)
        chat_history.append((message, response))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)