Update app.py
app.py
CHANGED
@@ -224,23 +224,23 @@ async def chat(query,history,sources,reports,subtype, client_ip=None, session_id
         async for update in process_stream():
             yield update
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    elif model_config.get('reader','TYPE') == 'DEDICATED':
+        chat_model = dedicated_endpoint()
+        async def process_stream():
+            # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
+            # instead of modifying the one from the outer scope.
+            nonlocal answer_yet  # Use the outer scope's answer_yet variable
+            # Iterate over the streaming response chunks
+            async for chunk in chat_model.astream(messages):
+                token = chunk.content
+                answer_yet += token
+                parsed_answer = parse_output_llm_with_sources(answer_yet)
+                history[-1] = (query, parsed_answer)
+                yield [tuple(x) for x in history], docs_html
+
+        # Stream the response updates
+        async for update in process_stream():
+            yield update
 
     else:
         chat_model = serverless_api() # TESTING: ADAPTED FOR HF INFERENCE API (needs to be reverted for production version)
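Note on the nonlocal usage introduced in this commit: the inline comment is accurate, and the behavior is easy to reproduce in isolation. Below is a minimal, self-contained sketch of the same nested async-generator pattern; mock_stream and the Chunk class are hypothetical stand-ins for chat_model.astream(messages) and its response chunks, and the outer coroutine prints updates rather than yielding them as chat() does.

import asyncio
from dataclasses import dataclass

@dataclass
class Chunk:
    content: str

async def mock_stream():
    # Hypothetical stand-in for chat_model.astream(messages): yields chunks.
    for token in ["Hel", "lo", ", wor", "ld!"]:
        await asyncio.sleep(0)  # simulate streaming latency
        yield Chunk(content=token)

async def chat(query="query"):
    answer_yet = ""
    history = [(query, "")]

    async def process_stream():
        # Without nonlocal, the augmented assignment below would make
        # answer_yet a *local* of process_stream() and raise
        # UnboundLocalError on its first use.
        nonlocal answer_yet
        async for chunk in mock_stream():
            answer_yet += chunk.content
            history[-1] = (query, answer_yet)
            yield [tuple(x) for x in history]

    async for update in process_stream():
        print(update)  # each update carries the partially streamed answer

asyncio.run(chat())

Removing the nonlocal line makes the first answer_yet += chunk.content fail with UnboundLocalError, because the augmented assignment causes Python to treat answer_yet as a local variable of process_stream() rather than the one defined in chat().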