Spaces:
Sleeping
Sleeping
added option to disable streaming output
Browse files
app.py
CHANGED
@@ -76,7 +76,8 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
|
|
76 |
with gr.Accordion("Advanced Settings", open=False):
|
77 |
with gr.Row(equal_height = True):
|
78 |
llm_model = gr.Dropdown(choices = ['mistralai/Mixtral-8x7B-Instruct-v0.1','mistralai/Mistral-7B-Instruct-v0.2', 'None'], value = 'mistralai/Mistral-7B-Instruct-v0.2', label = 'LLM Model')
|
79 |
-
llm_results = gr.Slider(minimum=4, maximum=10, value=5, step=1, interactive=True, label="Top n results
|
|
|
80 |
|
81 |
output_text = gr.Textbox(show_label = True, container = True, label = 'LLM Answer', visible = True, placeholder = output_placeholder)
|
82 |
input = gr.Textbox(show_label = False, visible = False)
|
@@ -100,31 +101,35 @@ with gr.Blocks(theme = gr.themes.Soft()) as demo:
|
|
100 |
prompt = get_prompt_text(message, '\n\n'.join(rag_cleaner(out) for out in rag_out[:llm_results_use]))
|
101 |
return md_text_updated, prompt
|
102 |
|
103 |
-
def ask_llm(prompt, llm_model_picked = 'mistralai/Mistral-7B-Instruct-v0.2'):
|
104 |
model_disabled_text = "LLM Model is disabled"
|
105 |
output = ""
|
106 |
if llm_model_picked == 'None':
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
|
|
|
112 |
client = InferenceClient(llm_model_picked)
|
113 |
-
#output = client.text_generation(prompt, **generate_kwargs, stream=False, details=False, return_full_text=False)
|
114 |
try:
|
115 |
-
stream = client.text_generation(prompt, **generate_kwargs, stream=
|
116 |
except:
|
117 |
gr.Warning("LLM Inference rate limit reached, try again later!")
|
118 |
return ""
|
119 |
#output = output.lstrip(' \n') if output.lstrip().startswith('\n') else output
|
120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
|
122 |
-
for response in stream:
|
123 |
-
output += response.token.text
|
124 |
-
yield output
|
125 |
-
return output
|
126 |
-
#return gr.Textbox(output, visible = True)
|
127 |
|
128 |
-
msg.submit(update_with_rag_md, [msg, llm_results], [gr_md, input]).success(ask_llm, [input, llm_model], output_text)
|
129 |
|
130 |
-
demo.
|
|
|
76 |
with gr.Accordion("Advanced Settings", open=False):
|
77 |
with gr.Row(equal_height = True):
|
78 |
llm_model = gr.Dropdown(choices = ['mistralai/Mixtral-8x7B-Instruct-v0.1','mistralai/Mistral-7B-Instruct-v0.2', 'None'], value = 'mistralai/Mistral-7B-Instruct-v0.2', label = 'LLM Model')
|
79 |
+
llm_results = gr.Slider(minimum=4, maximum=10, value=5, step=1, interactive=True, label="Top n results as context")
|
80 |
+
stream_results = gr.Checkbox(value = True, label = "Stream output")
|
81 |
|
82 |
output_text = gr.Textbox(show_label = True, container = True, label = 'LLM Answer', visible = True, placeholder = output_placeholder)
|
83 |
input = gr.Textbox(show_label = False, visible = False)
|
|
|
101 |
prompt = get_prompt_text(message, '\n\n'.join(rag_cleaner(out) for out in rag_out[:llm_results_use]))
|
102 |
return md_text_updated, prompt
|
103 |
|
104 |
+
def ask_llm(prompt, llm_model_picked = 'mistralai/Mistral-7B-Instruct-v0.2', stream_outputs = False):
|
105 |
model_disabled_text = "LLM Model is disabled"
|
106 |
output = ""
|
107 |
if llm_model_picked == 'None':
|
108 |
+
if stream_outputs:
|
109 |
+
for out in model_disabled_text:
|
110 |
+
output += out
|
111 |
+
yield output
|
112 |
+
return output
|
113 |
+
else:
|
114 |
+
return model_disabled_text
|
115 |
+
|
116 |
client = InferenceClient(llm_model_picked)
|
|
|
117 |
try:
|
118 |
+
stream = client.text_generation(prompt, **generate_kwargs, stream=stream_outputs, details=False, return_full_text=False)
|
119 |
except:
|
120 |
gr.Warning("LLM Inference rate limit reached, try again later!")
|
121 |
return ""
|
122 |
#output = output.lstrip(' \n') if output.lstrip().startswith('\n') else output
|
123 |
|
124 |
+
if stream_outputs:
|
125 |
+
for response in stream:
|
126 |
+
output += response
|
127 |
+
yield output
|
128 |
+
return output
|
129 |
+
else:
|
130 |
+
return stream
|
131 |
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
+
msg.submit(update_with_rag_md, [msg, llm_results], [gr_md, input]).success(ask_llm, [input, llm_model, stream_results], output_text)
|
134 |
|
135 |
+
demo.queue(default_concurrency_limit=10).launch()
|