Commit 469f650 by Luke Stanley: Avoid unneeded imports, make serverless output more sensible, removing some debugging and comments
from os import environ as env
from os import system as run
from subprocess import check_output

import gradio as gr


def inference_binary_check():
    # Without a GPU, we need to re-install llama-cpp-python to avoid an error.
    # We use a shell command to detect if we have an NVIDIA GPU available:
    use_gpu = True
    try:
        command = "nvidia-debugdump --list|grep Device"
        output = str(check_output(command, shell=True).decode())
        if "NVIDIA" in output and "ID" in output:
            print("NVIDIA GPU detected.")
    except Exception as e:
        print("No NVIDIA GPU detected, using CPU. GPU check result:", e)
        use_gpu = False

    if use_gpu:
        print("GPU detected, existing GPU focused llama-cpp-python should work.")
    else:
        print("Avoiding error by re-installing non-GPU llama-cpp-python build because no GPU was detected.")
        run('pip uninstall llama-cpp-python -y')
        run('pip install git+https://github.com/lukestanley/llama-cpp-python.git@expose_json_grammar_convert_function --upgrade --no-cache-dir --force-reinstall')
        print("llama-cpp-python re-installed, will now attempt to load.")
LLM_WORKER = env.get("LLM_WORKER", "runpod")

if LLM_WORKER == "http" or LLM_WORKER == "in_memory":
    inference_binary_check()

# Now chill can import llama-cpp-python without an error:
from chill import improvement_loop


def chill_out(text):
    print("Got this input:", text)
    return str(improvement_loop(text))


demo = gr.Interface(fn=chill_out, inputs="text", outputs="text")
demo.launch(max_threads=1, share=True)
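
Once the demo is running, the same text-in, text-out endpoint can also be driven programmatically. A minimal sketch using the gradio_client package is below; the local URL, the default /predict endpoint name, and the sample input are assumptions for illustration, not part of this file:

from gradio_client import Client

# Point at the running demo; share=True also prints a public *.gradio.live URL.
client = Client("http://127.0.0.1:7860")  # assumed local address
result = client.predict("That take is complete nonsense!", api_name="/predict")
print(result)  # the calmer rewrite returned by improvement_loop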