Switch to serverless worker by default (PR #2 from lukestanley/serverless_json_llm)
Files changed:
- .gitignore (+1 -0)
- app.py (+32 -22)
- chill.py (+7 -0)
- docker-compose.yml (+11 -0)
- runpod.dockerfile (+31 -0)
- runpod_handler.py (+41 -0)
- serverless.md (+64 -0)
- serverless_local_test.py (+28 -0)
- utils.py (+50 -12)
.gitignore CHANGED
@@ -1 +1,2 @@
 .aider*
+.cache
app.py CHANGED
@@ -1,34 +1,44 @@
+from os import environ as env
 from os import system as run
 from subprocess import check_output
 
 import gradio as gr
 
-… (19 removed lines not shown in the diff view)
+
+def inference_binary_check():
+    # Without a GPU, we need to re-install llama-cpp-python to avoid an error.
+    # We use a shell command to detect if we have an NVIDIA GPU available:
+    use_gpu = True
+    try:
+        command = "nvidia-debugdump --list|grep Device"
+        output = str(check_output(command, shell=True).decode())
+        if "NVIDIA" in output and "ID" in output:
+            print("NVIDIA GPU detected.")
+    except Exception as e:
+        print("No NVIDIA GPU detected, using CPU. GPU check result:", e)
+        use_gpu = False
+
+    if use_gpu:
+        print("GPU detected, existing GPU focused llama-cpp-python should work.")
+    else:
+        print("Avoiding error by re-installing non-GPU llama-cpp-python build because no GPU was detected.")
+        run('pip uninstall llama-cpp-python -y')
+        run('pip install git+https://github.com/lukestanley/llama-cpp-python.git@expose_json_grammar_convert_function --upgrade --no-cache-dir --force-reinstall')
+        print("llama-cpp-python re-installed, will now attempt to load.")
+
+
+LLM_WORKER = env.get("LLM_WORKER", "runpod")
+
+if LLM_WORKER == "http" or LLM_WORKER == "in_memory":
+    inference_binary_check()
 
 # Now chill can import llama-cpp-python without an error:
 from chill import improvement_loop
 
 
-def …
+def chill_out(text):
+    print("Got this input:", text)
     return str(improvement_loop(text))
 
-demo = gr.Interface(fn=…
-demo.launch(max_threads=1)
+demo = gr.Interface(fn=chill_out, inputs="text", outputs="text")
+demo.launch(max_threads=1, share=True)
chill.py CHANGED
@@ -114,6 +114,13 @@ def print_iteration_result(iteration, overall_score, time_used):
 def improvement_loop(input_text):
     global original_text
     global last_edit
+    global suggestions
+    global start_time
+    global max_iterations
+    suggestions = []
+    last_edit = ""
+    start_time = time.time()
+    max_iterations = 20
     original_text = input_text
 
     for iteration in range(1, max_iterations + 1):
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
+version: '3.8'
+services:
+  runpod:
+    build:
+      context: .
+      dockerfile: runpod.dockerfile
+    volumes:
+      - ./.cache:/runpod-volume/.cache
+      - ./test.sh:/test.sh
+    command: /test.sh
+    entrypoint: /usr/bin/python3
runpod.dockerfile ADDED
@@ -0,0 +1,31 @@
+# Base image -> https://github.com/runpod/containers/blob/main/official-templates/base/Dockerfile
+# DockerHub -> https://hub.docker.com/r/runpod/base/tags
+FROM runpod/base:0.4.0-cuda11.8.0
+
+# Base image sets HuggingFace cache directory to use Runpod's shared cache for efficiency:
+ENV HF_HOME="/runpod-volume/.cache/huggingface/"
+# Also pre-downloading models may speed up start times while
+# increasing image size, but could be worth it for some use cases.
+
+RUN python3.11 -m pip install --upgrade pip && \
+    python3.11 -m pip install runpod==1.6.0
+
+RUN python3.11 -m pip install pytest cmake \
+    scikit-build setuptools pydantic-settings \
+    huggingface_hub hf_transfer \
+    pydantic pydantic_settings \
+    llama-cpp-python
+
+# Install llama-cpp-python (build with cuda)
+ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+RUN python3.11 -m pip install git+https://github.com/lukestanley/llama-cpp-python.git@expose_json_grammar_convert_function --upgrade --no-cache-dir --force-reinstall
+
+ADD runpod_handler.py .
+ADD chill.py .
+ADD utils.py .
+ADD promptObjects.py .
+
+ENV REPO_ID="TheBloke/phi-2-GGUF"
+ENV MODEL_FILE="phi-2.Q2_K.gguf"
+CMD python3.11 -u /runpod_handler.py
+
runpod_handler.py ADDED
@@ -0,0 +1,41 @@
+import runpod
+from os import environ as env
+import json
+from pydantic import BaseModel, Field
+class Movie(BaseModel):
+    title: str = Field(..., title="The title of the movie")
+    year: int = Field(..., title="The year the movie was released")
+    director: str = Field(..., title="The director of the movie")
+    genre: str = Field(..., title="The genre of the movie")
+    plot: str = Field(..., title="Plot summary of the movie")
+
+def pydantic_model_to_json_schema(pydantic_model_class):
+    schema = pydantic_model_class.model_json_schema()
+
+    # Optional example field from schema, is not needed for the grammar generation
+    if "example" in schema:
+        del schema["example"]
+
+    json_schema = json.dumps(schema)
+    return json_schema
+default_schema_example = """{ "title": ..., "year": ..., "director": ..., "genre": ..., "plot":...}"""
+default_schema = pydantic_model_to_json_schema(Movie)
+default_prompt = f"Instruct: \nOutput a JSON object in this format: {default_schema_example} for the following movie: The Matrix\nOutput:\n"
+from utils import llm_stream_sans_network_simple
+def handler(job):
+    """ Handler function that will be used to process jobs. """
+    job_input = job['input']
+    filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf")
+    prompt = job_input.get('prompt', default_prompt)
+    schema = job_input.get('schema', default_schema)
+    print("got this input", str(job_input))
+    print("prompt", prompt)
+    print("schema", schema)
+    output = llm_stream_sans_network_simple(prompt, schema)
+    #print("got this output", str(output))
+    return output
+
+runpod.serverless.start({
+    "handler": handler,
+    #"return_aggregate_stream": True
+})
serverless.md ADDED

Fast serverless GPU inference with RunPod
==============================

This partly GPT-4 generated document explains the integration of RunPod with Docker: testing the RunPod Dockerfile with Docker Compose, building and pushing the image to Docker Hub, and how `app.py` makes use of it. I skimmed it and added to it, as a note to myself and others.

# Motivation
Fast inference is useful. Usually an existing hosted provider would be good for this, but since we need to translate some spicy text input, I was worried it could get flagged and result in accounts being blocked.
I also needed something fast that could produce JSON-typed output matching particular schemas. So I found RunPod's "serverless" GPU service.
It can be used by chill.py and app.py as one of the worker options.


## Testing with Docker Compose

To test the RunPod Dockerfile, you can use Docker Compose, which simplifies running multi-container Docker applications. Here's how to test it:

1. Ensure you have Docker and Docker Compose installed on your system.
2. Navigate to the directory containing the `docker-compose.yml` file.
3. Run the following command to build and start the container:
```
docker-compose up --build
```
4. The above command builds the image as defined in `runpod.dockerfile` and starts a container with the configuration specified in `docker-compose.yml`. It automatically runs a test that matches the format expected by the llm_stream_serverless client (in utils.py), though without the network layer in play (see the payload sketch below).
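For reference, here is a minimal sketch of the job payload that the test exercises. It is a trimmed-down version of `serverless_local_test.py` (only two schema fields kept for brevity), matching what `handler()` in `runpod_handler.py` reads:

```python
import json

# The handler in runpod_handler.py reads job["input"]["prompt"] and
# job["input"]["schema"] (a JSON schema serialised to a string).
schema = {
    "properties": {
        "title": {"title": "The title of the movie", "type": "string"},
        "year": {"title": "The year the movie was released", "type": "integer"},
    },
    "required": ["title", "year"],
    "title": "Movie",
    "type": "object",
}
job = {
    "input": {
        "schema": json.dumps(schema),
        "prompt": "Instruct: Output a JSON object in this format: "
                  '{ "title": ..., "year": ... } for the following movie: Toy Story\nOutput:\n',
    }
}
print(json.dumps(job))  # this string is what gets passed via --test_input
```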
# Direct testing with Docker, without Docker Compose:

Something like this worked for me:

```sudo docker run --gpus all -it -v "$(pwd)/.cache:/runpod-volume/.cache/huggingface/" lukestanley/test:translate2 bash```
Note the cache mount. This saves re-downloading the LLMs!


## Building and Pushing to Docker Hub

After testing and ensuring that everything works as expected, you can build the Docker image and push it to Docker Hub for deployment. Here are the steps:

1. Log in to Docker Hub from your command line using `docker login --username [yourusername]`.
2. Build the Docker image with a tag:
```
docker build -t yourusername/yourimagename:tag -f runpod.dockerfile .
```
3. Once the image is built, push it to Docker Hub:
```
docker push yourusername/yourimagename:tag
```
4. Replace `yourusername`, `yourimagename`, and `tag` with your Docker Hub username, the name you want to give to your image, and the tag, respectively.

# RunPod provisioning:
You'll need an account on RunPod with credit.
You'll need to set up a serverless GPU endpoint using your Docker image here:
https://www.runpod.io/console/serverless
It has a Flashboot feature that seems like Firecracker with GPU support; it might be using Cloud Hypervisor under the hood, since Firecracker currently has no GPU support. Fly.io has something similar, also with Cloud Hypervisor.
You'll need the API secret saved somewhere securely; it will likely end up as a securely treated env var used by app.py later.
You'll also need the endpoint ID. The sketch below shows how both values get used.
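As a quick smoke test of a provisioned endpoint, something like the following should work. It is a minimal sketch that mirrors `llm_stream_serverless` in `utils.py`, reading the endpoint ID and API secret from the same environment variable names utils.py uses; the prompt text is just an example:

```python
from os import environ as env

import requests

# Mirrors llm_stream_serverless() in utils.py: one synchronous "runsync" call.
endpoint_id = env["RUNPOD_ENDPOINT_ID"]  # copied from the RunPod console
api_key = env["RUNPOD_API_KEY"]          # the endpoint's API secret, keep it safe
url = f"https://api.runpod.ai/v2/{endpoint_id}/runsync"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

# A "schema" string may also be passed in "input"; runpod_handler.py falls back
# to its built-in Movie schema and prompt when they are omitted.
payload = {"input": {"prompt": "Instruct: Output a JSON object in this format: "
                               '{ "title": ..., "year": ... } for the following movie: '
                               "Toy Story\nOutput:\n"}}

response = requests.post(url, json=payload, headers=headers)
print(response.json().get("output"))
```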
## RunPod Integration in `app.py`

The `app.py` file is a Gradio interface that makes use of the RunPod integration to perform inference. It checks for the presence of a GPU and installs the appropriate build of `llama-cpp-python`. Depending on the `LLM_WORKER` environment variable, it uses the RunPod serverless API, an HTTP server, or a model loaded into memory for inference.

The `chill_out` function in `app.py` calls `improvement_loop` from the `chill` module, which, depending on that environment variable, uses the RunPod worker to process the input text and generate improved text from the model's output.

The Gradio interface is then launched with `demo.launch()`, making the application accessible via a web interface, which can be shared publicly.

Note: Ensure that the necessary environment variables such as `LLM_WORKER`, `REPO_ID`, and `MODEL_FILE` are set correctly for the integration to work properly, as sketched below.
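A minimal configuration sketch, set before launching `app.py`; the variable names come from `app.py`, `utils.py` and `runpod.dockerfile`, while the values here are placeholders or assumptions:

```python
import os

# Worker selection; "runpod" is the default in app.py and utils.py.
os.environ["LLM_WORKER"] = "runpod"  # or "http" / "in_memory"

# Only needed for the "runpod" worker (read by llm_stream_serverless in utils.py):
os.environ["RUNPOD_ENDPOINT_ID"] = "<your-endpoint-id>"
os.environ["RUNPOD_API_KEY"] = "<your-api-key>"

# Used by utils.py's hf_hub_download for local inference ("http" / "in_memory");
# the RunPod worker image bakes its own values into runpod.dockerfile.
os.environ["REPO_ID"] = "TheBloke/phi-2-GGUF"
os.environ["MODEL_FILE"] = "phi-2.Q2_K.gguf"
```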
serverless_local_test.py ADDED
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+import os, json
+
+# Define your JSON and prompt as Python dictionaries and strings
+schema = {
+    "properties": {
+        "title": {"title": "The title of the movie", "type": "string"},
+        "year": {"title": "The year the movie was released", "type": "integer"},
+        "director": {"title": "The director of the movie", "type": "string"},
+        "genre": {"title": "The genre of the movie", "type": "string"},
+        "plot": {"title": "Plot summary of the movie", "type": "string"}
+    },
+    "required": ["title", "year", "director", "genre", "plot"],
+    "title": "Movie",
+    "type": "object"
+}
+
+movie = "Toy Story"
+prompt = "Instruct: Output a JSON object in this format: { \"title\": ..., \"year\": ..., \"director\": ..., \"genre\": ..., \"plot\":...} for the following movie: "+movie+"\nOutput:\n"
+
+# Construct the JSON input string
+json_input = json.dumps({"input": {"schema": json.dumps(schema), "prompt": prompt}})
+print(json_input)
+# Define the command to execute your Python script with the JSON string
+command = f'python3.11 runpod_handler.py --test_input \'{json_input}\''
+
+# Execute the command
+os.system(command)
utils.py CHANGED
@@ -1,12 +1,12 @@
 import json
 from os import environ as env
 from typing import Any, Dict, Union
-import requests
 
+import requests
 from huggingface_hub import hf_hub_download
-from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
 
+# There are 3 ways to use the LLM model currently used:
 # 1. Use the HTTP server (USE_HTTP_SERVER=True), this is good for development
 # when you want to change the logic of the translator without restarting the server.
 # 2. Load the model into memory
@@ -15,28 +15,38 @@ from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 # to the OpenAI API but adds a unique "grammar" parameter.
 # The real OpenAI API has other ways to set the output format.
 # It's possible to switch to another LLM API by changing the llm_streaming function.
+# 3. Use the RunPod API, which is a paid service with severless GPU functions.
+# See serverless.md for more information.
 
 URL = "http://localhost:5834/v1/chat/completions"
 in_memory_llm = None
+worker_options = ["runpod", "http", "in_memory"]
 
-…
-…
+LLM_WORKER = env.get("LLM_WORKER", "runpod")
+if LLM_WORKER not in worker_options:
+    raise ValueError(f"Invalid worker: {LLM_WORKER}")
+N_GPU_LAYERS = int(env.get("N_GPU_LAYERS", -1)) # Default to -1, use all layers if available
+CONTEXT_SIZE = int(env.get("CONTEXT_SIZE", 2048))
 LLM_MODEL_PATH = env.get("LLM_MODEL_PATH", None)
-…
+
 MAX_TOKENS = int(env.get("MAX_TOKENS", 1000))
 TEMPERATURE = float(env.get("TEMPERATURE", 0.3))
 
+performing_local_inference = (LLM_WORKER == "in_memory" or LLM_WORKER == "http")
+
 if LLM_MODEL_PATH and len(LLM_MODEL_PATH) > 0:
     print(f"Using local model from {LLM_MODEL_PATH}")
-…
+if performing_local_inference and not LLM_MODEL_PATH:
     print("No local LLM_MODEL_PATH environment variable set. We need a model, downloading model from HuggingFace Hub")
     LLM_MODEL_PATH = hf_hub_download(
         repo_id=env.get("REPO_ID", "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"),
         filename=env.get("MODEL_FILE", "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"),
     )
     print(f"Model downloaded to {LLM_MODEL_PATH}")
+if LLM_WORKER == "http" or LLM_WORKER == "in_memory":
+    from llama_cpp import Llama, LlamaGrammar, json_schema_to_gbnf
 
-if in_memory_llm is None and …
+if in_memory_llm is None and LLM_WORKER == "in_memory":
     print("Loading model into memory. If you didn't want this, set the USE_HTTP_SERVER environment variable to 'true'.")
     in_memory_llm = Llama(model_path=LLM_MODEL_PATH, n_ctx=CONTEXT_SIZE, n_gpu_layers=N_GPU_LAYERS, verbose=True)
 
@@ -141,9 +151,37 @@ def llm_stream_sans_network(
     json_output = json.loads(output_text)
     return json_output
 
-…
+
+def llm_stream_serverless(prompt,model):
+    RUNPOD_ENDPOINT_ID = env.get("RUNPOD_ENDPOINT_ID")
+    RUNPOD_API_KEY = env.get("RUNPOD_API_KEY")
+    url = f"https://api.runpod.ai/v2/{RUNPOD_ENDPOINT_ID}/runsync"
+
+    headers = {
+        'Content-Type': 'application/json',
+        'Authorization': f'Bearer {RUNPOD_API_KEY}'
+    }
+
+    schema = model.schema()
+    data = {
+        'input': {
+            'schema': json.dumps(schema),
+            'prompt': prompt
+        }
+    }
+
+    response = requests.post(url, json=data, headers=headers)
+    result = response.json()
+    print(result)
+    output = result['output'].replace("model:mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf\n", "")
+    # TODO: remove replacement once new version of runpod is deployed
+    return json.loads(output)
+
+def query_ai_prompt(prompt, replacements, model_class):
     prompt = replace_text(prompt, replacements)
-    if …
-        return …
-…
+    if LLM_WORKER == "runpod":
+        return llm_stream_serverless(prompt, model_class)
+    if LLM_WORKER == "http":
         return llm_streaming(prompt, model_class)
+    if LLM_WORKER == "in_memory":
+        return llm_stream_sans_network(prompt, model_class)