sugiv committed
Commit e1b0723 · Parent: 5b484f5

Leetmonkey In Action via Inference

Files changed (1): app.py (+20, -9)
app.py CHANGED
@@ -11,6 +11,7 @@ from typing import Generator
 from fastapi import FastAPI, HTTPException, Depends
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel
+import spaces
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -37,6 +38,7 @@ generation_kwargs = {
     "repeat_penalty": 1.1
 }
 
+@spaces.GPU
 def download_model(model_name: str) -> str:
     logger.info(f"Downloading model: {model_name}")
     model_path = hf_hub_download(
@@ -51,15 +53,21 @@ def download_model(model_name: str) -> str:
 
 # Download and load the 8-bit model at startup
 model_path = download_model(MODEL_NAME)
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,
-    n_threads=4,
-    n_gpu_layers=-1,  # Use all available GPU layers
-    verbose=False
-)
+
+@spaces.GPU
+def load_model(model_path):
+    return Llama(
+        model_path=model_path,
+        n_ctx=2048,
+        n_threads=4,
+        n_gpu_layers=-1,  # Use all available GPU layers
+        verbose=False
+    )
+
+llm = load_model(model_path)
 logger.info("8-bit model loaded successfully")
 
+@spaces.GPU
 def generate_solution(instruction: str) -> str:
     system_prompt = "You are a Python coding assistant specialized in solving LeetCode problems. Provide only the complete implementation of the given function. Ensure proper indentation and formatting. Do not include any explanations or multiple solutions."
     full_prompt = f"""### Instruction:
@@ -127,6 +135,7 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     raise HTTPException(status_code=401, detail="Invalid token")
 
 @app.post("/generate_solution")
+@spaces.GPU
 async def generate_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     logger.info("Generating solution")
     generated_output = generate_solution(request.instruction)
@@ -135,6 +144,7 @@ async def generate_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     return {"solution": formatted_code}
 
 @app.post("/stream_solution")
+@spaces.GPU
 async def stream_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     async def generate():
         logger.info("Streaming solution")
@@ -154,7 +164,7 @@ Here's the complete Python function implementation:
 
         generated_text = ""
         for chunk in llm(full_prompt, stream=True, **generation_kwargs):
-            token = chunk["choices"][0]["text"]
+            token = chunk["choices"][0]["text"]  # choices is a list in llama-cpp-python; index 0 carries the streamed text
             generated_text += token
             yield token
 
@@ -166,6 +176,7 @@ Here's the complete Python function implementation:
 
 # Gradio wrapper for FastAPI
 def gradio_wrapper(app):
+    @spaces.GPU
     def inference(instruction, token):
         import requests
         url = "http://localhost:8000/generate_solution"
@@ -197,4 +208,4 @@ if __name__ == "__main__":
 
     # Launch Gradio interface
     iface = gradio_wrapper(app)
-    iface.launch(share=True)
+    iface.launch(share=True)
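The substance of this commit is the @spaces.GPU decorator from the Hugging Face spaces package (ZeroGPU): the Space holds no GPU at rest, and a decorated function is granted one for the duration of each call. A minimal sketch of the pattern with a llama-cpp-python model; the GGUF path and prompt are placeholders, not taken from this repo:

import spaces
from llama_cpp import Llama

llm = None  # lazy init keeps import-time code CPU-only

@spaces.GPU  # a GPU is attached only while this function runs
def run_inference(prompt: str) -> str:
    global llm
    if llm is None:
        # Placeholder path; app.py fetches its GGUF via hf_hub_download.
        llm = Llama(model_path="model.Q8_0.gguf", n_gpu_layers=-1, verbose=False)
    out = llm(prompt, max_tokens=256)
    return out["choices"][0]["text"]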
 
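For reference, the /generate_solution endpoint decorated above is the same one the Gradio inference helper posts to. A minimal client sketch, assuming the app serves on localhost:8000 (the url hard-coded in the helper) and that <your-token> stands in for whatever verify_token accepts:

import requests

API_URL = "http://localhost:8000/generate_solution"  # host/port as in the Gradio helper
API_TOKEN = "<your-token>"  # placeholder; must satisfy verify_token

def request_solution(instruction: str) -> str:
    # ProblemRequest is the Pydantic body model; the endpoint returns {"solution": ...}
    resp = requests.post(
        API_URL,
        json={"instruction": instruction},
        headers={"Authorization": f"Bearer {API_TOKEN}"},
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["solution"]

print(request_solution("Implement def twoSum(nums, target) returning the indices of the two numbers that sum to target."))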
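Similarly, /stream_solution yields tokens from an inner async generator; assuming that generator is wrapped in a StreamingResponse (the wrapping sits outside the hunks shown here), a client can print tokens as they arrive. A sketch with the same placeholder host and token:

import requests

with requests.post(
    "http://localhost:8000/stream_solution",
    json={"instruction": "Implement def twoSum(nums, target)."},
    headers={"Authorization": "Bearer <your-token>"},
    stream=True,  # keep the connection open and read the body incrementally
    timeout=300,
) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)  # each chunk is one or more streamed tokens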