sugiv committed on
Commit 3ec3dc0
1 Parent(s): 1238dd2

Leetmonkey In Action via Inference

Files changed (1)
  1. app.py +14 -65
app.py CHANGED
@@ -3,17 +3,14 @@ import re
 import logging
 import textwrap
 import autopep8
-import gradio as gr
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 import jwt
-from typing import Generator
+from typing import AsyncGenerator
 from fastapi import FastAPI, HTTPException, Depends
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel
-import spaces
-import torch
-from threading import Thread
+from fastapi.responses import StreamingResponse
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -54,21 +51,15 @@ def download_model(model_name: str) -> str:
 
 # Download and load the 8-bit model at startup
 model_path = download_model(MODEL_NAME)
-
-@spaces.GPU
-def load_model(model_path):
-    return Llama(
-        model_path=model_path,
-        n_ctx=2048,
-        n_threads=4,
-        n_gpu_layers=-1,  # Use all available GPU layers
-        verbose=False
-    )
-
-llm = load_model(model_path)
+llm = Llama(
+    model_path=model_path,
+    n_ctx=2048,
+    n_threads=4,
+    n_gpu_layers=-1,  # Use all available GPU layers
+    verbose=False
+)
 logger.info("8-bit model loaded successfully")
 
-@spaces.GPU
 def generate_solution(instruction: str) -> str:
     system_prompt = "You are a Python coding assistant specialized in solving LeetCode problems. Provide only the complete implementation of the given function. Ensure proper indentation and formatting. Do not include any explanations or multiple solutions."
     full_prompt = f"""### Instruction:
@@ -145,7 +136,7 @@ async def generate_solution_api(request: ProblemRequest, authorized: bool = Depe
 
 @app.post("/stream_solution")
 async def stream_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
-    async def generate():
+    async def generate() -> AsyncGenerator[str, None]:
         logger.info("Streaming solution")
         system_prompt = "You are a Python coding assistant specialized in solving LeetCode problems. Provide only the complete implementation of the given function. Ensure proper indentation and formatting. Do not include any explanations or multiple solutions."
         full_prompt = f"""### Instruction:
@@ -163,7 +154,7 @@ Here's the complete Python function implementation:
 
         generated_text = ""
         for chunk in llm(full_prompt, stream=True, **generation_kwargs):
-            token = chunk["choices"]["text"]
+            token = chunk["choices"][0]["text"]
            generated_text += token
            yield token
 
@@ -171,50 +162,8 @@ Here's the complete Python function implementation:
         logger.info("Solution generated successfully")
         yield formatted_code
 
-    return generate()
-
-# Gradio wrapper for FastAPI
-def gradio_wrapper(app):
-    def inference(instruction, token):
-        import requests
-        url = "http://localhost:8000/generate_solution"
-        headers = {"Authorization": f"Bearer {token}"}
-        response = requests.post(url, json={"instruction": instruction}, headers=headers)
-        if response.status_code == 200:
-            return response.json()["solution"]
-        else:
-            return f"Error: {response.status_code}, {response.text}"
-
-    iface = gr.Interface(
-        fn=inference,
-        inputs=[
-            gr.Textbox(label="LeetCode Problem Instruction"),
-            gr.Textbox(label="JWT Token")
-        ],
-        outputs=gr.Code(label="Generated Solution"),
-        title="LeetCode Problem Solver API",
-        description="Enter a LeetCode problem instruction and your JWT token to generate a solution."
-    )
-    return iface
-
-@spaces.GPU
-def main():
-    # Verify GPU availability
-    zero = torch.Tensor().cuda()
-    print(f"GPU availability: {zero.device}")
-
-    # Download and load the model
-    model_path = download_model(MODEL_NAME)
-    global llm
-    llm = load_model(model_path)
-    logger.info("8-bit model loaded successfully")
-
-    # Start FastAPI in a separate thread
-    Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=8000)).start()
-
-    # Launch Gradio interface
-    iface = gradio_wrapper(app)
-    iface.launch(share=True)
+    return StreamingResponse(generate(), media_type="text/plain")
 
 if __name__ == "__main__":
-    main()
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
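With the Gradio/ZeroGPU wrapper removed, the Space serves the two FastAPI endpoints directly over uvicorn. A minimal client sketch, not part of the commit: it assumes the server is reachable at localhost:8000 and that token holds a JWT accepted by verify_token, and it mirrors the request/response shape used by the deleted gradio_wrapper (json={"instruction": ...} in, "solution" key out).

import requests

BASE_URL = "http://localhost:8000"   # host/port used by uvicorn.run(...) above (assumption for local runs)
token = "YOUR_JWT"                   # assumption: a token that verify_token accepts
headers = {"Authorization": f"Bearer {token}"}
problem = "Implement def two_sum(nums, target) returning indices of the two numbers that sum to target."

# Blocking endpoint: one JSON response containing the whole solution.
resp = requests.post(f"{BASE_URL}/generate_solution", json={"instruction": problem}, headers=headers)
resp.raise_for_status()
print(resp.json()["solution"])

# Streaming endpoint: plain-text chunks from the StreamingResponse.
with requests.post(f"{BASE_URL}/stream_solution", json={"instruction": problem}, headers=headers, stream=True) as r:
    r.raise_for_status()
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)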
 
 
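For reference on the one-line streaming fix: a llama_cpp.Llama instance called with stream=True yields completion chunks whose "choices" value is a list of dicts, so the old chunk["choices"]["text"] raised TypeError (list indices must be integers). A standalone sketch of the corrected loop; "model.gguf" is a placeholder for the file that download_model(MODEL_NAME) fetches in app.py.

from llama_cpp import Llama

# Placeholder path; in app.py the model file comes from download_model(MODEL_NAME).
llm = Llama(model_path="model.gguf", n_ctx=2048, n_threads=4, verbose=False)

generated_text = ""
for chunk in llm("### Instruction:\nReverse a string in Python.\n\n### Response:", stream=True, max_tokens=64):
    token = chunk["choices"][0]["text"]   # each chunk: {"choices": [{"text": "...", ...}], ...}
    generated_text += token
print(generated_text)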