Spaces:

sugiv
/

LeetMonkey_8Bit_GGUF_Stream_Tokens

Sleeping

App Files Files Community

sugiv committed on Sep 8, 2024

Commit

9c3d676

1 Parent(s): 5b97345

Leetmonkey In Action via Inference

Browse files

Files changed (1) hide show

app.py +209 -7

app.py CHANGED Viewed

@@ -1,14 +1,216 @@
 import gradio as gr
 import spaces
 import torch
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # <-- 'cpu' 🤔
 @spaces.GPU
-def greet(n):
-    print(zero.device) # <-- 'cuda:0' 🤗
-    return f"Hello {zero + n} Tensor"
-demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
-demo.launch()

+import os
+import re
+import logging
+import textwrap
+import autopep8
 import gradio as gr
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+import jwt
+from typing import Generator
+from fastapi import FastAPI, HTTPException, Depends
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from pydantic import BaseModel
 import spaces
 import torch
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# JWT settings
+# The signing secret comes from the environment. An unset or empty value
+# aborts the import with ValueError, so nothing below runs without a secret.
+JWT_SECRET = os.environ.get("JWT_SECRET")
+if not JWT_SECRET:
+    raise ValueError("JWT_SECRET environment variable is not set")
+JWT_ALGORITHM = "HS256"
+# Model settings
+# File name of the GGUF model and the Hub repository it is downloaded from.
+MODEL_NAME = "leetmonkey_peft__q8_0.gguf"
+REPO_ID = "sugiv/leetmonkey-peft-gguf"
+# Generation parameters
+# Keyword arguments passed unchanged to every llm(...) call in this file.
+generation_kwargs = {
+    "max_tokens": 2048,
+    # The prompts in this file end with an opening code fence and use the
+    # "### Instruction:" / "### Response:" headers; the same markers are
+    # listed here as stop strings.
+    "stop": ["```", "### Instruction:", "### Response:"],
+    "echo": False,
+    "temperature": 0.2,
+    "top_k": 50,
+    "top_p": 0.95,
+    "repeat_penalty": 1.1
+}
+# NOTE(review): this function only downloads a file; confirm whether the
+# @spaces.GPU decorator is actually needed here.
+@spaces.GPU
+def download_model(model_name: str) -> str:
+    """Fetch ``model_name`` from ``REPO_ID`` and return its local path.
+
+    The file is stored under ``./models``. The download is no longer forced:
+    a copy that is already cached is reused instead of being transferred
+    again on every start. The deprecated ``resume_download`` flag is dropped
+    as well.
+
+    Args:
+        model_name: File name of the model inside the Hub repository.
+
+    Returns:
+        The path returned by ``hf_hub_download``.
+    """
+    logger.info(f"Downloading model: {model_name}")
+    model_path = hf_hub_download(
+        repo_id=REPO_ID,
+        filename=model_name,
+        cache_dir="./models",
+    )
+    logger.info(f"Model downloaded: {model_path}")
+    return model_path
+# Download and load the 8-bit model at startup
+model_path = download_model(MODEL_NAME)
+@spaces.GPU
+def load_model(model_path):
+    """Build the ``Llama`` instance for the GGUF file at ``model_path``.
+
+    The context size, thread count and GPU offload setting are fixed here.
+    """
+    llama_options = {
+        "model_path": model_path,
+        "n_ctx": 2048,
+        "n_threads": 4,
+        "n_gpu_layers": -1,  # Use all available GPU layers
+        "verbose": False,
+    }
+    return Llama(**llama_options)
+# The single model instance used by every completion call in this module.
+llm = load_model(model_path)
+logger.info("8-bit model loaded successfully")
+@spaces.GPU
+def generate_solution(instruction: str) -> str:
+    """Run one non-streaming completion for ``instruction``.
+
+    The instruction is wrapped in the Instruction/Response prompt template,
+    sent to the module-level ``llm`` with ``generation_kwargs``, and the text
+    of the first returned choice is handed back unmodified.
+    """
+    system_prompt = "You are a Python coding assistant specialized in solving LeetCode problems. Provide only the complete implementation of the given function. Ensure proper indentation and formatting. Do not include any explanations or multiple solutions."
+    # Joined with newlines; the empty last entry leaves a trailing newline
+    # right after the opening code fence.
+    prompt_parts = [
+        "### Instruction:",
+        system_prompt,
+        "Implement the following function for the LeetCode problem:",
+        f"{instruction}",
+        "### Response:",
+        "Here's the complete Python function implementation:",
+        "```python",
+        "",
+    ]
+    full_prompt = "\n".join(prompt_parts)
+    completion = llm(full_prompt, **generation_kwargs)
+    first_choice = completion["choices"][0]
+    return first_choice["text"]
+def extract_and_format_code(text: str) -> str:
+    """Extract the generated function from ``text`` and normalise its layout.
+
+    Steps:
+      1. Use the contents of a ```python fenced block when one is present,
+         otherwise the whole text.
+      2. Drop everything before the first ``def name(``.
+      3. Re-indent the lines after the ``def`` line so the shallowest one
+         sits at four spaces. A body that arrived flush-left is indented;
+         a body that was already indented is not pushed any deeper.
+      4. Run the result through ``autopep8``; if that fails, log the error
+         and return the un-prettified code.
+
+    Text that contains no function definition is passed to step 4 unchanged.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The formatted source code as a string.
+    """
+    # Extract code between triple backticks
+    fenced = re.search(r'```python\s*(.*?)\s*```', text, re.DOTALL)
+    code = fenced.group(1) if fenced else text
+    # Remove any text before the function definition
+    code = re.sub(r'^.*?(?=def\s+\w+\s*\()', '', code, flags=re.DOTALL)
+    # Dedent the code to remove any common leading whitespace
+    code = textwrap.dedent(code)
+    lines = code.split('\n')
+    # Find the function definition line (None when there is no function)
+    func_def_index = next(
+        (i for i, line in enumerate(lines) if line.strip().startswith('def ')),
+        None,
+    )
+    if func_def_index is None:
+        # Nothing to anchor the indentation on; do not indent arbitrary text.
+        formatted_code = code
+    else:
+        body = lines[func_def_index + 1:]
+        if body:
+            # Strip the indentation the body already shares so that adding
+            # four spaces below never over-indents well-formed code.
+            body = textwrap.dedent('\n'.join(body)).split('\n')
+        indented_lines = [lines[func_def_index]]  # Keep the definition as is
+        indented_lines.extend(
+            '    ' + line if line.strip() else line  # Empty lines stay empty
+            for line in body
+        )
+        formatted_code = '\n'.join(indented_lines)
+    try:
+        return autopep8.fix_code(formatted_code)
+    except Exception:
+        # Formatting is best-effort: keep the fallback, but leave a trace
+        # instead of hiding the failure.
+        logging.getLogger(__name__).exception(
+            "autopep8 could not format the generated code; returning it unformatted"
+        )
+        return formatted_code
+# HTTP bearer scheme; verify_token depends on it to obtain the credentials.
+security = HTTPBearer()
+# FastAPI application on which the POST endpoints below are registered.
+app = FastAPI()
+class ProblemRequest(BaseModel):
+    """Request body accepted by both POST endpoints."""
+    # Problem text that is interpolated into the model prompt.
+    instruction: str
+def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """FastAPI dependency that validates the bearer token as a JWT.
+
+    The token is decoded with ``JWT_SECRET`` and ``JWT_ALGORITHM``.
+
+    Returns:
+        ``True`` when decoding succeeds.
+
+    Raises:
+        HTTPException: status 401 when decoding raises ``jwt.PyJWTError``.
+    """
+    bearer_token = credentials.credentials
+    try:
+        jwt.decode(bearer_token, JWT_SECRET, algorithms=[JWT_ALGORITHM])
+    except jwt.PyJWTError:
+        raise HTTPException(status_code=401, detail="Invalid token")
+    return True
+@app.post("/generate_solution")
 @spaces.GPU
+async def generate_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
+    """POST endpoint: generate a solution and return it as JSON.
+
+    The ``verify_token`` dependency must succeed before the body runs. The
+    raw completion is post-processed by ``extract_and_format_code`` and
+    returned under the ``"solution"`` key.
+    """
+    logger.info("Generating solution")
+    solution = extract_and_format_code(generate_solution(request.instruction))
+    logger.info("Solution generated successfully")
+    return {"solution": solution}
+@app.post("/stream_solution")
+@spaces.GPU
+async def stream_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
+    """POST endpoint: stream a generated solution as plain text.
+
+    The ``verify_token`` dependency must succeed before the body runs. The
+    response first carries each generated text fragment as it is produced,
+    then the complete output once more after it has been passed through
+    ``extract_and_format_code``.
+
+    Returns:
+        A ``StreamingResponse`` wrapping the generator, so the framework
+        streams the fragments instead of trying to serialise a generator
+        object.
+    """
+    # Imported locally so this endpoint does not rely on the import block.
+    from fastapi.responses import StreamingResponse
+    async def generate():
+        logger.info("Streaming solution")
+        system_prompt = "You are a Python coding assistant specialized in solving LeetCode problems. Provide only the complete implementation of the given function. Ensure proper indentation and formatting. Do not include any explanations or multiple solutions."
+        full_prompt = f"""### Instruction:
+{system_prompt}
+Implement the following function for the LeetCode problem:
+{request.instruction}
+### Response:
+Here's the complete Python function implementation:
+```python
+"""
+        pieces = []
+        # NOTE(review): this loop calls the shared llm synchronously inside
+        # an async generator - confirm whether concurrent requests need to
+        # be serialised.
+        for chunk in llm(full_prompt, stream=True, **generation_kwargs):
+            # "choices" is a list; the text lives on its first element.
+            token = chunk["choices"][0]["text"]
+            pieces.append(token)
+            yield token
+        formatted_code = extract_and_format_code("".join(pieces))
+        logger.info("Solution generated successfully")
+        yield formatted_code
+    return StreamingResponse(generate(), media_type="text/plain")
+# Gradio wrapper for FastAPI
+def gradio_wrapper(app):
+    """Build the Gradio interface that calls the local HTTP API.
+
+    The form takes a problem instruction and a JWT, posts them to
+    ``/generate_solution`` on ``localhost:8000`` and shows either the
+    returned solution or an error string. The ``app`` argument is accepted
+    but not used by this function.
+    """
+    @spaces.GPU
+    def inference(instruction, token):
+        import requests
+        reply = requests.post(
+            "http://localhost:8000/generate_solution",
+            json={"instruction": instruction},
+            headers={"Authorization": f"Bearer {token}"},
+        )
+        # Anything other than 200 is reported back as text.
+        if reply.status_code != 200:
+            return f"Error: {reply.status_code}, {reply.text}"
+        return reply.json()["solution"]
+    input_boxes = [
+        gr.Textbox(label="LeetCode Problem Instruction"),
+        gr.Textbox(label="JWT Token"),
+    ]
+    return gr.Interface(
+        fn=inference,
+        inputs=input_boxes,
+        outputs=gr.Code(label="Generated Solution"),
+        title="LeetCode Problem Solver API",
+        description="Enter a LeetCode problem instruction and your JWT token to generate a solution.",
+    )
+if __name__ == "__main__":
+    # Script entry point: start the HTTP API in a background thread, then
+    # run the Gradio UI in the main thread.
+    from threading import Thread
+    import uvicorn
+    # Verify GPU availability
+    zero = torch.Tensor().cuda()
+    print(f"GPU availability: {zero.device}")
+    # Start FastAPI in a separate thread
+    def _serve_api():
+        uvicorn.run(app, host="0.0.0.0", port=8000)
+    api_thread = Thread(target=_serve_api)
+    api_thread.start()
+    # Launch Gradio interface
+    iface = gradio_wrapper(app)
+    iface.launch(share=True)