Leetmonkey In Action via Inference
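This change removes the per-function `@spaces.GPU` decorators (on `download_model`, the `/generate_solution` and `/stream_solution` endpoints, and the Gradio `inference` helper) and consolidates the GPU-touching startup work into a single `@spaces.GPU`-decorated `main()` entry point, which verifies GPU availability, downloads and loads the model, starts FastAPI on a background thread, and launches the Gradio interface.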
app.py CHANGED
@@ -39,7 +39,6 @@ generation_kwargs = {
     "repeat_penalty": 1.1
 }
 
-@spaces.GPU
 def download_model(model_name: str) -> str:
     logger.info(f"Downloading model: {model_name}")
     model_path = hf_hub_download(
@@ -136,7 +135,6 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
         raise HTTPException(status_code=401, detail="Invalid token")
 
 @app.post("/generate_solution")
-@spaces.GPU
 async def generate_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     logger.info("Generating solution")
     generated_output = generate_solution(request.instruction)
@@ -145,7 +143,6 @@ async def generate_solution_api(request: ProblemRequest, authorized: bool = Depe
     return {"solution": formatted_code}
 
 @app.post("/stream_solution")
-@spaces.GPU
 async def stream_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     async def generate():
         logger.info("Streaming solution")
@@ -177,7 +174,6 @@ Here's the complete Python function implementation:
 
 # Gradio wrapper for FastAPI
 def gradio_wrapper(app):
-    @spaces.GPU
     def inference(instruction, token):
         import requests
         url = "http://localhost:8000/generate_solution"
@@ -200,17 +196,24 @@ def gradio_wrapper(app):
     )
     return iface
 
-
-
-from threading import Thread
-
+@spaces.GPU
+def main():
     # Verify GPU availability
     zero = torch.Tensor().cuda()
     print(f"GPU availability: {zero.device}")
 
+    # Download and load the model
+    model_path = download_model(MODEL_NAME)
+    global llm
+    llm = load_model(model_path)
+    logger.info("8-bit model loaded successfully")
+
     # Start FastAPI in a separate thread
     Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=8000)).start()
 
     # Launch Gradio interface
     iface = gradio_wrapper(app)
     iface.launch(share=True)
+
+if __name__ == "__main__":
+    main()