sugiv committed
Commit 3f58090 · 1 Parent(s): 9c3d676

Leetmonkey In Action via Inference

Files changed (1): app.py (+11, -8)
app.py CHANGED
@@ -39,7 +39,6 @@ generation_kwargs = {
     "repeat_penalty": 1.1
 }
 
-@spaces.GPU
 def download_model(model_name: str) -> str:
     logger.info(f"Downloading model: {model_name}")
     model_path = hf_hub_download(
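The commit's pattern is consistent: all four per-function `@spaces.GPU` decorators are removed, and GPU setup is consolidated into a single decorated `main()` shown in the final hunk. For orientation, here is a minimal sketch of what the full `download_model` helper plausibly looks like, assuming the weights are fetched with `hf_hub_download`; the `REPO_ID` value is a hypothetical placeholder, not taken from the diff:

```python
import logging

from huggingface_hub import hf_hub_download

logger = logging.getLogger(__name__)

# Hypothetical location of the quantized weights; not shown in the diff.
REPO_ID = "sugiv/leetmonkey-model"

def download_model(model_name: str) -> str:
    logger.info(f"Downloading model: {model_name}")
    model_path = hf_hub_download(
        repo_id=REPO_ID,      # assumed repository id
        filename=model_name,  # e.g. a .gguf quantization
    )
    return model_path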
@@ -136,7 +135,6 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
     raise HTTPException(status_code=401, detail="Invalid token")
 
 @app.post("/generate_solution")
-@spaces.GPU
 async def generate_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     logger.info("Generating solution")
     generated_output = generate_solution(request.instruction)
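The `verify_token` dependency visible in this hunk's header guards both endpoints. It appears to follow FastAPI's stock `HTTPBearer` pattern; a hedged sketch is below, where everything beyond the signature and the 401 line shown in the diff is an assumption (in particular the `API_TOKEN` environment variable name):

```python
import os

from fastapi import Depends, HTTPException
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer

security = HTTPBearer()

def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> bool:
    # Reject any request whose bearer token does not match the server secret
    # ("API_TOKEN" is an assumed environment variable name).
    if credentials.credentials != os.environ.get("API_TOKEN"):
        raise HTTPException(status_code=401, detail="Invalid token")
    return True
```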
@@ -145,7 +143,6 @@ async def generate_solution_api(request: ProblemRequest, authorized: bool = Depe
     return {"solution": formatted_code}
 
 @app.post("/stream_solution")
-@spaces.GPU
 async def stream_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
     async def generate():
         logger.info("Streaming solution")
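The streaming endpoint wraps an inner `async def generate()`. A sketch of how that generator is plausibly fed into a `StreamingResponse`, reusing the module's `app`, `llm`, `generation_kwargs`, and `ProblemRequest`; the prompt handling and chunk format are assumptions based on llama-cpp-python's streaming API:

```python
from fastapi.responses import StreamingResponse

@app.post("/stream_solution")
async def stream_solution_api(request: ProblemRequest, authorized: bool = Depends(verify_token)):
    async def generate():
        logger.info("Streaming solution")
        # llama-cpp-python yields dict-shaped chunks when called with stream=True;
        # prompt construction from request.instruction is simplified here.
        for chunk in llm(request.instruction, stream=True, **generation_kwargs):
            yield chunk["choices"][0]["text"]
    return StreamingResponse(generate(), media_type="text/plain")
```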
@@ -177,7 +174,6 @@ Here's the complete Python function implementation:
 
 # Gradio wrapper for FastAPI
 def gradio_wrapper(app):
-    @spaces.GPU
     def inference(instruction, token):
         import requests
         url = "http://localhost:8000/generate_solution"
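The Gradio layer does not call the model directly: `inference` forwards the instruction to the local FastAPI endpoint with the user-supplied bearer token. A sketch of its likely body, extrapolated from the three context lines above; the payload key mirrors the assumed `ProblemRequest` shape and the response key mirrors the `{"solution": ...}` return value in the earlier hunk:

```python
def inference(instruction, token):
    import requests  # imported locally, as in the original

    url = "http://localhost:8000/generate_solution"
    headers = {"Authorization": f"Bearer {token}"}  # token from the Gradio UI
    payload = {"instruction": instruction}          # assumed ProblemRequest shape
    response = requests.post(url, json=payload, headers=headers)
    response.raise_for_status()
    return response.json()["solution"]              # key returned by the endpoint
```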
@@ -200,17 +196,24 @@ def gradio_wrapper(app):
     )
     return iface
 
-if __name__ == "__main__":
-    import uvicorn
-    from threading import Thread
-
+@spaces.GPU
+def main():
     # Verify GPU availability
     zero = torch.Tensor().cuda()
     print(f"GPU availability: {zero.device}")
 
+    # Download and load the model
+    model_path = download_model(MODEL_NAME)
+    global llm
+    llm = load_model(model_path)
+    logger.info("8-bit model loaded successfully")
+
     # Start FastAPI in a separate thread
     Thread(target=lambda: uvicorn.run(app, host="0.0.0.0", port=8000)).start()
 
     # Launch Gradio interface
     iface = gradio_wrapper(app)
     iface.launch(share=True)
+
+if __name__ == "__main__":
+    main()
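With the decorators consolidated, `main()` is the only `@spaces.GPU` entry point: it checks CUDA, loads the model into the module-level `llm`, then starts uvicorn in a thread and launches Gradio, presumably so the GPU context is set up once at startup rather than per request. Note that the old local `import uvicorn` / `from threading import Thread` lines are deleted, so those imports must now live at module level. For completeness, a sketch of a `load_model` companion consistent with the "8-bit model loaded successfully" log line; the `Llama` parameters shown are illustrative assumptions:

```python
from llama_cpp import Llama

def load_model(model_path: str) -> Llama:
    # The log line suggests an 8-bit quantization (e.g. a Q8_0 GGUF);
    # n_ctx and n_threads are illustrative values, not from the diff.
    return Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=4,
    )
```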