from flask import Flask, request, jsonify
from llama_cpp import Llama

# Initialize the Flask app and load the GGUF model from the Hugging Face Hub
app = Flask(__name__)
llm = Llama.from_pretrained(
    repo_id="bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF",
    filename="DeepSeek-R1-Distill-Llama-8B-Q4_K_L.gguf",
)

@app.route("/v1/completions", methods=["POST"])
def generate():
    data = request.json
    prompt = data.get("prompt", "")
    max_tokens = data.get("max_tokens", 50)
    temperature = data.get("temperature", 1.0)

    # Wrap the raw prompt in a single user message and run chat completion
    response = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        max_tokens=max_tokens,
    )

    # Reshape the chat-completion result into an OpenAI-style
    # text_completion response and return it as JSON
    choice = response["choices"][0]
    return jsonify({
        "id": response["id"],
        "object": "text_completion",
        "created": response["created"],
        "model": "DeepSeek-R1-Distill-Llama-8B-Q4_K_L.gguf",
        "choices": [{
            "text": choice["message"]["content"],
            "index": 0,
            "logprobs": None,
            "finish_reason": choice["finish_reason"],
        }],
    })

if __name__ == "__main__":
    # Start the development server (Flask's default port is 5000)
    app.run(host="0.0.0.0", port=5000)
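
# ---------------------------------------------------------------------------
# Usage sketch (not part of the server): querying the endpoint with `requests`.
# This assumes the server above is running on Flask's default host/port
# (http://127.0.0.1:5000); adjust the URL if you pass different arguments
# to app.run().
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:5000/v1/completions",
#       json={
#           "prompt": "Why is the sky blue?",
#           "max_tokens": 128,
#           "temperature": 0.7,
#       },
#   )
#   print(resp.json()["choices"][0]["text"])
# ---------------------------------------------------------------------------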