from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import prompt_style
import os
import time
from huggingface_hub import hf_hub_download
# from llama_cpp import Llama

# Alternative GGUF / llama.cpp loading path, kept for reference:
# model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF"
# filename = "Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf"
# model_path = hf_hub_download(repo_id=model_id, filename=filename, token=os.environ['HF_TOKEN'])
# model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096, verbose=False)
# model = Llama.from_pretrained(repo_id=model_id, filename=filename, n_gpu_layers=-1,
#                               token=os.environ['HF_TOKEN'], n_ctx=4096, verbose=False)

from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3"

# Load the model in 8-bit via bitsandbytes, using FlashAttention 2.
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    token=os.environ['HF_TOKEN'],
    attn_implementation="flash_attention_2",
)

# Tokenizer for chat templating and decoding.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])


class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.8
    max_new_tokens: int = 1024
    top_p: float = 0.95
    # NOTE: repetition_penalty and seed are accepted but not currently
    # forwarded to the transformers generate() call below.
    repetition_penalty: float = 1.0
    seed: int = 42


app = FastAPI()


def format_prompt(item: Item):
    # The system prompt is taken from prompt_style.data; item.system_prompt is
    # currently unused. History is a list of (user, assistant) pairs.
    messages = [
        {"role": "system", "content": prompt_style.data},
    ]
    for it in item.history:
        messages.append({"role": "user", "content": it[0]})
        messages.append({"role": "assistant", "content": it[1]})
    messages.append({"role": "user", "content": item.prompt})
    return messages


def generate(item: Item):
    formatted_prompt = format_prompt(item)

    # llama.cpp equivalent, kept for reference:
    # output = model.create_chat_completion(messages=formatted_prompt, seed=item.seed,
    #                                       temperature=item.temperature, max_tokens=item.max_new_tokens)
    # return output['choices'][0]['message']['content']

    input_ids = tokenizer.apply_chat_template(
        formatted_prompt,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # Llama 3 ends assistant turns with <|eot_id|>, so stop on either that or
    # the regular EOS token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    outputs = model_8bit.generate(
        input_ids,
        max_new_tokens=item.max_new_tokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=item.temperature,
        top_p=item.top_p,
    )

    # Drop the prompt tokens and decode only the newly generated text.
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

    # Plain (non-chat) generation path, kept for reference:
    # inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    # generated_ids = model.generate(**inputs)
    # outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


@app.post("/generate/")
async def generate_text(item: Item):
    t1 = time.time()
    ans = generate(item)
    print(ans)
    print(f"time: {time.time() - t1}")
    return {"response": ans}


@app.get("/")
def read_root():
    return {"Hello": "World"}
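

# Entrypoint sketch: the module imports uvicorn but does not start the server
# itself, so it is presumably launched externally (e.g. `uvicorn main:app`).
# The block below is an optional alternative; the host and port here are
# assumptions, not taken from the original code.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request against the running server (assumes localhost:8000):
#   curl -X POST http://localhost:8000/generate/ \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Hello", "history": [], "system_prompt": ""}'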