import os

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Model location is supplied via environment variables (HF Spaces-style
# config): the hub repo id and the GGML/GGUF model filename inside it.
model_repo = os.getenv('HF_MODEL_REPO')
model_bin = os.getenv('HF_MODEL_BIN')

# Fail fast with a clear message instead of a cryptic download/load error
# deep inside from_pretrained when the env vars are missing.
if not model_repo or not model_bin:
    raise RuntimeError("HF_MODEL_REPO and HF_MODEL_BIN environment variables must be set")

# CPU-only inference: 2 threads, AVX2 kernels, fixed seed, 16K context.
llm = AutoModelForCausalLM.from_pretrained(
    model_repo,
    model_file=model_bin,
    threads=2,
    seed=42,
    context_length=16384,
    lib="avx2",
)


def response(prompt):
    """Generate a text completion for *prompt* with the globally loaded model.

    Returns the generated string.

    NOTE(review): reset=False carries the model's KV-cache state across
    calls, so successive requests (from any user of the shared demo) extend
    one running context — confirm this is intentional.
    """
    txt = llm(
        prompt,
        max_new_tokens=8192,
        temperature=0.8,
        top_p=0.5,
        repetition_penalty=1.1,
        reset=False,
        # Bug fix: the original stop list was ["", "<|im_end|>"] — an empty
        # stop-string matches at once and truncates every generation. The
        # intended EOS marker was presumably "</s>" (likely stripped by an
        # HTML-unsafe copy/paste).
        stop=["</s>", "<|im_end|>"],
    )
    return txt


if __name__ == '__main__':
    title = "Chat"
    # (Removed unused `demo_status` local.)
    gr.Interface(
        response,
        inputs="text",
        outputs="text",
        title=title,
    ).launch()