import os        # to check if the model file already exists
import sys       # to flush stdout
import markdown  # to render the answer as HTML
import gradio as gr
#import transformers
#from transformers import pipeline
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

model_repo = "TheBloke/Nous-Hermes-13B-GGML"
model_filename = "nous-hermes-13b.ggmlv3.q4_K_S.bin"
#model="TheBloke/Nous-Hermes-13B-GGML"
#model="https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q4_K_S.bin"


def download_model():
    # See https://github.com/OpenAccess-AI-Collective/ggml-webui/blob/main/tabbed.py
    # Path where hf_hub_download places this model snapshot in the Space's cache.
    file_path = "/home/user/.cache/huggingface/hub/models--TheBloke--Nous-Hermes-13B-GGML/snapshots/f1a48f90a07550e1ba30e347b2be69d4fa5e393b/nous-hermes-13b.ggmlv3.q4_K_S.bin"
    if os.path.exists(file_path):
        return file_path
    else:
        print("Downloading model...")
        sys.stdout.flush()
        file = hf_hub_download(repo_id=model_repo, filename=model_filename)
        print("Downloaded " + file)
        return file


def question_answer(context, question, max_tokens):
    mfile = download_model()

    # Structure the prompt to make it easier for the model to find the question.
    question1 = "\"\"\"\n" + question + "\n\"\"\"\n"
    text = context + "\n\nQuestion: " + question1 + "\nPlease use markdown formatting for answer. \nAnswer:\n"

    llm = Llama(model_path=mfile)
    output = llm(text, max_tokens=max_tokens, stop=["### Response"], echo=True)
    print(output)

    # With echo=True the completion starts with the prompt, so strip it off
    # and leave only the answer.
    answer = output['choices'][0]['text']
    answer = answer.replace(text, "", 1)

    # Render the markdown answer as HTML and return it along with the question.
    html_answer = markdown.markdown(answer)
    return question, html_answer

'''
Output is of the form:
{
  "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
  "object": "text_completion",
  "created": 1679561337,
  "model": "./models/7B/ggml-model.bin",
  "choices": [
    {
      "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.",
      "index": 0,
      "logprobs": None,
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 14,
    "completion_tokens": 28,
    "total_tokens": 42
  }
}
'''

# old transformers code
#generator = pipeline(model=model, device_map="auto")
#return generator(text)

app = gr.Interface(fn=question_answer,
                   inputs=["text", "text", gr.Slider(33, 2333)],
                   outputs=["textbox", "html"])
app.launch()
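
# Possible refinement (a sketch, not part of the original app): question_answer()
# constructs a new Llama instance, and therefore re-reads the 13B weights from
# disk, on every Gradio request. Caching the instance at module level would make
# repeat queries much faster. The helper name `get_llm` and the `_llm` global are
# introduced here for illustration only; they reuse the Llama and download_model
# calls defined above.
#
#_llm = None
#
#def get_llm():
#    global _llm
#    if _llm is None:
#        _llm = Llama(model_path=download_model())
#    return _llm
#
# Inside question_answer(), `llm = Llama(model_path=mfile)` would then become:
#    llm = get_llm()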