## Due to a small bug when installing exllamav2 from dev branch directly we require CUDA paths
import cuda_bug
cuda_bug.install_cuda_toolkit_requirements()
##

# NOTE(review): the import order below is kept exactly as in the original
# script (only exact duplicates were dropped); presumably the CUDA setup above
# has to run before exllamav2 is imported -- confirm before regrouping.
import gradio as gr
from gradio.data_classes import FileData
from huggingface_hub import snapshot_download
from pathlib import Path
import base64
import spaces
import os
import sys
import torch
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2VisionTower,
)
from exllamav2.generator import (
    ExLlamaV2DynamicGenerator,
    ExLlamaV2Sampler,
)
from PIL import Image
import requests
from tqdm import tqdm

# Defaults used when the UI hands back an empty/falsy value.
default_max_context = 16384
default_max_output = 512
default_bpw = "4.0bpw"

# Each entry is a revision of the turboderp/pixtral-12b-exl2 repository.
available_models = [
    "2.5bpw",
    "3.0bpw",
    "3.5bpw",
    "4.0bpw",
    "4.5bpw",
    "5.0bpw",
    "6.0bpw",
    "8.0bpw",
]

# Maps quant name -> local snapshot directory. All revisions are fetched at
# import time so that run_inference only has to look the path up.
dirs = {}
for bpw in tqdm(available_models):
    dirs[bpw] = snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=bpw)


def _image_alias(index):
    """Return the placeholder text standing in for image number *index* (1-based)."""
    return "{{IMAGE_" + str(index) + "}}"


def _embed_images(vision_model, model, tokenizer, paths, already_embedded):
    """Embed the images at *paths* and return ``(embeddings, placeholder_text)``.

    Aliases continue the numbering after the *already_embedded* images that
    were produced earlier in the conversation, so every alias stays unique
    within one prompt. ``placeholder_text`` is the concatenation of the
    aliases, in order, ready to be spliced into the prompt.
    """
    embeddings = []
    aliases = []
    for offset, path in enumerate(paths, start=1):
        alias = _image_alias(already_embedded + offset)
        # Close the file handle as soon as the embedding has been computed.
        # Assumes get_image_embeddings reads the pixel data before returning
        # -- TODO confirm against the exllamav2 version in use.
        with Image.open(path) as img:
            embeddings.append(
                vision_model.get_image_embeddings(
                    model=model,
                    tokenizer=tokenizer,
                    image=img,
                    text_alias=alias,
                )
            )
        aliases.append(alias)
    return embeddings, "".join(aliases)


@spaces.GPU(duration=45)
def run_inference(message, history, model_picked, context_size, max_output):
    """Answer one chat turn with the selected Pixtral EXL2 quant.

    Args:
        message: The new user turn. Either a dict with ``"text"`` and
            ``"files"`` keys (file entries may be paths or dicts carrying a
            ``"path"`` key) or a plain string.
        history: Earlier turns as ``(user, reply)`` pairs. A pair whose first
            element is a tuple/list is treated as a batch of image paths that
            belongs to the next text turn.
        model_picked: Key into ``dirs``; falls back to ``default_bpw``.
        context_size: Maximum sequence length; falls back to
            ``default_max_context``.
        max_output: Maximum number of new tokens; falls back to
            ``default_max_output``.

    Returns:
        The generated text following the last ``[/INST]`` marker.

    Raises:
        KeyError: If *model_picked* is not one of the downloaded quants.
    """
    if not model_picked:
        model_picked = default_bpw
    if not context_size:
        context_size = default_max_context
    if not max_output:
        max_output = default_max_output

    # NOTE(review): cast defensively in case the UI delivers numbers as floats.
    context_size = int(context_size)
    max_output = int(max_output)

    local_dir = dirs[model_picked]

    # Loading only once GPU available
    config = ExLlamaV2Config(local_dir)
    config.max_seq_len = context_size

    vision_model = ExLlamaV2VisionTower(config)
    vision_model.load(progress=True)

    model = ExLlamaV2(config)
    cache = ExLlamaV2Cache(model, lazy=True, max_seq_len=context_size)
    model.load_autosplit(cache, progress=True)
    tokenizer = ExLlamaV2Tokenizer(config)

    generator = ExLlamaV2DynamicGenerator(
        model=model,
        cache=cache,
        tokenizer=tokenizer,
    )

    # Making Prompt Template
    prompt = ""
    image_prompt = ""  # placeholders waiting to be attached to the next text turn
    images_embeddings = []

    for couple in history or []:
        user_part, reply = couple[0], couple[1]
        if isinstance(user_part, (tuple, list)):
            new_embeddings, image_prompt = _embed_images(
                vision_model, model, tokenizer, user_part, len(images_embeddings)
            )
            images_embeddings += new_embeddings
        elif user_part:
            prompt += "[INST]" + image_prompt + user_part + "[/INST]"
            # A missing reply (None) must not break prompt construction.
            # NOTE(review): the original appended an empty string after the
            # reply, which looks like a lost end-of-turn marker -- confirm.
            prompt += reply or ""
            # The placeholders have been consumed; without this reset the same
            # images would be re-inserted into every later text turn.
            image_prompt = ""

    if isinstance(message, dict):
        file_paths = [
            entry["path"] if isinstance(entry, dict) else entry
            for entry in (message.get("files") or [])
        ]
        new_embeddings, image_prompt = _embed_images(
            vision_model, model, tokenizer, file_paths, len(images_embeddings)
        )
        images_embeddings += new_embeddings
        prompt += "[INST]" + image_prompt + (message.get("text") or "") + "[/INST]"
    else:
        prompt += "[INST]" + image_prompt + message + "[/INST]"

    print(prompt)

    # Generating Response
    output = generator.generate(
        prompt=prompt,
        max_new_tokens=max_output,
        add_bos=True,
        encode_special_tokens=True,
        decode_special_tokens=True,
        stop_conditions=[tokenizer.eos_token_id],
        gen_settings=ExLlamaV2Sampler.Settings.greedy(),
        embeddings=images_embeddings,
    )

    # generate() hands back prompt + completion; keep only the text after the
    # final instruction marker.
    result = output.split("[/INST]")[-1]
    print(result)
    return result


# Markdown shown above the chat window. The text is assembled from adjacent
# literals purely to keep source lines short; the resulting string is unchanged.
description = (
    "A demo chat interface with Pixtral 12B EXL2 Quants, deployed using **ExllamaV2**! "
    "The model will be loaded once the GPU is available. "
    "This space specifically will load by default Pixtral at 4bpw from the following repository: "
    "[turboderp/pixtral-12b-exl2](https://huggingface.co/turboderp/pixtral-12b-exl2). "
    "Other quantization options are available. "
    "The current version of ExllamaV2 running is the dev branch, not the master branch: "
    "[ExllamaV2](https://github.com/turboderp/exllamav2/tree/dev). "
    "The model at **4bpw and 16k context size fits in less than 12GB of VRAM**, "
    "and at **2.5bpw and short context can potentially fit in 8GB of VRAM**! \n"
    "The current default settings are: "
    "- Model Quant: 4.0bpw "
    "- Context Size: 16k tokens "
    "- Max Output: 512 tokens "
    "You can select other quants and experiment! "
    "Thanks, turboderp!"
)

examples = [
    [
        {
            "text": "What are the similarities and differences between these two experiments?",
            "files": ["test_image_1.jpg", "test_image_2.jpg"],
        },
    ]
]

# Extra controls; their values arrive in run_inference after (message, history)
# in the order they are listed in additional_inputs.
drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
context_size_gradio = gr.Slider(
    minimum=256,
    maximum=32768,
    label="Context Size",
    value=default_max_context,
    step=1,
)
output_length_gradio = gr.Slider(
    minimum=1,
    maximum=4096,
    label="Max Ouput Length",
    value=default_max_output,
    step=1,
)

demo = gr.ChatInterface(
    fn=run_inference,
    examples=examples,
    title="Pixtral 12B EXL2",
    multimodal=True,
    description=description,
    additional_inputs=[drop, context_size_gradio, output_length_gradio],
)
demo.queue().launch()