import os
import sys
import glob
import json
import time
import types
import argparse
import filelock
import anyio
import numpy as np
import gradio as gr
from typing import (
    Any, AsyncGenerator, Callable, Dict, Iterator, List, Literal,
    Optional, Tuple, Union, cast,
)
from gradio.routes import Request
from gradio.utils import SyncToAsyncIterator, async_iteration
from gradio.helpers import special_args
from gradio.components import Button
from gradio.events import Dependency, EventListenerMethod
from gradio_client.documentation import document, set_documentation_group
from tqdm.auto import tqdm
from huggingface_hub import snapshot_download
from .base_engine import BaseEngine
# ! Remember to use static cache
from ..configs import (
    MODEL_PATH,
    DEFAULT_CHAT_TEMPLATE,
    N_CTX,
    N_GPU_LAYERS,
    IMAGE_TOKEN,
    IMAGE_TOKEN_INTERACTIVE,
    IMAGE_TOKEN_LENGTH,
    MAX_PACHES,
)
from .llama_cpp_engine import (
    encode_tokenize,
    LlamaCppEngine,
)
# resource: https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models
import base64
def image_to_base64_data_uri(file_path):
    # Read an image file and return it as a base64-encoded data URI,
    # the format llama-cpp-python's llava handlers accept for images.
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
    return f"data:image/png;base64,{base64_data}"
# file_path = 'file_path.png'
# data_uri = image_to_base64_data_uri(file_path)
# data_uri = image_to_base64_data_uri(file_path)
# messages = [
# {"role": "system", "content": "You are an assistant who perfectly describes images."},
# {
# "role": "user",
# "content": [
# {"type": "image_url", "image_url": {"url": data_uri }},
# {"type" : "text", "text": "Describe this image in detail please."}
# ]
# }
# ]
def llava_15_chat_handler_call(
    self,
    *,
    llama: Any,
    # messages: List[Any],
    prompt: Union[str, List[int]],
    image_data_uris: Optional[List[Any]] = None,
    image_token: Optional[str] = None,
    functions: Optional[List[Any]] = None,
    function_call: Optional[Any] = None,
    tools: Optional[List[Any]] = None,
    tool_choice: Optional[Any] = None,
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
    min_p: float = 0.05,
    typical_p: float = 1.0,
    stream: bool = False,
    stop: Optional[Union[str, List[str]]] = [],
    response_format: Optional[Any] = None,
    max_tokens: Optional[int] = None,
    presence_penalty: float = 0.0,
    frequency_penalty: float = 0.0,
    repeat_penalty: float = 1.1,
    tfs_z: float = 1.0,
    mirostat_mode: int = 0,
    mirostat_tau: float = 5.0,
    mirostat_eta: float = 0.1,
    model: Optional[str] = None,
    logits_processor: Optional[Any] = None,
    grammar: Optional[Any] = None,
    **kwargs,  # type: ignore
):
    """
    Replacement for Llava15ChatHandler.__call__: takes an already-formatted prompt
    containing `image_token` placeholders plus a parallel list of image data URIs,
    interleaves text tokens with CLIP image embeddings in the llama context, and
    delegates generation to `llama.create_completion`.
    """
    from llama_cpp.llama_chat_format import (
        ctypes,
        suppress_stdout_stderr,
    )
    assert (
        llama.context_params.logits_all is True
    )  # BUG: logits_all=True is required for llava
    assert self.clip_ctx is not None
    # ! split the prompt on the image token; each gap is filled by one image
    assert image_token is not None
    prompt_parts = prompt.split(image_token)
    assert len(prompt_parts) == len(image_data_uris) + 1, f'invalid {len(prompt_parts)=} != {len(image_data_uris)=}'
    llama.reset()
    prefix = prompt_parts[0]
    remaining_texts = prompt_parts[1:]
    llama.reset()
    llama.eval(llama.tokenize(prefix.encode("utf8"), add_bos=True))
    for index, (image_uri, prompt_p) in enumerate(zip(image_data_uris, remaining_texts)):
        image_bytes = self.load_image(image_uri)
        import array
        data_array = array.array("B", image_bytes)
        c_ubyte_ptr = (
            ctypes.c_ubyte * len(data_array)
        ).from_buffer(data_array)
        with suppress_stdout_stderr(disable=self.verbose):
            embed = (
                self._llava_cpp.llava_image_embed_make_with_bytes(
                    self.clip_ctx,
                    llama.context_params.n_threads,
                    c_ubyte_ptr,
                    len(image_bytes),
                )
            )
        try:
            n_past = ctypes.c_int(llama.n_tokens)
            n_past_p = ctypes.pointer(n_past)
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_eval_image_embed(
                    llama.ctx,
                    embed,
                    llama.n_batch,
                    n_past_p,
                )
            assert llama.n_ctx() >= n_past.value
            llama.n_tokens = n_past.value
        finally:
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_image_embed_free(embed)
        llama.eval(llama.tokenize(prompt_p.encode("utf8"), add_bos=False))
    assert llama.n_ctx() >= llama.n_tokens
    prompt = llama.input_ids[: llama.n_tokens].tolist()
    # from llava-1.5
    return llama.create_completion(
        prompt=prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        stream=stream,
        stop=stop,
        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        grammar=grammar,
    )
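# A minimal usage sketch of the override above (not part of the original file); it
# mirrors how LlavaLlamaCppEngine.load_model wires things up further down. The
# model, clip, and image paths are illustrative placeholders only:
#
#   from llama_cpp import Llama
#   from llama_cpp.llama_chat_format import Llava15ChatHandler
#   handler = Llava15ChatHandler(clip_model_path="mmproj.bin")
#   handler.__call__ = types.MethodType(llava_15_chat_handler_call, handler)
#   llm = Llama(model_path="llava-q4.gguf", chat_handler=handler, n_ctx=N_CTX, logits_all=True)
#   out = handler.__call__(   # explicit .__call__ so the instance-level override is used
#       llama=llm,
#       prompt=f"Describe this image: {IMAGE_TOKEN}",
#       image_data_uris=[image_to_base64_data_uri("example.png")],
#       image_token=IMAGE_TOKEN,
#       max_tokens=128,
#       stream=False,
#   )
#   print(out["choices"][0]["text"])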
class LlavaLlamaCppEngine(LlamaCppEngine):
    """
    Still in development; expect bugs.
    Unexplained error seen on macOS (cause unknown):
    objc[61055]: Class GGMLMetalClass is implemented in both miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllama.dylib (0x12cb40290) and miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllava.dylib (0x12d9c8290). One of the two will be used. Which one is undefined.
    """

    @property
    def image_token(self):
        return IMAGE_TOKEN

    def get_multimodal_tokens(self, full_prompt, image_paths=None):
        image_paths = image_paths or []
        num_tokens = len(self.tokenizer.encode(full_prompt))
        for image_path in image_paths:
            num_tokens += IMAGE_TOKEN_LENGTH * MAX_PACHES
        return num_tokens
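    # Illustrative note (not from the original file): every image contributes a fixed
    # IMAGE_TOKEN_LENGTH * MAX_PACHES tokens on top of the text tokens; e.g. if the
    # configs set IMAGE_TOKEN_LENGTH=576 and MAX_PACHES=1, each image adds 576 tokens.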
    def load_model(self):
        # from transformers import AutoTokenizer, AutoModelForCausalLM
        from llama_cpp import Llama
        from llama_cpp.llama_chat_format import Llava15ChatHandler
        model_dir = os.path.dirname(MODEL_PATH)
        self.chat_handler = Llava15ChatHandler(clip_model_path=os.path.join(model_dir, "mmproj.bin"))
        # Bind the custom handler call above onto this instance so it accepts
        # prompt/image-URI inputs instead of the library's default messages format.
        self.chat_handler.__call__ = types.MethodType(llava_15_chat_handler_call, self.chat_handler)
        self.model_path = MODEL_PATH
        self._model = Llama(
            model_path=self.model_path,
            n_gpu_layers=N_GPU_LAYERS,  # number of layers to offload to the GPU
            # seed=1337,  # uncomment to set a specific seed
            chat_handler=self.chat_handler,
            n_ctx=N_CTX,  # context window size
            logits_all=True,  # needed to make llava work
        )
        self._tokenizer = self._model
        self._model.encode = types.MethodType(encode_tokenize, self._model)
        print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')
    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        image_paths = kwargs.get("image_paths", [])
        image_data_uris = [
            image_to_base64_data_uri(ip)
            for ip in image_paths
        ]
        stop_strings = list(stop_strings) if stop_strings is not None else []
        stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
        # generator = self._model(
        # Call the bound override explicitly: implicit `self.chat_handler(...)` would
        # resolve __call__ on the class and bypass the instance-level patch. The handler
        # also requires the Llama object as a keyword argument.
        generator = self.chat_handler.__call__(
            llama=self._model,
            prompt=prompt,
            image_data_uris=image_data_uris,
            image_token=self.image_token,
            max_tokens=max_tokens,  # set to None to generate up to the end of the context window
            temperature=temperature,
            stop=stop_strings,  # stop generating just before the model would generate a new question
            stream=True,
        )
        response = ""
        num_tokens = len(self.tokenizer.encode(prompt))
        for g in generator:
            response += g['choices'][0]['text']
            yield response, num_tokens
        if response is not None and len(response) > 0:
            num_tokens = len(self.tokenizer.encode(prompt + response))
            yield response, num_tokens
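    # Illustrative streaming usage (assumes an `engine` built and loaded elsewhere,
    # e.g. by the app's engine factory; the prompt template here is just an example):
    #   for partial_text, num_tokens in engine.generate_yield_string(
    #       prompt=f"USER: {IMAGE_TOKEN}\nDescribe the image.\nASSISTANT:",
    #       temperature=0.2,
    #       max_tokens=256,
    #       image_paths=["example.png"],
    #   ):
    #       print(partial_text)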
"""
export MODEL_PATH
BACKEND=llama_cpp
MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/seallms/SeaLLMs/SeaLLM-7B-v2-gguf/seallm-v2.chatml.Q4_K_M.gguf
N_CTX=4096
python app.py
export BACKEND=llava_llama_cpp
export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/llava/llava-1.5/ggml-model-q4_k.gguf
export N_CTX=4096
export IMAGE_TOKEN="<image>"
python app.py
""" |