import os
import numpy as np
import argparse
import gradio as gr
import filelock
import glob
import json
import time
import anyio
import types
import sys
from typing import (
    Any,
    AsyncGenerator,
    Callable,
    Dict,
    Iterator,
    List,
    Literal,
    Optional,
    Tuple,
    Union,
    cast,
)

from gradio.routes import Request
from gradio.utils import SyncToAsyncIterator, async_iteration
from gradio.helpers import special_args
from gradio.components import Button
from gradio.events import Dependency, EventListenerMethod
from gradio_client.documentation import document, set_documentation_group

from tqdm.auto import tqdm
from huggingface_hub import snapshot_download

from .base_engine import BaseEngine

# ! Remember to use static cache

from ..configs import (
    MODEL_PATH,
    DEFAULT_CHAT_TEMPLATE,
    N_CTX,
    N_GPU_LAYERS,
    IMAGE_TOKEN,
    IMAGE_TOKEN_INTERACTIVE,
    IMAGE_TOKEN_LENGTH,
    MAX_PACHES,
)

from .llama_cpp_engine import (
    encode_tokenize,
    LlamaCppEngine,
)



# resource: https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models

import base64

def image_to_base64_data_uri(file_path):
    with open(file_path, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
        return f"data:image/png;base64,{base64_data}"


# file_path = 'file_path.png'
# data_uri = image_to_base64_data_uri(file_path)


# messages = [
#     {"role": "system", "content": "You are an assistant who perfectly describes images."},
#     {
#         "role": "user",
#         "content": [
#             {"type": "image_url", "image_url": {"url": data_uri }},
#             {"type" : "text", "text": "Describe this image in detail please."}
#         ]
#     }
# ]
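# A minimal sketch of how such a messages list could be fed to the high-level
# llama-cpp-python chat API (paths below are placeholders, not part of this repo):
#
#   from llama_cpp import Llama
#   from llama_cpp.llama_chat_format import Llava15ChatHandler
#
#   llm = Llama(
#       model_path="ggml-model-q4_k.gguf",
#       chat_handler=Llava15ChatHandler(clip_model_path="mmproj.bin"),
#       n_ctx=4096,
#       logits_all=True,  # required by the llava handler
#   )
#   out = llm.create_chat_completion(messages=messages)
#   print(out["choices"][0]["message"]["content"])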
    

def llava_15_chat_handler_call(
        self,
        *,
        llama: Any,
        # messages: List[Any],
        prompt: Union[str, List[int]],
        image_data_uris: Optional[List[Any]] = None,
        image_token: Optional[str] = None,
        functions: Optional[List[Any]] = None,
        function_call: Optional[Any] = None,
        tools: Optional[List[Any]] = None,
        tool_choice: Optional[Any] = None,
        temperature: float = 0.2,
        top_p: float = 0.95,
        top_k: int = 40,
        min_p: float = 0.05,
        typical_p: float = 1.0,
        stream: bool = False,
        stop: Optional[Union[str, List[str]]] = [],
        response_format: Optional[
            Any
        ] = None,
        max_tokens: Optional[int] = None,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        repeat_penalty: float = 1.1,
        tfs_z: float = 1.0,
        mirostat_mode: int = 0,
        mirostat_tau: float = 5.0,
        mirostat_eta: float = 0.1,
        model: Optional[str] = None,
        logits_processor: Optional[Any] = None,
        grammar: Optional[Any] = None,
        **kwargs,  # type: ignore
):
    from llama_cpp.llama_chat_format import (
        ctypes,
        suppress_stdout_stderr,
    )
    assert (
        llama.context_params.logits_all is True
    )  # BUG: logits_all=True is required for llava
    assert self.clip_ctx is not None
    # ! split prompt into different parts
    assert image_token is not None
    prompt_parts = prompt.split(image_token)
    # one text segment more than images: text_0 <image> text_1 ... <image> text_n
    assert len(prompt_parts) == len(image_data_uris) + 1, f'invalid {len(prompt_parts)=} != {len(image_data_uris)=}'
    llama.reset()
    prefix = prompt_parts[0]
    remaining_texts = prompt_parts[1:]
    llama.eval(llama.tokenize(prefix.encode("utf8"), add_bos=True))
    import array
    for image_uri, prompt_p in zip(image_data_uris, remaining_texts):
        image_bytes = self.load_image(image_uri)
        data_array = array.array("B", image_bytes)
        c_ubyte_ptr = (
            ctypes.c_ubyte * len(data_array)
        ).from_buffer(data_array)
        with suppress_stdout_stderr(disable=self.verbose):
            embed = (
                self._llava_cpp.llava_image_embed_make_with_bytes(
                    self.clip_ctx,
                    llama.context_params.n_threads,
                    c_ubyte_ptr,
                    len(image_bytes),
                )
            )
        try:
            n_past = ctypes.c_int(llama.n_tokens)
            n_past_p = ctypes.pointer(n_past)
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_eval_image_embed(
                    llama.ctx,
                    embed,
                    llama.n_batch,
                    n_past_p,
                )
            assert llama.n_ctx() >= n_past.value
            llama.n_tokens = n_past.value
        finally:
            with suppress_stdout_stderr(disable=self.verbose):
                self._llava_cpp.llava_image_embed_free(embed)

        llama.eval(llama.tokenize(prompt_p.encode("utf8"), add_bos=False))
    assert llama.n_ctx() >= llama.n_tokens

    prompt = llama.input_ids[: llama.n_tokens].tolist()
    # from llava-1.5
    return llama.create_completion(
        prompt=prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        stream=stream,
        stop=stop,
        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
        tfs_z=tfs_z,
        mirostat_mode=mirostat_mode,
        mirostat_tau=mirostat_tau,
        mirostat_eta=mirostat_eta,
        model=model,
        logits_processor=logits_processor,
        grammar=grammar,
    )
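
# The function above replaces Llava15ChatHandler.__call__ so that it takes a raw
# prompt string containing `image_token` placeholders instead of a chat
# `messages` list (see LlavaLlamaCppEngine.load_model below for the binding).
# A minimal, hypothetical sketch of a direct call, assuming `handler` is a
# Llava15ChatHandler patched this way and `llm` is the Llama model it serves
# (the prompt template is illustrative only):
#
#   stream = handler.__call__(
#       llama=llm,
#       prompt=f"USER: {IMAGE_TOKEN}\nDescribe this image. ASSISTANT:",
#       image_data_uris=[image_to_base64_data_uri("example.png")],
#       image_token=IMAGE_TOKEN,
#       max_tokens=256,
#       stream=True,
#   )
#   for chunk in stream:
#       print(chunk["choices"][0]["text"], end="", flush=True)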



class LlavaLlamaCppEngine(LlamaCppEngine):
    """
    Still in development; expect bugs.

    Known error (cause not yet identified):
    objc[61055]: Class GGMLMetalClass is implemented in both miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllama.dylib (0x12cb40290) and miniconda3/envs/native/lib/python3.12/site-packages/llama_cpp/libllava.dylib (0x12d9c8290). One of the two will be used. Which one is undefined.

    """
    @property
    def image_token(self):
        return IMAGE_TOKEN
    
    def get_multimodal_tokens(self, full_prompt, image_paths=None):
        num_tokens = len(self.tokenizer.encode(full_prompt))
        # each image contributes a fixed budget of patch tokens
        for _ in (image_paths or []):
            num_tokens += IMAGE_TOKEN_LENGTH * MAX_PACHES
        return num_tokens
    
    def load_model(self):
        from llama_cpp import Llama
        from llama_cpp.llama_chat_format import Llava15ChatHandler
        model_dir = os.path.dirname(MODEL_PATH)
        self.chat_handler = Llava15ChatHandler(clip_model_path=os.path.join(model_dir, "mmproj.bin"))

        # Bind the prompt-based handler as an instance-level __call__; it is
        # invoked explicitly via self.chat_handler.__call__ in generate_yield_string.
        self.chat_handler.__call__ = types.MethodType(llava_15_chat_handler_call, self.chat_handler)

        self.model_path = MODEL_PATH
        self._model = Llama(
            model_path=self.model_path,
            n_gpu_layers=N_GPU_LAYERS,  # number of layers to offload to the GPU
            # seed=1337, # Uncomment to set a specific seed
            chat_handler=self.chat_handler,
            n_ctx=N_CTX,  # context window size
            logits_all=True,  # required by the llava chat handler
        )
        self._tokenizer = self._model
        self._model.encode = types.MethodType(encode_tokenize, self._model)
        print(f'Load model: {self.model_path=} | {N_GPU_LAYERS=} | {N_CTX=}')
    
    def generate_yield_string(self, prompt, temperature, max_tokens, stop_strings: Optional[Tuple[str]] = None, **kwargs):
        image_paths = kwargs.get("image_paths", [])

        image_data_uris = [
            image_to_base64_data_uri(ip)
            for ip in image_paths
        ]
        
        stop_strings = list(stop_strings) if stop_strings is not None else []
        stop_strings = list(set(stop_strings + ["</s>", "<|im_end|>"]))
        # Call the patched handler through the instance-level __call__ bound in
        # load_model (a plain `self.chat_handler(...)` would dispatch to the
        # class-level Llava15ChatHandler.__call__ instead).
        generator = self.chat_handler.__call__(
            llama=self._model,
            prompt=prompt,
            image_data_uris=image_data_uris,
            image_token=self.image_token,
            max_tokens=max_tokens,  # None generates up to the end of the context window
            temperature=temperature,
            stop=stop_strings,  # built-in end-of-turn strings plus any caller-provided stops
            stream=True,
        )
        response = ""
        num_tokens = len(self.tokenizer.encode(prompt))
        for g in generator:
            response += g['choices'][0]['text']
            yield response, num_tokens

        if response is not None and len(response) > 0:
            num_tokens = len(self.tokenizer.encode(prompt + response))
            yield response, num_tokens
    

"""

export BACKEND=llama_cpp
export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/seallms/SeaLLMs/SeaLLM-7B-v2-gguf/seallm-v2.chatml.Q4_K_M.gguf
export N_CTX=4096
python app.py


export BACKEND=llava_llama_cpp
export MODEL_PATH=/Users/nguyenxuanphi/Desktop/projects/cache/llava/llava-1.5/ggml-model-q4_k.gguf
export N_CTX=4096
export IMAGE_TOKEN="<image>"
python app.py


"""