Upload 2 files
- app.py +2 -2
- joycaption.py +21 -6
app.py
CHANGED

@@ -49,14 +49,14 @@ with gr.Blocks(fill_width=True, css=css, delete_cache=(60, 3600)) as demo:
             jc_temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.6, step=0.1, label="Temperature")
             jc_topp = gr.Slider(minimum=0, maximum=2.0, value=0.9, step=0.01, label="Top-P")
             jc_run_button = gr.Button("Caption", variant="primary")
-
         with gr.Column():
             jc_output_caption = gr.Textbox(label="Caption", show_copy_button=True)
             gr.Markdown(JC_DESC_MD, elem_classes="info")
     gr.LoginButton()
     gr.DuplicateButton(value="Duplicate Space for private use (This demo does not work on CPU. Requires GPU Space)")
 
-    jc_run_button.click(fn=stream_chat_mod, inputs=[jc_input_image, jc_caption_type, jc_caption_tone, jc_caption_length,
+    jc_run_button.click(fn=stream_chat_mod, inputs=[jc_input_image, jc_caption_type, jc_caption_tone, jc_caption_length,
+                        jc_tokens, jc_topp, jc_temperature, jc_text_model], outputs=[jc_output_caption])
     jc_text_model_button.click(change_text_model, [jc_text_model, jc_use_inference_client, jc_gguf, jc_nf4], [jc_text_model], show_api=False)
     #jc_text_model.change(get_repo_gguf, [jc_text_model], [jc_gguf], show_api=False)
     jc_use_inference_client.change(change_text_model, [jc_text_model, jc_use_inference_client], [jc_text_model], show_api=False)
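For reference, Gradio passes the `inputs` component list to `fn` positionally, so the rewired click handler maps `jc_tokens`, `jc_topp`, `jc_temperature`, and the new `jc_text_model` dropdown onto `stream_chat_mod`'s `max_new_tokens`, `top_p`, `temperature`, and `model_name` parameters. A minimal sketch of the equivalent direct call, with hypothetical argument values (the caption type/tone/length strings and the image path are placeholders, not necessarily the Space's exact choices):

```python
from PIL import Image
from joycaption import stream_chat_mod  # assumes joycaption.py is importable, as in this Space

image = Image.open("example.jpg")        # hypothetical input image
caption = stream_chat_mod(
    image,                               # jc_input_image
    "Descriptive",                       # jc_caption_type   (placeholder value)
    "formal",                            # jc_caption_tone   (placeholder value)
    "any",                               # jc_caption_length (placeholder value)
    300,                                 # jc_tokens       -> max_new_tokens
    0.9,                                 # jc_topp         -> top_p
    0.6,                                 # jc_temperature  -> temperature
    "bunnycore/LLama-3.1-8B-Matrix",     # jc_text_model   -> model_name (the new input)
)
# Assumes the selected text model has already been loaded by the Space.
print(caption)
```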
joycaption.py
CHANGED

@@ -30,9 +30,11 @@ BASE_DIR = Path(__file__).resolve().parent
 device = "cuda" if torch.cuda.is_available() else "cpu"
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 use_inference_client = False
+PIXTRAL_PATH = "mistral-community/pixtral-12b"
 
 llm_models = {
     "bunnycore/LLama-3.1-8B-Matrix": None,
+    #PIXTRAL_PATH: None,
     "Sao10K/Llama-3.1-8B-Stheno-v3.4": None,
     "unsloth/Meta-Llama-3.1-8B-bnb-4bit": None,
     "DevQuasar/HermesNova-Llama-3.1-8B": None,
@@ -123,7 +125,6 @@ class ImageAdapter(nn.Module):
     def get_eot_embedding(self):
         return self.other_tokens(torch.tensor([2], device=self.other_tokens.weight.device)).squeeze(0)
 
-
 # https://huggingface.co/docs/transformers/v4.44.2/gguf
 # https://github.com/city96/ComfyUI-GGUF/issues/7
 # https://github.com/THUDM/ChatGLM-6B/issues/18
@@ -147,6 +148,15 @@ def load_text_model(model_name: str=MODEL_PATH, gguf_file: Union[str, None]=None
         from transformers import BitsAndBytesConfig
         nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                         bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.bfloat16)
+
+    if model_name == PIXTRAL_PATH:
+        from transformers import AutoProcessor, LlavaForConditionalGeneration
+        if is_nf4:
+            text_model = LlavaForConditionalGeneration.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16).eval()
+            image_adapter = AutoProcessor.from_pretrained(model_name, quantization_config=nf4_config, device_map=device, torch_dtype=torch.bfloat16)
+        else:
+            text_model = LlavaForConditionalGeneration.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
+            image_adapter = AutoProcessor.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16)
 
     print("Loading tokenizer")
     if gguf_file: tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file, use_fast=True, legacy=False)
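The new branch loads Pixtral through the standard LLaVA-style classes rather than JoyCaption's custom `ImageAdapter`, reusing the `image_adapter` variable to hold the `AutoProcessor`. A minimal standalone sketch of the 4-bit NF4 load path it adds, assuming `bitsandbytes` is installed and using `device_map="auto"` instead of the Space's explicit `device`:

```python
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration

PIXTRAL_PATH = "mistral-community/pixtral-12b"

# Same NF4 settings as in the diff: 4-bit weights, double quantization, bf16 compute.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = LlavaForConditionalGeneration.from_pretrained(
    PIXTRAL_PATH,
    quantization_config=nf4_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
).eval()
processor = AutoProcessor.from_pretrained(PIXTRAL_PATH)  # the processor itself needs no quantization settings
```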
@@ -286,7 +296,8 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str,
 
 @spaces.GPU()
 @torch.no_grad()
-def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: Union[str, int],
+def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: Union[str, int],
+                    max_new_tokens: int=300, top_p: float=0.9, temperature: float=0.6, model_name: str=MODEL_PATH, progress=gr.Progress(track_tqdm=True)) -> str:
     global use_inference_client, text_model
     torch.cuda.empty_cache()
     gc.collect()
@@ -312,8 +323,15 @@ def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: s
     prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
     print(f"Prompt: {prompt_str}")
 
+    # Pixtral
+    if model_name == PIXTRAL_PATH:
+        input_images = [input_image]
+        inputs = image_adapter(text=prompt_str, images=input_images, return_tensors="pt").to(device)
+        generate_ids = text_model.generate(**inputs, max_new_tokens=max_new_tokens)
+        output = image_adapter.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        return output.strip()
+
     # Preprocess image
-    #image = clip_processor(images=input_image, return_tensors='pt').pixel_values
     image = input_image.resize((384, 384), Image.LANCZOS)
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
     pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
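For context, the upstream mistral-community/pixtral-12b model card drives the processor with an instruction-formatted prompt containing an explicit [IMG] placeholder per image. A self-contained captioning sketch in that style (an illustration of the processor/generate/decode flow, not the Space's exact path, which passes prompt_str straight through):

```python
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

PIXTRAL_PATH = "mistral-community/pixtral-12b"
model = LlavaForConditionalGeneration.from_pretrained(
    PIXTRAL_PATH, device_map="auto", torch_dtype=torch.bfloat16
).eval()
processor = AutoProcessor.from_pretrained(PIXTRAL_PATH)

image = Image.open("example.jpg")  # hypothetical input image
# Instruction-style prompt with one [IMG] slot, following the model card.
prompt = "<s>[INST]Write a descriptive caption for this image.\n[IMG][/INST]"

inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)
with torch.no_grad():
    generate_ids = model.generate(**inputs, max_new_tokens=300)
caption = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(caption.strip())
```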
@@ -352,9 +370,6 @@ def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: s
     attention_mask = torch.ones_like(input_ids)
 
     text_model.to(device)
-    #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=False, suppress_tokens=None)
-    #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
-    #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None) # Uses the default which is temp=0.6, top_p=0.9
     generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens,
                                        do_sample=True, suppress_tokens=None, top_p=top_p, temperature=temperature)
 
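With the commented-out generate variants removed, sampling is now controlled entirely by the max_new_tokens, top_p, and temperature arguments coming from the UI sliders. The same settings could equivalently be bundled into a GenerationConfig; a small sketch of that alternative, not what the Space does:

```python
from transformers import GenerationConfig

# Equivalent sampling setup to the generate() call above, expressed as a config object.
gen_config = GenerationConfig(
    max_new_tokens=300,   # jc_tokens slider
    do_sample=True,
    top_p=0.9,            # jc_topp slider
    temperature=0.6,      # jc_temperature slider
)
# generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds,
#                                    attention_mask=attention_mask,
#                                    generation_config=gen_config)
```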