Upload 2 files

joycaption.py (CHANGED, +6 -122)
@@ -33,8 +33,9 @@ use_inference_client = False
 PIXTRAL_PATH = "mistral-community/pixtral-12b"
 
 llm_models = {
-    "
+    "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2": None,
     #PIXTRAL_PATH: None,
+    "bunnycore/LLama-3.1-8B-Matrix": None,
     "Sao10K/Llama-3.1-8B-Stheno-v3.4": None,
     "unsloth/Meta-Llama-3.1-8B-bnb-4bit": None,
     "DevQuasar/HermesNova-Llama-3.1-8B": None,
@@ -157,6 +158,8 @@ def load_text_model(model_name: str=MODEL_PATH, gguf_file: Union[str, None]=None
     else:
         text_model = LlavaForConditionalGeneration.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16).eval()
         image_adapter = AutoProcessor.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16)
+        tokenizer = None
+        peft_config = None
 
     print("Loading tokenizer")
     if gguf_file: tokenizer = AutoTokenizer.from_pretrained(model_name, gguf_file=gguf_file, use_fast=True, legacy=False)
@@ -217,88 +220,10 @@ clip_model.eval().requires_grad_(False).to(device)
 load_text_model()
 
 @spaces.GPU()
-@torch.
-def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: Union[str, int]) -> str:
-    torch.cuda.empty_cache()
-
-    # 'any' means no length specified
-    length = None if caption_length == "any" else caption_length
-
-    if isinstance(length, str):
-        try:
-            length = int(length)
-        except ValueError:
-            pass
-
-    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
-    if caption_type == "rng-tags" or caption_type == "training_prompt":
-        caption_tone = "formal"
-
-    # Build prompt
-    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-    if prompt_key not in CAPTION_TYPE_MAP:
-        raise ValueError(f"Invalid caption type: {prompt_key}")
-
-    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(length=length, word_count=length)
-    print(f"Prompt: {prompt_str}")
-
-    # Preprocess image
-    #image = clip_processor(images=input_image, return_tensors='pt').pixel_values
-    image = input_image.resize((384, 384), Image.LANCZOS)
-    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
-    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-    pixel_values = pixel_values.to('cuda')
-
-    # Tokenize the prompt
-    prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
-
-    # Embed image
-    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-        image_features = vision_outputs.hidden_states
-        embedded_images = image_adapter(image_features)
-        embedded_images = embedded_images.to('cuda')
-
-    # Embed prompt
-    prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
-    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
-    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
-
-    # Construct prompts
-    inputs_embeds = torch.cat([
-        embedded_bos.expand(embedded_images.shape[0], -1, -1),
-        embedded_images.to(dtype=embedded_bos.dtype),
-        prompt_embeds.expand(embedded_images.shape[0], -1, -1),
-        eot_embed.expand(embedded_images.shape[0], -1, -1),
-    ], dim=1)
-
-    input_ids = torch.cat([
-        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
-        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
-        prompt,
-        torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
-    ], dim=1).to('cuda')
-    attention_mask = torch.ones_like(input_ids)
-
-    #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=False, suppress_tokens=None)
-    #generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)  # Uses the default which is temp=0.6, top_p=0.9
-
-    # Trim off the prompt
-    generate_ids = generate_ids[:, input_ids.shape[1]:]
-    if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
-        generate_ids = generate_ids[:, :-1]
-
-    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
-
-    return caption.strip()
-
-@spaces.GPU()
-@torch.no_grad()
+@torch.inference_mode()
 def stream_chat_mod(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: Union[str, int],
                     max_new_tokens: int=300, top_p: float=0.9, temperature: float=0.6, model_name: str=MODEL_PATH, progress=gr.Progress(track_tqdm=True)) -> str:
-    global
+    global tokenizer, text_model, image_adapter, peft_config, text_model_client, use_inference_client
     torch.cuda.empty_cache()
     gc.collect()
 
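Note: this hunk deletes the original stream_chat function and moves stream_chat_mod from @torch.no_grad() to @torch.inference_mode(). Both disable gradient tracking; inference_mode additionally marks outputs as inference tensors (no view/version-counter bookkeeping), which is usually a little faster and lighter for pure inference. A small self-contained illustration, not taken from the Space:

import torch

@torch.inference_mode()
def caption_step(x: torch.Tensor) -> torch.Tensor:
    # No autograd graph is recorded inside inference_mode.
    return x @ x.T

y = caption_step(torch.randn(4, 8))
print(y.requires_grad)        # False
print(torch.is_inference(y))  # True: y is an inference tensor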
@@ -476,44 +401,3 @@ def change_text_model(model_name: str=MODEL_PATH, use_client: bool=False, gguf_f
         return gr.update(choices=get_text_model())
     except Exception as e:
         raise gr.Error(f"Model load error: {model_name}, {e}")
-
-
-# original UI
-with gr.Blocks() as demo:
-    gr.HTML(TITLE)
-
-    with gr.Row():
-        with gr.Column():
-            input_image = gr.Image(type="pil", label="Input Image")
-
-            caption_type = gr.Dropdown(
-                choices=["descriptive", "training_prompt", "rng-tags"],
-                label="Caption Type",
-                value="descriptive",
-            )
-
-            caption_tone = gr.Dropdown(
-                choices=["formal", "informal"],
-                label="Caption Tone",
-                value="formal",
-            )
-
-            caption_length = gr.Dropdown(
-                choices=["any", "very short", "short", "medium-length", "long", "very long"] +
-                        [str(i) for i in range(20, 261, 10)],
-                label="Caption Length",
-                value="any",
-            )
-
-            gr.Markdown("**Note:** Caption tone doesn't affect `rng-tags` and `training_prompt`.")
-
-            run_button = gr.Button("Caption")
-
-        with gr.Column():
-            output_caption = gr.Textbox(label="Caption")
-
-    run_button.click(fn=stream_chat, inputs=[input_image, caption_type, caption_tone, caption_length], outputs=[output_caption])
-
-
-if __name__ == "__main__":
-    demo.launch()
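Note: with the original gr.Blocks demo (wired to the now-deleted stream_chat) removed, any UI in the file has to call stream_chat_mod instead. A minimal hypothetical Blocks layout showing that wiring; the real Space's UI is more elaborate, and the extra generation arguments simply fall back to their defaults.

import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type="pil", label="Input Image")
            caption_type = gr.Dropdown(["descriptive", "training_prompt", "rng-tags"], value="descriptive", label="Caption Type")
            caption_tone = gr.Dropdown(["formal", "informal"], value="formal", label="Caption Tone")
            caption_length = gr.Dropdown(["any", "very short", "short", "medium-length", "long", "very long"], value="any", label="Caption Length")
            run_button = gr.Button("Caption")
        with gr.Column():
            output_caption = gr.Textbox(label="Caption")

    # stream_chat_mod is defined earlier in joycaption.py; max_new_tokens, top_p,
    # temperature and model_name keep their default values here.
    run_button.click(
        fn=stream_chat_mod,
        inputs=[input_image, caption_type, caption_tone, caption_length],
        outputs=[output_caption],
    )

if __name__ == "__main__":
    demo.launch()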