MadsGalsgaard committed on
Commit e2917dc
1 Parent(s): 89f97b0

Deployed model

Files changed (1)
  1. app.py +260 -260
app.py CHANGED
@@ -290,318 +290,318 @@
  ###########new clientkey 04 ruunng chlrhah


- import os
- import time
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- import gradio as gr
- from threading import Thread

- MODEL = "THUDM/LongWriter-llama3.1-8b"
-
- TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"
-
- PLACEHOLDER = """
- <center>
- <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
- </center>
- """
-
- CSS = """
- .duplicate-button {
-     margin: auto !important;
-     color: white !important;
-     background: black !important;
-     border-radius: 100vh !important;
- }
- h3 {
-     text-align: center;
- }
- """
-
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
- model = model.eval()
-
- @spaces.GPU()
- def stream_chat(
-     message: str,
-     history: list,
-     system_prompt: str,
-     temperature: float = 0.5,
-     max_new_tokens: int = 32768,
-     top_p: float = 1.0,
-     top_k: int = 50,
- ):
-     print(f'message: {message}')
-     print(f'history: {history}')
-
-     full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
-     for prompt, answer in history:
-         full_prompt += f"[INST]{prompt}[/INST]{answer}"
-     full_prompt += f"[INST]{message}[/INST]"
-
-     inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device)
-     context_length = inputs.input_ids.shape[-1]
-
-     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
-
-     generate_kwargs = dict(
-         inputs=inputs.input_ids,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         top_p=top_p,
-         top_k=top_k,
-         temperature=temperature,
-         num_beams=1,
-         streamer=streamer,
-     )
-
-     thread = Thread(target=model.generate, kwargs=generate_kwargs)
-     thread.start()

-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         yield buffer

- chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
-
- with gr.Blocks(css=CSS, theme="soft") as demo:
-     gr.HTML(TITLE)
-     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
-     gr.ChatInterface(
-         fn=stream_chat,
-         chatbot=chatbot,
-         fill_height=True,
-         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-         additional_inputs=[
-             gr.Textbox(
-                 value="You are a helpful assistant capable of generating long-form content.",
-                 label="System Prompt",
-                 render=False,
-             ),
-             gr.Slider(
-                 minimum=0,
-                 maximum=1,
-                 step=0.1,
-                 value=0.5,
-                 label="Temperature",
-                 render=False,
-             ),
-             gr.Slider(
-                 minimum=1024,
-                 maximum=32768,
-                 step=1024,
-                 value=32768,
-                 label="Max new tokens",
-                 render=False,
-             ),
-             gr.Slider(
-                 minimum=0.0,
-                 maximum=1.0,
-                 step=0.1,
-                 value=1.0,
-                 label="Top p",
-                 render=False,
-             ),
-             gr.Slider(
-                 minimum=1,
-                 maximum=100,
-                 step=1,
-                 value=50,
-                 label="Top k",
-                 render=False,
-             ),
-         ],
-         # examples=[
-         #     ["Write a 5000-word comprehensive guide on machine learning for beginners."],
-         #     ["Create a detailed 3000-word business plan for a sustainable energy startup."],
-         #     ["Compose a 2000-word short story set in a futuristic underwater city."],
-         #     ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
-         # ],
-         # cache_examples=False,
-     )
-
- if __name__ == "__main__":
-     demo.launch()



- # ###OCT04 LLAMA3.2 Vision Model
- # from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
- # from PIL import Image
- # import requests
- # import torch
- # from threading import Thread
- # import gradio as gr
- # from gradio import FileData
- # import time
- # import os
- # import spaces
- # from huggingface_hub import login
- # login(token=os.getenv("HF_API_TOKEN"))
- # # ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
- # # model = MllamaForConditionalGeneration.from_pretrained(ckpt,
- # # torch_dtype=torch.bfloat16).to("cuda")
- # # processor = AutoProcessor.from_pretrained(ckpt)


- # # @spaces.GPU
- # # def bot_streaming(message, history, max_new_tokens=250):
-
- # # txt = message["text"]
- # # ext_buffer = f"{txt}"
-
- # # messages= []
- # # images = []
-

- # # for i, msg in enumerate(history):
- # # if isinstance(msg[0], tuple):
- # # messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
- # # messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
- # # images.append(Image.open(msg[0][0]).convert("RGB"))
- # # elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
- # # # messages are already handled
- # # pass
- # # elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
- # # messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
- # # messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
-
- # # # add current message
- # # if len(message["files"]) == 1:
-
- # # if isinstance(message["files"][0], str): # examples
- # # image = Image.open(message["files"][0]).convert("RGB")
- # # else: # regular input
- # # image = Image.open(message["files"][0]["path"]).convert("RGB")
- # # images.append(image)
- # # messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
- # # else:
- # # messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
-
-
- # # texts = processor.apply_chat_template(messages, add_generation_prompt=True)
-
- # # if images == []:
- # # inputs = processor(text=texts, return_tensors="pt").to("cuda")
- # # else:
- # # inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
- # # streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-
- # # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
- # # generated_text = ""
-
- # # thread = Thread(target=model.generate, kwargs=generation_kwargs)
- # # thread.start()
- # # buffer = ""
-
- # # for new_text in streamer:
- # # buffer += new_text
- # # generated_text_without_prompt = buffer
- # # time.sleep(0.01)
- # # yield buffer
-
-
- # # demo = gr.ChatInterface(fn=bot_streaming, title="Multimodal Llama",
- # # textbox=gr.MultimodalTextbox(),
- # # additional_inputs = [gr.Slider(
- # # minimum=10,
- # # maximum=500,
- # # value=250,
- # # step=10,
- # # label="Maximum number of new tokens to generate",
- # # )
- # # ],
- # # cache_examples=False,
- # # description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
- # # stop_btn="Stop Generation",
- # # fill_height=True,
- # # multimodal=True)
-
- # # demo.launch(debug=True,live=True)

  # ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
- # model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
  # processor = AutoProcessor.from_pretrained(ckpt)

  # @spaces.GPU
- # def bot_streaming(message, history, max_new_tokens=1000):
  # txt = message["text"]
  # ext_buffer = f"{txt}"

- # messages = []
  # images = []

- # # Process history messages
  # for i, msg in enumerate(history):
  # if isinstance(msg[0], tuple):
  # messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
  # messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
  # images.append(Image.open(msg[0][0]).convert("RGB"))
  # elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
- # pass # Previous messages already handled
- # elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # Text-only turn
  # messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
  # messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

- # # Add current message
  # if len(message["files"]) == 1:
- # if isinstance(message["files"][0], str): # Example images
  # image = Image.open(message["files"][0]).convert("RGB")
- # else: # Regular input
  # image = Image.open(message["files"][0]["path"]).convert("RGB")
  # images.append(image)
  # messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
  # else:
  # messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

- # # Prepare input for the model
  # texts = processor.apply_chat_template(messages, add_generation_prompt=True)

- # if not images:
  # inputs = processor(text=texts, return_tensors="pt").to("cuda")
  # else:
  # inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
-
  # streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)

  # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
  # generated_text = ""

- # # Start text generation in a separate thread
  # thread = Thread(target=model.generate, kwargs=generation_kwargs)
  # thread.start()
  # buffer = ""

  # for new_text in streamer:
  # buffer += new_text
- # time.sleep(0.01) # Small delay to simulate streaming
  # yield buffer

- # # Gradio interface setup
- # demo = gr.ChatInterface(
- # fn=bot_streaming,
- # title="AreaX-Llama3.2-11B-Vision",
- # textbox=gr.MultimodalTextbox(),
- # additional_inputs=[
- # gr.Slider(
- # minimum=10,
- # maximum=500,
- # value=250,
- # step=10,
- # label="Maximum number of new tokens to generate",
- # )
- # ],
- # cache_examples=False,
- # description="Try AreaX Llama3.2-11B Vision Model by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply type your question.",
- # stop_btn="Stop Generation",
- # fill_height=True,
- # multimodal=True
- # )

- # demo.launch(debug=True,share=True)

  ###########new clientkey 04 ruunng chlrhah


+ # import os
+ # import time
+ # import spaces
+ # import torch
+ # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ # import gradio as gr
+ # from threading import Thread

+ # MODEL = "THUDM/LongWriter-llama3.1-8b"

+ # TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"

+ # PLACEHOLDER = """
+ # <center>
+ # <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
+ # </center>
+ # """

+ # CSS = """
+ # .duplicate-button {
+ # margin: auto !important;
+ # color: white !important;
+ # background: black !important;
+ # border-radius: 100vh !important;
+ # }
+ # h3 {
+ # text-align: center;
+ # }
+ # """

+ # device = "cuda" if torch.cuda.is_available() else "cpu"

+ # tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+ # model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
+ # model = model.eval()


+ # @spaces.GPU()
+ # def stream_chat(
+ # message: str,
+ # history: list,
+ # system_prompt: str,
+ # temperature: float = 0.5,
+ # max_new_tokens: int = 32768,
+ # top_p: float = 1.0,
+ # top_k: int = 50,
+ # ):
+ # print(f'message: {message}')
+ # print(f'history: {history}')
+
+ # full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
+ # for prompt, answer in history:
+ # full_prompt += f"[INST]{prompt}[/INST]{answer}"
+ # full_prompt += f"[INST]{message}[/INST]"
+
+ # inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device)
+ # context_length = inputs.input_ids.shape[-1]
+
+ # streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+
+ # generate_kwargs = dict(
+ # inputs=inputs.input_ids,
+ # max_new_tokens=max_new_tokens,
+ # do_sample=True,
+ # top_p=top_p,
+ # top_k=top_k,
+ # temperature=temperature,
+ # num_beams=1,
+ # streamer=streamer,
+ # )
+
+ # thread = Thread(target=model.generate, kwargs=generate_kwargs)
+ # thread.start()
+
+ # buffer = ""
+ # for new_text in streamer:
+ # buffer += new_text
+ # yield buffer
+
+ # chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
+
+ # with gr.Blocks(css=CSS, theme="soft") as demo:
+ # gr.HTML(TITLE)
+ # gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
+ # gr.ChatInterface(
+ # fn=stream_chat,
+ # chatbot=chatbot,
+ # fill_height=True,
+ # additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+ # additional_inputs=[
+ # gr.Textbox(
+ # value="You are a helpful assistant capable of generating long-form content.",
+ # label="System Prompt",
+ # render=False,
+ # ),
+ # gr.Slider(
+ # minimum=0,
+ # maximum=1,
+ # step=0.1,
+ # value=0.5,
+ # label="Temperature",
+ # render=False,
+ # ),
+ # gr.Slider(
+ # minimum=1024,
+ # maximum=32768,
+ # step=1024,
+ # value=32768,
+ # label="Max new tokens",
+ # render=False,
+ # ),
+ # gr.Slider(
+ # minimum=0.0,
+ # maximum=1.0,
+ # step=0.1,
+ # value=1.0,
+ # label="Top p",
+ # render=False,
+ # ),
+ # gr.Slider(
+ # minimum=1,
+ # maximum=100,
+ # step=1,
+ # value=50,
+ # label="Top k",
+ # render=False,
+ # ),
+ # ],
+ # # examples=[
+ # # ["Write a 5000-word comprehensive guide on machine learning for beginners."],
+ # # ["Create a detailed 3000-word business plan for a sustainable energy startup."],
+ # # ["Compose a 2000-word short story set in a futuristic underwater city."],
+ # # ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
+ # # ],
+ # # cache_examples=False,
+ # )
+
+ # if __name__ == "__main__":
+ # demo.launch()



+ # ###OCT04 LLAMA3.2 Vision Model
+ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
+ from PIL import Image
+ import requests
+ import torch
+ from threading import Thread
+ import gradio as gr
+ from gradio import FileData
+ import time
+ import os
+ import spaces
+ from huggingface_hub import login
+ login(token=os.getenv("HF_API_TOKEN"))
  # ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ # model = MllamaForConditionalGeneration.from_pretrained(ckpt,
+ # torch_dtype=torch.bfloat16).to("cuda")
  # processor = AutoProcessor.from_pretrained(ckpt)

+
  # @spaces.GPU
+ # def bot_streaming(message, history, max_new_tokens=250):
+
  # txt = message["text"]
  # ext_buffer = f"{txt}"

+ # messages= []
  # images = []

+
  # for i, msg in enumerate(history):
  # if isinstance(msg[0], tuple):
  # messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
  # messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
  # images.append(Image.open(msg[0][0]).convert("RGB"))
  # elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+ # # messages are already handled
+ # pass
+ # elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
  # messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
  # messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})

+ # # add current message
  # if len(message["files"]) == 1:
+
+ # if isinstance(message["files"][0], str): # examples
  # image = Image.open(message["files"][0]).convert("RGB")
+ # else: # regular input
  # image = Image.open(message["files"][0]["path"]).convert("RGB")
  # images.append(image)
  # messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
  # else:
  # messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

+
  # texts = processor.apply_chat_template(messages, add_generation_prompt=True)

+ # if images == []:
  # inputs = processor(text=texts, return_tensors="pt").to("cuda")
  # else:
  # inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
  # streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)

  # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
  # generated_text = ""

  # thread = Thread(target=model.generate, kwargs=generation_kwargs)
  # thread.start()
  # buffer = ""

  # for new_text in streamer:
  # buffer += new_text
+ # generated_text_without_prompt = buffer
+ # time.sleep(0.01)
  # yield buffer


+ # demo = gr.ChatInterface(fn=bot_streaming, title="Multimodal Llama",
+ # textbox=gr.MultimodalTextbox(),
+ # additional_inputs = [gr.Slider(
+ # minimum=10,
+ # maximum=500,
+ # value=250,
+ # step=10,
+ # label="Maximum number of new tokens to generate",
+ # )
+ # ],
+ # cache_examples=False,
+ # description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
+ # stop_btn="Stop Generation",
+ # fill_height=True,
+ # multimodal=True)
+
+ # demo.launch(debug=True,live=True)
+
+ ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
+ processor = AutoProcessor.from_pretrained(ckpt)
+
+ @spaces.GPU
+ def bot_streaming(message, history, max_new_tokens=1000):
+     txt = message["text"]
+     ext_buffer = f"{txt}"
+
+     messages = []
+     images = []
+
+     # Process history messages
+     for i, msg in enumerate(history):
+         if isinstance(msg[0], tuple):
+             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
+             images.append(Image.open(msg[0][0]).convert("RGB"))
+         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+             pass # Previous messages already handled
+         elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # Text-only turn
+             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+
+     # Add current message
+     if len(message["files"]) == 1:
+         if isinstance(message["files"][0], str): # Example images
+             image = Image.open(message["files"][0]).convert("RGB")
+         else: # Regular input
+             image = Image.open(message["files"][0]["path"]).convert("RGB")
+         images.append(image)
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
+     else:
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
+
+     # Prepare input for the model
+     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     if not images:
+         inputs = processor(text=texts, return_tensors="pt").to("cuda")
+     else:
+         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+
+     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
+     generated_text = ""
+
+     # Start text generation in a separate thread
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01) # Small delay to simulate streaming
+         yield buffer
+
+ # Gradio interface setup
+ demo = gr.ChatInterface(
+     fn=bot_streaming,
+     title="AreaX-Llama3.2-11B-Vision",
+     textbox=gr.MultimodalTextbox(),
+     additional_inputs=[
+         gr.Slider(
+             minimum=10,
+             maximum=500,
+             value=250,
+             step=10,
+             label="Maximum number of new tokens to generate",
+         )
+     ],
+     cache_examples=False,
+     description="Try AreaX Llama3.2-11B Vision Model by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply type your question.",
+     stop_btn="Stop Generation",
+     fill_height=True,
+     multimodal=True
+ )
+
+ demo.launch(debug=True,share=True)
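
For quick verification after this deploy, the snippet below is a minimal, hypothetical client-side sketch (it is not part of the commit) of how a Gradio ChatInterface Space like this one could be queried with the gradio_client package. The Space id is a placeholder, and the exact argument layout of the /chat endpoint should be confirmed with client.view_api() before relying on it.

# Hypothetical usage sketch, not part of app.py: call the deployed Space from Python.
# "<user>/<space-name>" is a placeholder; confirm the real signature with client.view_api().
from gradio_client import Client, handle_file

client = Client("<user>/<space-name>")  # placeholder Space id
print(client.view_api())                # list the exposed endpoints and their parameters

result = client.predict(
    {"text": "Describe this image.",               # multimodal message: text plus files
     "files": [handle_file("example.jpg")]},       # local path or URL to an image
    250,                                           # value for the max-new-tokens slider
    api_name="/chat",                              # endpoint exposed by gr.ChatInterface
)
print(result)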