Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,19 +1,21 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
|
4 |
-
|
5 |
from PIL import Image
|
6 |
-
|
7 |
import requests
|
8 |
-
|
9 |
import torch
|
10 |
-
|
11 |
import spaces
|
12 |
|
13 |
-
# Load the processor and model
|
14 |
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
|
|
17 |
# Load the "tiiuae/falcon-11B-vlm" checkpoint with bfloat16 weights and move it to CUDA device 0.
model = LlavaNextForConditionalGeneration.from_pretrained("tiiuae/falcon-11B-vlm", torch_dtype=torch.bfloat16).to('cuda:0')
|
18 |
|
19 |
|
@@ -21,45 +23,50 @@ model = LlavaNextForConditionalGeneration.from_pretrained("tiiuae/falcon-11B-vlm
|
|
21 |
def generate_paragraph(image_url):
    """Download the image at *image_url* and have the model write a paragraph about it.

    Args:
        image_url: HTTP(S) URL of the image to describe.

    Returns:
        The decoded model output with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        ValueError: If *image_url* is empty or only whitespace.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Local import: the file's import block does not bring in io.
    from io import BytesIO

    if not image_url or not image_url.strip():
        raise ValueError("image_url must be a non-empty URL")

    # NOTE(review): `spaces` is imported at file level but unused in the visible
    # code; if this Space runs on ZeroGPU this function probably needs the
    # `@spaces.GPU` decorator -- confirm.

    # Bound the wait and fail loudly on 4xx/5xx instead of handing an error page to PIL.
    response = requests.get(image_url.strip(), timeout=30)
    response.raise_for_status()
    # response.content is fully read and content-decoded, unlike response.raw.
    cats_image = Image.open(BytesIO(response.content))

    instruction = 'Write a long paragraph about this picture.'
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(prompt, images=cats_image, return_tensors="pt", padding=True).to('cuda:0')

    output = model.generate(**inputs, max_new_tokens=256)
    generated_captions = processor.decode(output[0], skip_special_tokens=True).strip()

    return generated_captions
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
)
|
60 |
-
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
# Launch the Gradio interface
|
64 |
-
|
65 |
-
interface.launch()
|
|
|
1 |
import gradio as gr
|
|
|
2 |
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
|
|
|
3 |
from PIL import Image
|
|
|
4 |
import requests
|
|
|
5 |
import torch
|
|
|
6 |
import spaces
|
7 |
|
|
|
8 |
|
9 |
+
# Markdown rendered at the top of the demo page: the heading first, then the
# model blurb and community links.
title = """ # 🙋🏻‍♂️Welcome to Tonic's🦅Falcon Vision👁️Language Model !
"""

description = """
Falcon2-11B-vlm is an 11B parameters causal decoder-only model built by TII and trained on over 5,000B tokens of RefinedWeb enhanced with curated corpora. To bring vision capabilities, we integrate the pretrained CLIP ViT-L/14 vision encoder with our Falcon2-11B chat-finetuned model and train with image-text data. For enhancing the VLM's perception of fine-grained details w.r.t small objects in images, we employ a dynamic encoding mechanism at high-resolution for image inputs.

Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [Join us on Discord](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [MultiTonic](https://github.com/multitonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
|
17 |
|
18 |
+
# Processor and model are both loaded from the same Hub repository.
_CHECKPOINT = "tiiuae/falcon-11B-vlm"

processor = LlavaNextProcessor.from_pretrained(
    _CHECKPOINT,
    tokenizer_class='PreTrainedTokenizerFast',
)

# Load the weights in bfloat16, then move the model to CUDA device 0.
model = LlavaNextForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.bfloat16,
)
model = model.to('cuda:0')
|
20 |
|
21 |
|
|
|
23 |
def generate_paragraph(image_url):
    """Download the image at *image_url* and have the model write a paragraph about it.

    Args:
        image_url: HTTP(S) URL of the image to describe.

    Returns:
        The decoded model output with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        ValueError: If *image_url* is empty or only whitespace.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # Local import: the file's import block does not bring in io.
    from io import BytesIO

    if not image_url or not image_url.strip():
        raise ValueError("image_url must be a non-empty URL")

    # NOTE(review): `spaces` is imported at file level but unused in the visible
    # code; if this Space runs on ZeroGPU this function probably needs the
    # `@spaces.GPU` decorator -- confirm.

    # Bound the wait and fail loudly on 4xx/5xx instead of handing an error page to PIL.
    response = requests.get(image_url.strip(), timeout=30)
    response.raise_for_status()
    # response.content is fully read and content-decoded, unlike response.raw.
    cats_image = Image.open(BytesIO(response.content))

    instruction = 'Write a long paragraph about this picture.'
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(prompt, images=cats_image, return_tensors="pt", padding=True).to('cuda:0')

    output = model.generate(**inputs, max_new_tokens=256)
    generated_captions = processor.decode(output[0], skip_special_tokens=True).strip()

    return generated_captions
|
33 |
|
34 |
+
# Function to set the URL and generate the paragraph
|
35 |
+
def set_and_generate(url):
    """Generate a paragraph for *url* and hand the URL back alongside it.

    Returns:
        A ``(url, paragraph)`` tuple, so one Gradio event can fill both the
        URL textbox and the output textbox.
    """
    return url, generate_paragraph(url)
|
38 |
+
|
39 |
+
# Create the Gradio Blocks interface
def _example_handler(url):
    """Return a zero-argument callback that runs ``set_and_generate`` on *url*."""
    return lambda: set_and_generate(url)


with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        # First column: URL entry plus the trigger buttons.
        with gr.Column():
            image_url_input = gr.Textbox(label="Image URL")
            generate_button = gr.Button("Generate Paragraph")

            example_1 = gr.Button("Example 1")
            example_2 = gr.Button("Example 2")
            example_3 = gr.Button("Example 3")

        # Second column: the model's output.
        with gr.Column():
            generated_paragraph_output = gr.Textbox(label="Generated Paragraph")

    generate_button.click(
        generate_paragraph,
        inputs=image_url_input,
        outputs=generated_paragraph_output,
    )

    # Each example button writes its URL into the textbox and shows the
    # paragraph generated for it. The factory binds the URL per button,
    # avoiding the late-binding closure pitfall of a lambda in a loop.
    _example_wiring = (
        (example_1, "https://example.com/image1.jpg"),
        (example_2, "https://example.com/image2.jpg"),
        (example_3, "https://example.com/image3.jpg"),
    )
    for _button, _url in _example_wiring:
        _button.click(
            _example_handler(_url),
            outputs=[image_url_input, generated_paragraph_output],
        )

# Launch the Gradio interface
demo.launch()
|
|