File size: 4,451 Bytes
05e7387
 
 
 
 
 
 
 
106a6dd
 
 
 
 
 
7732f66
 
106a6dd
05e7387
106a6dd
05e7387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106a6dd
 
 
 
37204e2
 
c514b03
7732f66
106a6dd
37204e2
106a6dd
 
7732f66
37204e2
106a6dd
7732f66
106a6dd
7732f66
106a6dd
 
 
7732f66
 
c514b03
37204e2
8e769ae
 
 
37204e2
 
 
106a6dd
37204e2
c514b03
106a6dd
37204e2
 
 
106a6dd
 
37204e2
106a6dd
37204e2
bdc2a1b
106a6dd
37204e2
106a6dd
7732f66
106a6dd
37204e2
bdc2a1b
106a6dd
37204e2
106a6dd
7732f66
106a6dd
37204e2
bdc2a1b
106a6dd
37204e2
106a6dd
05e7387
c514b03
 
37204e2
 
 
 
 
 
c514b03
37204e2
 
 
 
 
 
 
 
c514b03
37204e2
 
 
 
 
 
c514b03
37204e2
 
7732f66
c514b03
bcd72e1
 
05e7387
c514b03
106a6dd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
from PIL import Image
import requests
import torch
import spaces


# Markdown heading rendered at the top of the demo page.
# The emoji were previously stored as mis-decoded UTF-8 (mojibake); they are
# restored here as real Unicode characters.
title = """  # 🙋🏻‍♂️Welcome to Tonic's🦅Falcon Vision👁️Language Model !
"""

# Markdown body rendered under the heading: a short summary of the model
# followed by the TeamTonic community links.
# NOTE(review): the globe emoji before "Github" was reconstructed from a
# partially lost byte sequence — confirm it is the intended glyph.
description = """
Falcon2-11B-vlm is an 11B parameters causal decoder-only model built by TII and trained on over 5,000B tokens of RefinedWeb enhanced with curated corpora. To bring vision capabilities, we integrate the pretrained CLIP ViT-L/14 vision encoder with our Falcon2-11B chat-finetuned model and train with image-text data. For enhancing the VLM's perception of fine-grained details w.r.t small objects in images, we employ a dynamic encoding mechanism at high-resolution for image inputs. 

### Join us : 
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [MultiTonic](https://github.com/multitonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

# The processor and the model weights are both pulled from this checkpoint.
_CHECKPOINT = "tiiuae/falcon-11B-vlm"

# Processor that turns (prompt, image) pairs into model inputs; the tokenizer
# class is requested explicitly by name.
processor = LlavaNextProcessor.from_pretrained(
    _CHECKPOINT,
    tokenizer_class='PreTrainedTokenizerFast',
)

# Load the weights in bfloat16, then move the model to the first CUDA device.
model = LlavaNextForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.bfloat16,
)
model = model.to('cuda:0')


def _load_image(image_url, timeout=30):
    """Download *image_url* and return it as an RGB ``PIL.Image``.

    Args:
        image_url: HTTP(S) address of the picture to describe.
        timeout: Seconds to wait for the remote host before giving up.

    Raises:
        gr.Error: If the URL is blank, the download fails (network error,
            timeout, non-2xx status) or the payload is not a readable image.
    """
    import io  # local import so this block does not depend on file-level edits

    url = (image_url or "").strip()
    if not url:
        raise gr.Error("Please provide an image URL.")

    try:
        # The timeout keeps a stalled host from blocking the worker forever;
        # raise_for_status() stops HTML error pages from being handed to PIL.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Use the fully read, content-decoded body instead of the raw socket
        # stream, which is neither decoded nor closed automatically.
        image = Image.open(io.BytesIO(response.content))
        # convert() forces the decode here, so corrupt files fail inside this
        # try-block, and hands the processor a uniform three-channel image.
        return image.convert("RGB")
    except requests.RequestException as err:
        raise gr.Error(f"Could not download the image: {err}") from err
    except OSError as err:
        # PIL reports unreadable or truncated images as OSError subclasses.
        raise gr.Error(f"The URL did not return a readable image: {err}") from err


@spaces.GPU
def generate_paragraph(image_url):
    """Describe the picture found at *image_url* with the Falcon VLM.

    Args:
        image_url: HTTP(S) address of the image to describe.

    Returns:
        The decoded, whitespace-stripped text produced by ``model.generate``
        (at most 256 new tokens).

    Raises:
        gr.Error: If the image cannot be downloaded or decoded.
    """
    image = _load_image(image_url)

    instruction = 'Write a long paragraph about this picture.'
    prompt = f"User:<image>\n{instruction} Falcon:"

    inputs = processor(prompt, images=image, return_tensors="pt", padding=True).to('cuda:0')
    output = model.generate(**inputs, max_new_tokens=256)

    # NOTE(review): the whole first sequence is decoded; for decoder-only
    # models this normally starts with the prompt tokens, so the returned text
    # probably echoes the instruction — confirm whether it should be trimmed.
    return processor.decode(output[0], skip_special_tokens=True).strip()

def set_and_generate(url):
    """Generate a paragraph for *url* and return it together with the URL.

    Returns:
        A ``(url, paragraph)`` tuple: the URL exactly as received, followed by
        the text returned by ``generate_paragraph(url)``.
    """
    return url, generate_paragraph(url)

# Build the Gradio Blocks interface.

# (button label, element id, image URL) for every clickable example.
# Each URL is used twice — as the CSS background of its button and as the
# input sent to the model — so it is declared once here to keep both in sync.
_EXAMPLES = (
    (
        "Types of Falcons",
        "example_1",
        "https://www.animalspot.net/wp-content/uploads/2020/01/Types-of-Falcons.jpg",
    ),
    (
        "Camel Racing - Saudi Arabia",
        "example_2",
        "https://www.leaders-mena.com/leaders/uploads/2023/01/The-Traditional-Camel-Racing-In-Saudi-Arabia-Unique-Sport-Activity-1024x576.jpg",
    ),
    (
        "Urban Scene - India",
        "example_3",
        "http://embed.robertharding.com/embed/1161-4342.jpg",
    ),
)

_BASE_CSS = ".thumbnail { width: 150px; height: 150px; object-fit: cover; }"


def _example_css(elem_id, url):
    """Return the CSS rule that paints *url* as the background of *elem_id*."""
    return (
        f"#{elem_id} {{\n"
        f'    background: url("{url}") no-repeat center center;\n'
        "    background-size: cover;\n"
        "}\n"
    )


def _example_handler(url):
    """Return a zero-argument click callback that runs the demo on *url*."""
    def handler():
        return set_and_generate(url)
    return handler


# The complete stylesheet is assembled up front and passed to the constructor
# rather than being appended to ``demo.css`` after the Blocks object exists.
_CSS = "\n".join(
    [_BASE_CSS, *(_example_css(elem_id, url) for _, elem_id, url in _EXAMPLES)]
)

with gr.Blocks(css=_CSS) as demo:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            image_url_input = gr.Textbox(label="Image URL")
            generate_button = gr.Button("Generate Paragraph")

            # Example buttons; their thumbnails come from the per-id CSS rules.
            example_buttons = [
                (gr.Button(label, elem_id=elem_id), url)
                for label, elem_id, url in _EXAMPLES
            ]

        with gr.Column():
            generated_paragraph_output = gr.Textbox(label="🦅Falcon Vision👁️")

    # Wire click events.
    generate_button.click(
        generate_paragraph,
        inputs=image_url_input,
        outputs=generated_paragraph_output,
    )

    for button, url in example_buttons:
        # A factory (not a bare closure over the loop variable) fixes the URL
        # per button; each click fills the URL box and the output box.
        button.click(
            _example_handler(url),
            outputs=[image_url_input, generated_paragraph_output],
        )


# Launch the Gradio interface.
demo.launch()