taesiri committed
Commit 61b7eee (verified) · 1 parent: 73f9f45

Update app.py

Files changed (1): app.py (+52, -15)
app.py CHANGED
```diff
@@ -3,45 +3,81 @@ import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 import spaces
 
-model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to("cuda")
-processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
+# Dictionary of available CLIP models with their image sizes
+CLIP_MODELS = {
+    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
+    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
+    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
+    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
+}
+
+# Initialize models and processors
+models = {}
+processors = {}
+
+for model_name, (model_path, _) in CLIP_MODELS.items():
+    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
+    processors[model_name] = CLIPProcessor.from_pretrained(model_path)
 
 @spaces.GPU
-def calculate_score(image, text):
+def calculate_score(image, text, model_name):
     labels = text.split(";")
     labels = [l.strip() for l in labels]
     labels = list(filter(None, labels))
     if len(labels) == 0:
         return dict()
 
+    model = models[model_name]
+    processor = processors[model_name]
+
+    # Get the correct image size for the model
+    _, image_size = CLIP_MODELS[model_name]
+
+    # Preprocess the image and text
     inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
-    outputs = model(**inputs)
-    logits_per_image = outputs.logits_per_image.detach().cpu().numpy()
+    # Calculate scores
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits_per_image = outputs.logits_per_image.cpu().numpy()
 
     results_dict = {label: score / 100.0 for label, score in zip(labels, logits_per_image[0])}
     return results_dict
 
 with gr.Blocks() as demo:
-    gr.Markdown("# CLIP Score")
-    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text")
+    gr.Markdown("# Multi-Model CLIP Score")
+    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text using different CLIP model variants")
 
     with gr.Row():
-        image_input = gr.Image()
+        image_input = gr.Image(type="pil")
         output_label = gr.Label()
 
-    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
+    with gr.Row():
+        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
+        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")
+
+    def process_inputs(image, text, model_name):
+        if image is None or text.strip() == "":
+            return None
+        return calculate_score(image, text, model_name)
 
     image_input.change(
-        fn=calculate_score,
-        inputs=[image_input, text_input],
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
         outputs=output_label
     )
 
     text_input.submit(
-        fn=calculate_score,
-        inputs=[image_input, text_input],
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
+        outputs=output_label
+    )
+
+    model_dropdown.change(
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
         outputs=output_label
     )
 
@@ -50,10 +86,11 @@ with gr.Blocks() as demo:
             [
                 "cat.jpg",
                 "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
+                "ViT-B/16"
             ]
         ],
-        fn=calculate_score,
-        inputs=[image_input, text_input],
+        fn=process_inputs,
+        inputs=[image_input, text_input, model_dropdown],
         outputs=output_label,
     )
 
```
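For a quick sanity check of the new multi-model scoring path outside the Space, the sketch below reproduces what the updated `calculate_score` does for a single variant. It is not part of the commit: the CPU fallback and the local `cat.jpg` path are assumptions for local testing, and it presumes `torch` is importable (the new `with torch.no_grad():` block relies on an `import torch` in the unchanged first lines of app.py).

```python
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Standalone sketch of the scoring path for one variant ("ViT-B/16").
# The sample file name and CPU fallback are assumptions for local testing.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

image = Image.open("cat.jpg")
text = "a cat sitting; a cat standing; a cat is entering the void"
labels = [l.strip() for l in text.split(";") if l.strip()]

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    logits_per_image = model(**inputs).logits_per_image.cpu().numpy()

# Same convention as the Space: raw image-text logits scaled down by 100.
print({label: float(score) / 100.0 for label, score in zip(labels, logits_per_image[0])})
```

Dividing `logits_per_image` by 100 roughly undoes CLIP's learned logit scale, which sits near 100 for the released OpenAI checkpoints, so the values shown in the `gr.Label` component approximate image-text cosine similarities rather than a softmax-normalized probability over the labels.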