taesiri committed on
Commit 3ccdd83 · 1 Parent(s): f93e53d
Files changed (1):
  1. app.py +15 -4
app.py CHANGED

@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 import spaces
@@ -34,18 +35,28 @@ def calculate_score(image, text, model_name):
     inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
-    # Calculate scores
+    # Calculate embeddings
     with torch.no_grad():
         outputs = model(**inputs)
+        image_embeds = outputs.image_embeds
+        text_embeds = outputs.text_embeds
 
-        logits_per_image = outputs.logits_per_image.cpu().numpy()
+    # Normalize embeddings
+    image_embeds = F.normalize(image_embeds, p=2, dim=1)
+    text_embeds = F.normalize(text_embeds, p=2, dim=1)
 
-    results_dict = {label: float(score) for label, score in zip(labels, logits_per_image[0])}
+    # Calculate cosine similarity
+    cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
+
+    # Convert to percentages
+    percentages = ((cosine_similarities + 1) / 2 * 100).cpu().numpy()
+
+    results_dict = {label: float(score) for label, score in zip(labels, percentages)}
     return results_dict
 
 with gr.Blocks() as demo:
     gr.Markdown("# Multi-Model CLIP Score")
-    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text using different CLIP model variants")
+    gr.Markdown("Calculate the CLIP score (cosine similarity) between the given image and text descriptions using different CLIP model variants")
 
     with gr.Row():
         image_input = gr.Image(type="pil")
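
For reference, here is a minimal self-contained sketch of the scoring path this commit introduces. The model variant, labels, placeholder image, and CPU fallback below are illustrative assumptions for running the logic outside the Space; only the embedding, normalization, and percentage steps come from the diff itself.

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Assumed model variant; the Space loads several and selects one by model_name.
model_name = "openai/clip-vit-base-patch32"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CLIPModel.from_pretrained(model_name).to(device)
processor = CLIPProcessor.from_pretrained(model_name)

image = Image.new("RGB", (224, 224), color="white")  # placeholder input image
labels = ["a cat", "a dog", "a car"]  # example text descriptions

inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

# L2-normalize so the dot product below is exactly cosine similarity.
image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)
text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)

# (num_texts, embed_dim) @ (embed_dim, num_images) -> (num_texts, 1); squeeze the image axis.
cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)

# Map the [-1, 1] cosine range onto a 0-100 percentage scale.
percentages = ((cosine_similarities + 1) / 2 * 100).cpu().numpy()

print({label: float(score) for label, score in zip(labels, percentages)})

The rescaling is the substantive change: the old logits_per_image values are cosine similarities multiplied by the model's learned logit_scale (on the order of 100 for the released OpenAI checkpoints), so they are unbounded and not comparable across model variants, whereas the mapped cosine score always lands in 0-100.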