thak123 committed on
Commit
738ac11
1 Parent(s): ae9293a

Update app.py

Files changed (1)
  1. app.py +96 -5
app.py CHANGED
@@ -2,7 +2,13 @@ import numpy as np
 import os
 import gradio as gr
 import torch
+import torch.nn as nn
 from PIL import Image
+from transformers import CLIPModel, AutoModel
+from typing import Optional
+
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_model
 
 os.environ["WANDB_DISABLED"] = "true"
 
@@ -11,14 +17,81 @@ from transformers import (
     AutoConfig,
     AutoModelForSequenceClassification,
     AutoTokenizer,
-    TrainingArguments,
     logging,
-    pipeline
 )
 
+
+class VisionTextDualEncoderModel(nn.Module):
+    def __init__(self, num_classes):
+        super(VisionTextDualEncoderModel, self).__init__()
+
+        # Load the XLM-RoBERTa text encoder
+        self.text_encoder = AutoModel.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
+
+        # Load the CLIP vision encoder
+        self.vision_encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+
+        vision_output_dim = self.vision_encoder.config.vision_config.hidden_size
+
+
+        # Combine the two modalities with a linear classification head
+        self.fc = nn.Linear(
+            self.text_encoder.config.hidden_size + vision_output_dim, num_classes
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ):
+        # Encode text inputs
+        text_outputs = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+        ).pooler_output
+
+        # Encode vision inputs
+        vision_outputs = self.vision_encoder.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Concatenate text and vision features
+        combined_features = torch.cat(
+            (text_outputs, vision_outputs.pooler_output), dim=1
+        )
+
+        # Forward through a linear layer for classification
+        logits = self.fc(combined_features)
+
+        return {"logits": logits}
+
 id2label = {0: "negative", 1: "neutral", 2: "positive"}
 label2id = {"negative": 0, "neutral": 1, "positive": 2}
 
+tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
+
+model = VisionTextDualEncoderModel(num_classes=3)
+config = model.vision_encoder.config
+
+# https://huggingface.co/FFZG-cleopatra/M2SA/blob/main/model.safetensors
+sf_filename = hf_hub_download("FFZG-cleopatra/M2SA", filename="model.safetensors")
+
+load_model(model, sf_filename)  # model.load_state_dict(torch.load(model_args.model_name_or_path+"-finetuned/pytorch_model.bin"))
+
+
 model = AutoModelForSequenceClassification.from_pretrained(
     "FFZG-cleopatra/M2SA",
     num_labels=3, id2label=id2label,
@@ -28,11 +101,29 @@ model = AutoModelForSequenceClassification.from_pretrained(
 
 def predict_sentiment(text, image):
     print(text, image)
+    text_inputs = tokenizer(
+        text,
+        max_length=512,
+        padding="max_length",
+        truncation=True, return_tensors="pt",
+    )
+
+    image_transformations = Transform(
+        config.vision_config.image_size,
+        image_processor.image_mean,
+        image_processor.image_std,
+    )
+    image_transformations = torch.jit.script(image_transformations)
+    image = image_transformations(image)
+    model_input = {
+        "input_ids": text_inputs.input_ids,
+        "pixel_values": image.unsqueeze(0),
+        "attention_mask": text_inputs.attention_mask,
+    }
     prediction = None
     with torch.no_grad():
-        model(x)
-        print(analyzer(x))
-
+        prediction = model(**model_input)
+        print(prediction)
     return prediction
 
 
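For reference, the dual-encoder head added above can be sanity-checked with dummy inputs. This is only a sketch: the batch size, sequence length, and random tensors are illustrative, not part of the commit. XLM-R base and the CLIP ViT-B/32 vision tower both use a hidden size of 768, so the linear head receives 1536-dimensional fused features.

import torch

# Smoke test for the VisionTextDualEncoderModel defined in app.py above.
model = VisionTextDualEncoderModel(num_classes=3)
dummy_input = {
    "input_ids": torch.randint(0, 250002, (2, 16)),       # random XLM-R token ids
    "attention_mask": torch.ones(2, 16, dtype=torch.long),
    "pixel_values": torch.randn(2, 3, 224, 224),           # CLIP ViT-B/32 takes 224x224 images
}
with torch.no_grad():
    out = model(**dummy_input)
print(out["logits"].shape)  # torch.Size([2, 3])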
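The commit loads the fine-tuned weights with safetensors' load_model, which raises on any key mismatch when strict=True (the default). Because the class here is written by hand rather than generated by transformers, a non-strict load that reports mismatched keys can be a useful first check; a sketch, reusing the sf_filename downloaded above:

# Sketch: report key mismatches instead of failing outright.
# safetensors.torch.load_model returns (missing, unexpected) key lists.
missing, unexpected = load_model(model, sf_filename, strict=False)
print("missing keys:", missing)
print("unexpected keys:", unexpected)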
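predict_sentiment references a Transform module and an image_processor that this app.py never defines. A plausible reading, and only an assumption here, is that they mirror the torchvision preprocessing helper from the transformers contrastive-image-text example (run_clip.py), with the CLIP processor supplying the normalization statistics:

import torch
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from transformers import CLIPImageProcessor

class Transform(torch.nn.Module):
    # Assumed helper, modeled on transformers'
    # examples/pytorch/contrastive-image-text/run_clip.py.
    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size]),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x) -> torch.Tensor:
        with torch.no_grad():
            x = self.transforms(x)
        return x

# Assumed source of the image_mean / image_std statistics used in predict_sentiment.
image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")

Note that this pipeline operates on a CHW tensor, so an image arriving from gradio as PIL would first need torchvision.transforms.functional.pil_to_tensor before the scripted transform is applied.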
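Finally, predict_sentiment returns the raw logits dict; mapping it through the id2label table defined above yields per-class scores the Space can display. The gr.Interface wiring below is an assumption (the commit does not touch the interface code), shown only to make the example end-to-end:

import torch.nn.functional as F

def logits_to_scores(prediction):
    # Softmax over the three classes, keyed by id2label.
    probs = F.softmax(prediction["logits"], dim=-1)[0]
    return {id2label[i]: float(probs[i]) for i in range(probs.shape[0])}

# Assumed UI wiring: gr.Label renders the per-class score dict directly.
demo = gr.Interface(
    fn=lambda text, image: logits_to_scores(predict_sentiment(text, image)),
    inputs=[gr.Textbox(label="Tweet text"), gr.Image(type="pil")],
    outputs=gr.Label(num_top_classes=3),
)
demo.launch()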