Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -2,7 +2,12 @@ import numpy as np
 import os
 import gradio as gr
 import torch
+import torch.nn as nn
 from PIL import Image
+from transformers import CLIPModel, AutoModel
+from typing import Optional
+
+from safetensors.torch import load_model
 
 os.environ["WANDB_DISABLED"] = "true"
 
@@ -11,14 +16,81 @@ from transformers import (
     AutoConfig,
     AutoModelForSequenceClassification,
     AutoTokenizer,
-    TrainingArguments,
     logging,
-    pipeline
 )
 
+
+class VisionTextDualEncoderModel(nn.Module):
+    def __init__(self, num_classes):
+        super(VisionTextDualEncoderModel, self).__init__()
+
+        # Load the XLM-RoBERTa text encoder
+        self.text_encoder = AutoModel.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
+
+        # Load the CLIP vision encoder
+        self.vision_encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+
+        vision_output_dim = self.vision_encoder.config.vision_config.hidden_size
+
+
+        # Combine the modalities with a linear classification head
+        self.fc = nn.Linear(
+            self.text_encoder.config.hidden_size + vision_output_dim, num_classes
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ):
+        # Encode text inputs
+        text_outputs = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+        ).pooler_output
+
+        # Encode vision inputs
+        vision_outputs = self.vision_encoder.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Concatenate text and vision features
+        combined_features = torch.cat(
+            (text_outputs, vision_outputs.pooler_output), dim=1
+        )
+
+        # Forward through a linear layer for classification
+        logits = self.fc(combined_features)
+
+        return {"logits": logits}
+
 id2label = {0: "negative", 1: "neutral", 2: "positive"}
 label2id = {"negative": 0, "neutral": 1, "positive": 2}
 
+tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual")
+
+model = VisionTextDualEncoderModel(num_classes=3)
+config = model.vision_encoder.config
+
+# https://huggingface.co/FFZG-cleopatra/M2SA/blob/main/model.safetensors
+sf_filename = hf_hub_download("FFZG-cleopatra/M2SA", filename="model.safetensors")
+
+load_model(model, sf_filename)  # model.load_state_dict(torch.load(model_args.model_name_or_path+"-finetuned/pytorch_model.bin"))
+
+
 model = AutoModelForSequenceClassification.from_pretrained(
     "FFZG-cleopatra/M2SA",
     num_labels=3, id2label=id2label,
@@ -28,11 +100,29 @@ model = AutoModelForSequenceClassification.from_pretrained(
 
 def predict_sentiment(text, image):
     print(text, image)
+    text_inputs = tokenizer(
+        text,
+        max_length=512,
+        padding="max_length",
+        truncation=True,
+    )
+
+    image_transformations = Transform(
+        config.vision_config.image_size,
+        image_processor.image_mean,
+        image_processor.image_std,
+    )
+    image_transformations = torch.jit.script(image_transformations)
+    image = image_transformations(image)
+    model_input = {
+        "input_ids": text_inputs.input_ids,
+        "pixel_values": image,
+        "attention_mask": text_inputs.attention_mask,
+    }
     prediction = None
     with torch.no_grad():
-        model(
-        print(
-
+        prediction = model(**model_input)
+        print(prediction)
     return prediction
 
 
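The new predict_sentiment calls Transform and image_processor, and the loading code above calls hf_hub_download, yet none of the hunks shown define or import them, which is consistent with the Space's "Runtime error" status. A minimal sketch of compatible definitions, assuming the Transform module from the transformers contrastive image-text example (run_clip.py) and the same CLIP checkpoint the vision encoder uses; these names and checkpoints are assumptions, not part of the commit:

from huggingface_hub import hf_hub_download
import torch
import torch.nn as nn
from torchvision.transforms import CenterCrop, ConvertImageDtype, InterpolationMode, Normalize, Resize
from transformers import AutoImageProcessor

# Assumption: the processor matches the CLIP checkpoint used by the vision encoder.
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")


class Transform(nn.Module):
    # Resize, crop, and normalize a tensor image, as in run_clip.py.
    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = nn.Sequential(
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Expects a CHW tensor, not a PIL image; convert Gradio's output first.
        with torch.no_grad():
            x = self.transforms(x)
        return x

This module is scriptable with torch.jit.script as the diff assumes, but the Gradio image arrives as a PIL image or NumPy array and must be converted to a tensor (e.g., with torchvision.transforms.functional.pil_to_tensor) before the scripted transform can run.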
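Even with those pieces defined, predict_sentiment as committed returns raw, unusable output: the tokenizer is called without return_tensors="pt", so it yields Python lists rather than tensors, and the unchanged lines after the loading block rebind model to the AutoModelForSequenceClassification checkpoint, discarding the dual encoder whose safetensors weights were just loaded. A hedged sketch of an inference path that maps the logits back to a label, assuming the VisionTextDualEncoderModel instance is the one kept and reusing tokenizer, image_processor, and id2label from above:

def predict_sentiment(text, image):
    # return_tensors="pt" makes the tokenizer emit tensors instead of lists.
    text_inputs = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # The image processor accepts PIL images directly, so the scripted
    # Transform pipeline is not required at inference time.
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values

    with torch.no_grad():
        # forward() takes keyword arguments, hence the explicit names.
        outputs = model(
            input_ids=text_inputs.input_ids,
            attention_mask=text_inputs.attention_mask,
            pixel_values=pixel_values,
        )
    # Map the argmax over the three logits back to a sentiment string.
    predicted_id = outputs["logits"].argmax(dim=-1).item()
    return id2label[predicted_id]

Returning the label string rather than the raw output dict also gives the Gradio interface something it can display directly.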