from typing import Optional

import os

os.environ["WANDB_DISABLED"] = "true"

import gradio as gr
import torch
import torch.nn as nn
from torchvision.io import ImageReadMode, read_image
from torchvision.transforms import CenterCrop, ConvertImageDtype, Normalize, Resize
from torchvision.transforms.functional import InterpolationMode

from huggingface_hub import hf_hub_download
from safetensors.torch import load_model
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer, CLIPModel
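# Gradio demo for FFZG-cleopatra/M2SA: a dual-encoder classifier that pairs a multilingual
# XLM-RoBERTa text encoder with the CLIP ViT-B/32 vision tower and feeds the concatenated
# pooled features through a linear head over three sentiment classes
# (negative / neutral / positive).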
class Transform(torch.nn.Module):
    """Scriptable image preprocessing: resize, center-crop, convert to float, normalize."""

    def __init__(self, image_size, mean, std):
        super().__init__()
        self.transforms = torch.nn.Sequential(
            Resize([image_size], interpolation=InterpolationMode.BICUBIC),
            CenterCrop(image_size),
            ConvertImageDtype(torch.float),
            Normalize(mean, std),
        )

    def forward(self, x) -> torch.Tensor:
        """`x` is a `uint8` image tensor of shape (C, H, W), e.g. from `read_image`."""
        with torch.no_grad():
            x = self.transforms(x)
        return x
class VisionTextDualEncoderModel(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Text encoder: multilingual XLM-RoBERTa fine-tuned for Twitter sentiment
        self.text_encoder = AutoModel.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
        )
        # Vision encoder: the vision tower of CLIP ViT-B/32
        self.vision_encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        vision_output_dim = self.vision_encoder.config.vision_config.hidden_size
        # Classification head over the concatenated text and vision features
        self.fc = nn.Linear(
            self.text_encoder.config.hidden_size + vision_output_dim, num_classes
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ):
        # Encode the text and take the pooled representation
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        ).pooler_output
        # Encode the image with CLIP's vision tower
        vision_outputs = self.vision_encoder.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Concatenate the two modalities and classify
        combined_features = torch.cat(
            (text_outputs, vision_outputs.pooler_output), dim=1
        )
        logits = self.fc(combined_features)
        return {"logits": logits}
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
)
model = VisionTextDualEncoderModel(num_classes=3)
model.eval()  # inference only: disables dropout
config = model.vision_encoder.config

# Fine-tuned checkpoint:
# https://huggingface.co/FFZG-cleopatra/M2SA/blob/main/model.safetensors
sf_filename = hf_hub_download("FFZG-cleopatra/M2SA", filename="model.safetensors")
load_model(model, sf_filename)
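# `safetensors.torch.load_model` fills the module's parameters in place and, by default,
# raises on missing or unexpected keys, so a successful load means the checkpoint matches
# this architecture.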
# model.load_state_dict(torch.load(model_args.model_name_or_path+"-finetuned/pytorch_model.bin"))
# model = AutoModelForSequenceClassification.from_pretrained(
# "FFZG-cleopatra/M2SA",
# num_labels=3, id2label=id2label,
# label2id=label2id
# )
image_processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
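# The CLIP image processor supplies the normalization mean/std that `Transform` combines
# with the model's expected input resolution (config.vision_config.image_size) below.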
def predict_sentiment(text, image):
    # `image` is a file path, as provided by the Gradio `Image(type="filepath")` input.
    image = read_image(image, mode=ImageReadMode.RGB)

    text_inputs = tokenizer(
        text,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    image_transformations = Transform(
        config.vision_config.image_size,
        image_processor.image_mean,
        image_processor.image_std,
    )
    image_transformations = torch.jit.script(image_transformations)
    # Preprocess the image and add the batch dimension expected by the vision encoder.
    pixel_values = image_transformations(image).unsqueeze(0)

    with torch.no_grad():
        prediction = model(
            input_ids=text_inputs.input_ids,
            attention_mask=text_inputs.attention_mask,
            pixel_values=pixel_values,
        )

    # Map the highest-scoring logit to its sentiment label.
    predicted_class = prediction["logits"].argmax(dim=-1).item()
    return id2label[predicted_class]
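# Example invocation (using the sample image bundled with the Space):
#   predict_sentiment("I am enjoying", "A_Sep20_14_1189155141.jpg")
# returns one of "negative", "neutral" or "positive".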
interface = gr.Interface(
    fn=predict_sentiment,
    inputs=[gr.Textbox(), gr.Image(type="filepath")],
    outputs=["text"],
    title="Multilingual-Multimodal-Sentiment-Analysis",
    examples=[["I am enjoying", "A_Sep20_14_1189155141.jpg"]],
    description="Get the positive/neutral/negative sentiment for the given input.",
)
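# Launching blocks and serves the app; on Hugging Face Spaces this call exposes the hosted
# demo, while locally Gradio serves on its default port (7860).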
interface.launch(inline=False)