import re

import torch
from transformers import BlipForConditionalGeneration, BlipProcessor

from internals.util.commons import download_image


class Image2Text:
    def load(self):
        # Load the BLIP captioning model and move it to the GPU in fp16.
        self.processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-large"
        )
        self.model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-large", torch_dtype=torch.float16
        ).to("cuda")

    def process(self, imageUrl: str) -> str:
        image = download_image(imageUrl).resize((512, 512))
        inputs = self.processor(image, return_tensors="pt").to("cuda", torch.float16)
        # Greedy decoding; top_p has no effect while do_sample=False.
        output_ids = self.model.generate(
            **inputs, do_sample=False, top_p=0.9, max_length=128
        )
        output_text = self.processor.batch_decode(output_ids)
        print(output_text)
        output_text = output_text[0]
        # Strip newlines and the tokenizer's [SEP] marker from the caption.
        output_text = re.sub(r"\n|\[SEP\]", "", output_text)
        return output_text
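
# Minimal usage sketch (assumptions: the URL below is a placeholder, and
# download_image returns a PIL image as the rest of this module expects;
# load() is presumably called once at startup by the serving framework).
if __name__ == "__main__":
    captioner = Image2Text()
    captioner.load()
    caption = captioner.process("https://example.com/sample.jpg")
    print(caption)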