from typing import List, Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

from ..video_map.video_map import VideoMap
from ..data.video_dataset import MoviepyVideoDataset, SequentialDataset
from ...utils.itertools_util import generate_sample_idxs


class ClipVisionFeatureExtractor:
    def __init__(self, model_name: str, local_file: bool = True, device: str = "cpu"):
        # A falsy device string triggers automatic selection.
        if device:
            self.device = device
        else:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.clip_model = (
            CLIPModel.from_pretrained(model_name, local_files_only=local_file)
            .eval()
            .to(self.device)
        )
        self.processor = CLIPProcessor.from_pretrained(
            model_name, local_files_only=local_file
        )

    def image(self, img_paths: List[str]) -> np.ndarray:
        """Extract L2-normalized CLIP image features for a list of image paths."""
        image = self.processor(
            images=[Image.open(i) for i in img_paths], return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            image_features = self.clip_model.get_image_features(**image)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features.detach().cpu().numpy()

    def __call__(self, image):
        return self.image(image)
    def predict_images(
        self,
        image: Union[str, List[str], Image.Image, List[Image.Image], torch.Tensor],
    ) -> np.ndarray:
        """Extract L2-normalized CLIP image features from paths, PIL images, or tensors."""
        # Normalize path inputs to a list of PIL images.
        if isinstance(image, str):
            image = [image]
        if isinstance(image, list) and isinstance(image[0], str):
            image = [Image.open(i) for i in image]
        image = self.processor(images=image, return_tensors="pt").to(self.device)
        with torch.no_grad():
            image_features = self.clip_model.get_image_features(**image)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features.detach().cpu().numpy()
    def predict_clip(
        self,
        clip: Union[List[Image.Image], torch.Tensor, np.ndarray],
        batch_size: int,
    ) -> np.ndarray:
        """Extract frame features for a whole clip in batches of `batch_size`."""
        features = []
        num = len(clip)
        # Consecutive, non-overlapping index windows over the frame sequence.
        windows = generate_sample_idxs(
            num, window_size=batch_size, step=batch_size, drop_last=False
        )
        for window in windows:
            # Indexing via a list comprehension also covers plain Python lists
            # of PIL images, which do not support fancy indexing.
            sub_clip = [clip[idx] for idx in window]
            feature = self.predict_images(sub_clip)
            features.append(feature)
        # Return the concatenated array, not the list of per-batch arrays.
        features = np.concatenate(features, axis=0)
        return features
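
# A minimal usage sketch. The checkpoint id and image paths below are
# illustrative assumptions, not part of this module:
#
#   extractor = ClipVisionFeatureExtractor(
#       "openai/clip-vit-large-patch14", local_file=False, device="cpu"
#   )
#   feats = extractor.predict_images(["frame_000.jpg", "frame_001.jpg"])
#   # Rows are L2-normalized, so feats @ feats.T gives cosine similarities.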


def predict_video(
    video: Union[str, SequentialDataset],
    video_map: VideoMap,
    vf_extractor,
    bbx_extr,
    time_size: Optional[int] = None,
    step: Optional[int] = None,
    overlap: Optional[int] = None,
    sample_rate: Optional[int] = None,
    drop_last: bool = False,
    max_frame_num_per_clip: int = 5,
):
    # Prepare the video dataset when a raw path is given.
    if isinstance(video, str):
        video = MoviepyVideoDataset(
            video,
            time_size=time_size,
            step=step,
            overlap=overlap,
            drop_last=drop_last,
            sample_rate=sample_rate,
        )
    # Keep the dataset's content box in sync with the video map's metadata.
    if video_map.meta_info.content_box != video.content_box:
        video.content_box = video_map.meta_info.content_box
    fps = 1
    max_frame_num = max_frame_num_per_clip
    select_frame_idx = []
    select_frame_clip = []
    for i in range(len(video_map.clipseq)):
        clip = video_map.clipseq[i]
        if clip["cliptype"] == "transition":
            continue
        # Sample about one frame per second, capped at max_frame_num per clip,
        # spaced evenly inside the clip with both endpoints excluded.
        select_frame_num = int(min(np.ceil(clip["duration"] * fps), max_frame_num))
        clip_total_frame_num = clip["frame_end"] - clip["frame_start"]
        frame_duration = clip_total_frame_num // (select_frame_num + 1)
        for j in range(select_frame_num):
            select_frame_idx.append(clip["frame_start"] + (j + 1) * frame_duration)
            select_frame_clip.append(i)
    return video_map
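
# Worked example for the sampling above: a 3.2 s non-transition clip at fps=1
# selects min(ceil(3.2 * 1), 5) = 4 frames; if the clip spans 100 frames,
# frame_duration = 100 // (4 + 1) = 20, so the sampled indices are
# frame_start + 20, 40, 60, and 80 (both endpoints excluded).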


class TaiyiVisionFeatureExtractor(ClipVisionFeatureExtractor):
    def __init__(
        self,
        model_name: str = "clip-vit-large-patch14",
        local_file: bool = True,
        device: str = "cpu",
    ):
        """CLIP vision feature extractor preconfigured for a Taiyi-style checkpoint.

        Args:
            model_name (str, optional): local checkpoint directory or hub id, e.g.
                "clip-vit-large-patch14" or "openai/clip-vit-large-patch14".
                Defaults to "clip-vit-large-patch14".
            local_file (bool, optional): if True, load weights from local files
                only (forwarded as `local_files_only`). Defaults to True.
            device (str, optional): torch device string such as "cpu" or "cuda";
                a falsy value triggers automatic selection. Defaults to "cpu".
        """
        super().__init__(model_name, local_file, device)
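

# A minimal smoke test, as a sketch. The hub id "openai/clip-vit-large-patch14"
# and the file "example.jpg" are illustrative assumptions, not part of this module:
if __name__ == "__main__":
    extractor = TaiyiVisionFeatureExtractor(
        model_name="openai/clip-vit-large-patch14", local_file=False, device="cpu"
    )
    feats = extractor.predict_images("example.jpg")
    # Features are L2-normalized, so dot products act as cosine similarities.
    print(feats.shape, float(np.linalg.norm(feats[0])))  # e.g. (1, 768) 1.0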