import itertools
import random

import cv2
import ffmpeg
import numpy as np
import torch
import torch.nn.functional as F
from moviepy.editor import VideoFileClip
from torch import nn

from .utils import get_frames


def _windowed_similarities(similarities, lookup_window):
    """Extract a band of width ``lookup_window`` around the diagonal.

    Args:
        similarities: tensor of shape [batch, time, time] holding pairwise
            frame similarities.
        lookup_window: odd number of neighbouring frames to keep per frame.

    Returns:
        Tensor of shape [batch, time, lookup_window]. Entry ``[b, t, k]`` is
        the similarity between frame ``t`` and frame
        ``t + k - (lookup_window - 1) // 2``; positions that fall outside the
        sequence are zero because of the zero padding.
    """
    batch_size, time_window = similarities.shape[0], similarities.shape[1]
    half = (lookup_window - 1) // 2
    padded = F.pad(similarities, [half, half])

    device = similarities.device
    # The three index tensors broadcast to [batch, time, lookup_window], so
    # there is no need to materialise them with ``repeat``.
    batch_indices = torch.arange(batch_size, device=device).view(batch_size, 1, 1)
    time_indices = torch.arange(time_window, device=device).view(1, time_window, 1)
    lookup_indices = (
        torch.arange(lookup_window, device=device).view(1, 1, lookup_window)
        + time_indices
    )
    return padded[batch_indices, time_indices, lookup_indices]


class TransNetV2(nn.Module):
    """Shot-transition detector built from stacked dilated 3D CNN blocks.

    The network combines the flattened CNN features with a learned
    frame-similarity feature and a colour-histogram similarity feature, then
    predicts two per-frame transition logits.

    Args:
        F: number of filters of the first stacked block (doubles per block).
            NOTE: this name shadows ``torch.nn.functional`` inside
            ``__init__`` only.
        L: number of stacked DDCNN blocks.
        S: number of DDCNN layers inside each stacked block.
        D: width of the hidden fully connected layer.
    """

    def __init__(self, F=16, L=3, S=2, D=1024):
        super().__init__()
        self.SDDCNN = nn.ModuleList(
            [
                StackedDDCNNV2(
                    in_filters=3, n_blocks=S, filters=F, stochastic_depth_drop_prob=0.0
                )
            ]
            + [
                StackedDDCNNV2(
                    in_filters=(F * 2 ** (i - 1)) * 4, n_blocks=S, filters=F * 2**i
                )
                for i in range(1, L)
            ]
        )
        # Frame-similarity branch: consumes the feature maps of every block.
        self.frame_sim_layer = FrameSimilarity(
            sum((F * 2**i) * 4 for i in range(L)),
            lookup_window=101,
            output_dim=128,
            similarity_dim=128,
            use_bias=True,
        )
        # Colour-histogram similarity branch: consumes the raw frames.
        self.color_hist_layer = ColorHistograms(lookup_window=101, output_dim=128)
        self.dropout = nn.Dropout(0.5)

        # Flattened CNN output: channels * H * W. The hard-coded 3 * 6 matches
        # a 27x48 input after three (1, 2, 2) poolings, i.e. the default L=3.
        output_dim = ((F * 2 ** (L - 1)) * 4) * 3 * 6
        output_dim += 128  # frame-similarity features
        output_dim += 128  # colour-histogram features

        self.fc1 = nn.Linear(output_dim, D)
        self.cls_layer1 = nn.Linear(D, 1)
        self.cls_layer2 = nn.Linear(D, 1)

    def forward(self, inputs):
        """Compute per-frame transition logits.

        Args:
            inputs: frames of shape [B, T, H, W, 3] with values in 0..255.
                The layer sizes assume (H, W) == (27, 48); ``torch.uint8`` is
                the intended dtype.

        Returns:
            Tuple ``(one_hot, many_hot)`` of logits, each of shape [B, T, 1].
        """
        # [B, T, H, W, 3] -> float [B, 3, T, H, W] in 0..1. The division is
        # deliberately NOT in place: for a float input ``.float()`` returns a
        # view of the caller's tensor, and scaling it in place would corrupt
        # the raw frames that the colour-histogram branch reads below.
        x = inputs.permute(0, 4, 1, 2, 3).float() / 255.0

        # Keep the output of every stacked block for the similarity branch.
        block_features = []
        for block in self.SDDCNN:
            x = block(x)
            block_features.append(x)

        # [B, C, T, H, W] -> [B, T, H, W, C] -> [B, T, H * W * C]
        x = x.permute(0, 2, 3, 4, 1)
        x = x.reshape(x.shape[0], x.shape[1], -1)

        # Prepend both similarity features along the feature dimension.
        x = torch.cat([self.frame_sim_layer(block_features), x], 2)
        x = torch.cat([self.color_hist_layer(inputs), x], 2)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)

        one_hot = self.cls_layer1(x)
        many_hot = self.cls_layer2(x)
        return one_hot, many_hot

    def predict_video(
        self,
        mp4_file,
        cache_path="",
        c_box=None,
        width=48,
        height=27,
        input_frames=100,
        overlap=30,
        sample_fps=30,
        threshold=0.3,
    ):
        """Run sliding-window inference over a video file.

        The video is resampled to ``sample_fps``, resized to
        ``(width, height)``, converted RGB -> BGR and fed to the network in
        windows of ``input_frames`` frames that overlap by ``overlap`` frames.
        Half of the overlap is discarded on each inner window edge so every
        sampled frame receives exactly one prediction.

        This method does not change the train/eval mode of the module; call
        ``model.eval()`` beforehand to disable dropout.

        Args:
            mp4_file: path of the video file.
            cache_path: unused; kept for backward compatibility.
            c_box: optional crop box, forwarded positionally to the clip's
                ``crop`` method.
            width: frame width fed to the network.
            height: frame height fed to the network.
            input_frames: number of frames per inference window.
            overlap: number of frames shared by consecutive windows (even).
            sample_fps: frame rate at which the video is sampled.
            threshold: unused; kept for backward compatibility.

        Returns:
            Tuple ``(single_frame_pred, all_frame_pred, fps, total_frames,
            duration, h, w)``. The first two are 1-D numpy arrays with one
            sigmoid probability per sampled frame; ``fps``, ``total_frames``,
            ``duration``, ``h`` and ``w`` describe the source video before
            cropping.

        Raises:
            AssertionError: if ``overlap`` is odd or not smaller than
                ``input_frames``.
            RuntimeError: if no frame can be decoded from the video.
        """
        assert overlap % 2 == 0
        assert input_frames > overlap

        stride = input_frames - overlap
        half_overlap = overlap // 2

        source_clip = VideoFileClip(mp4_file)
        try:
            fps = source_clip.fps
            duration = source_clip.duration
            total_frames = int(duration * fps)
            w, h = source_clip.size
            print(fps, duration, total_frames, w, h)

            # ``crop`` returns a new clip; the result must be used.
            clip = source_clip.crop(*c_box) if c_box else source_clip
            frame_iter = clip.iter_frames(fps=sample_fps)
            sample_total_frames = int(sample_fps * duration)

            # Run on whatever device the model lives on.
            device = next(self.parameters()).device

            single_chunks = []
            all_chunks = []
            frame_list = []
            with torch.no_grad():
                for i in range(sample_total_frames // stride + 1):
                    # Re-use the tail of the previous window as the overlap.
                    # (``frame_list[-0:]`` would keep the whole list.)
                    frame_list = frame_list[-overlap:] if overlap else []
                    start_frame = i * stride
                    end_frame = min(start_frame + input_frames, sample_total_frames)
                    expected = end_frame - start_frame
                    print("start_frame & end_frame: ", start_frame, end_frame)

                    missing = max(expected - len(frame_list), 0)
                    for frame in itertools.islice(frame_iter, missing):
                        frame = cv2.resize(frame, (width, height))
                        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                        frame_list.append(frame)

                    if not frame_list:
                        raise RuntimeError(
                            f"no frames could be decoded from {mp4_file!r}"
                        )

                    # The video stream can be shorter than the reported
                    # duration (e.g. audio outlasts the picture). Treat the
                    # short window as the final one.
                    ran_out = len(frame_list) < expected
                    if ran_out:
                        print(
                            "total_frames is wrong: ",
                            total_frames,
                            "-->",
                            start_frame + len(frame_list),
                        )

                    # uint8 [T, H, W, 3], as required by ``forward``.
                    frames = torch.from_numpy(np.stack(frame_list)).to(device)
                    single_logits, all_logits = self(frames.unsqueeze(0))
                    single_frame_pred = torch.sigmoid(single_logits).reshape(-1)
                    all_frame_pred = torch.sigmoid(all_logits).reshape(-1)

                    # Window positions are in sampled frames, so compare with
                    # the sampled frame count, not the source frame count.
                    is_last = ran_out or end_frame >= sample_total_frames
                    n_pred = single_frame_pred.shape[0]
                    lo = 0 if i == 0 else half_overlap
                    hi = n_pred if is_last else n_pred - half_overlap
                    single_chunks.append(single_frame_pred[lo:hi])
                    all_chunks.append(all_frame_pred[lo:hi])

                    if is_last:
                        break

            single_frame_pred_label = torch.cat(single_chunks, dim=0).cpu().numpy()
            all_frame_pred_label = torch.cat(all_chunks, dim=0).cpu().numpy()
        finally:
            source_clip.close()

        return (
            single_frame_pred_label,
            all_frame_pred_label,
            fps,
            total_frames,
            duration,
            h,
            w,
        )

    def predict_video_2(
        self,
        mp4_file,
        cache_path="",
        c_box=None,
        width=48,
        height=27,
        input_frames=100,
        overlap=30,
        sample_fps=30,
        threshold=0.3,
    ):
        """Alias of :meth:`predict_video`, kept for backward compatibility.

        Takes the same arguments and returns the same 7-tuple
        ``(single_frame_pred, all_frame_pred, fps, total_frames, duration,
        h, w)``.
        """
        return self.predict_video(
            mp4_file,
            cache_path=cache_path,
            c_box=c_box,
            width=width,
            height=height,
            input_frames=input_frames,
            overlap=overlap,
            sample_fps=sample_fps,
            threshold=threshold,
        )


class StackedDDCNNV2(nn.Module):
    """A stack of ``n_blocks`` DDCNN layers with a shortcut and spatial pooling.

    Args:
        in_filters: number of input channels.
        n_blocks: number of ``DilatedDCNNV2`` layers in the stack.
        filters: filters per dilation branch (each layer outputs
            ``4 * filters`` channels).
        shortcut: whether to add the first layer's output to the stack output.
        pool_type: ``"max"`` for max pooling, anything else for average pooling.
        stochastic_depth_drop_prob: probability of dropping the stack (keeping
            only the shortcut) during training.
    """

    def __init__(
        self,
        in_filters,
        n_blocks,
        filters,
        shortcut=True,
        pool_type="avg",
        stochastic_depth_drop_prob=0.0,
    ):
        super().__init__()
        self.shortcut = shortcut
        # The last layer has no activation: ReLU is applied in ``forward``
        # right before the shortcut is added.
        self.DDCNN = nn.ModuleList(
            [
                DilatedDCNNV2(
                    in_filters if i == 1 else filters * 4,
                    filters,
                    activation=F.relu if i != n_blocks else None,
                )
                for i in range(1, n_blocks + 1)
            ]
        )
        # Pool only over the spatial dimensions, never over time.
        self.pool = (
            nn.MaxPool3d(kernel_size=(1, 2, 2))
            if pool_type == "max"
            else nn.AvgPool3d(kernel_size=(1, 2, 2))
        )
        self.stochastic_depth_drop_prob = stochastic_depth_drop_prob

    def forward(self, inputs):
        """Apply the stack to a [B, C, T, H, W] tensor and halve H and W."""
        x = inputs
        shortcut = None

        for block in self.DDCNN:
            x = block(x)
            if shortcut is None:
                # The output of the first layer is the residual branch.
                shortcut = x

        x = F.relu(x)

        # ``self.shortcut`` is a flag, so test its truth value; the previous
        # ``is not None`` check made ``shortcut=False`` a no-op.
        if self.shortcut and shortcut is not None:
            if self.stochastic_depth_drop_prob != 0.0:
                if self.training:
                    if random.random() < self.stochastic_depth_drop_prob:
                        x = shortcut
                    else:
                        x = x + shortcut
                else:
                    # At inference, scale by the survival probability instead.
                    x = (1 - self.stochastic_depth_drop_prob) * x + shortcut
            else:
                x = x + shortcut

        x = self.pool(x)
        return x


class DilatedDCNNV2(nn.Module):
    """Four parallel 3D convolutions with temporal dilation 1, 2, 4 and 8.

    The branch outputs are concatenated along the channel dimension, giving
    ``4 * filters`` output channels, then optionally batch-normalised and
    activated.

    Args:
        in_filters: number of input channels.
        filters: number of output channels per dilation branch.
        batch_norm: whether to apply ``BatchNorm3d`` (convolutions then have
            no bias).
        activation: optional callable applied to the output.
    """

    def __init__(self, in_filters, filters, batch_norm=True, activation=None):
        super().__init__()
        use_bias = not batch_norm
        self.Conv3D_1 = Conv3DConfigurable(in_filters, filters, 1, use_bias=use_bias)
        self.Conv3D_2 = Conv3DConfigurable(in_filters, filters, 2, use_bias=use_bias)
        self.Conv3D_4 = Conv3DConfigurable(in_filters, filters, 4, use_bias=use_bias)
        self.Conv3D_8 = Conv3DConfigurable(in_filters, filters, 8, use_bias=use_bias)

        # NOTE(review): the attribute name was missing from the source; ``bn``
        # is the name used by the reference TransNetV2 implementation —
        # confirm it matches the state-dict keys of existing checkpoints.
        self.bn = nn.BatchNorm3d(filters * 4, eps=1e-3) if batch_norm else None
        self.activation = activation

    def forward(self, inputs):
        """Return the concatenated branch outputs for a [B, C, T, H, W] input."""
        branches = [
            self.Conv3D_1(inputs),
            self.Conv3D_2(inputs),
            self.Conv3D_4(inputs),
            self.Conv3D_8(inputs),
        ]
        x = torch.cat(branches, dim=1)

        if self.bn is not None:
            x = self.bn(x)
        if self.activation is not None:
            x = self.activation(x)
        return x


class Conv3DConfigurable(nn.Module):
    """3D convolution, optionally factorised as a (2+1)D convolution.

    Args:
        in_filters: number of input channels.
        filters: number of output channels.
        dilation_rate: dilation (and padding) along the time dimension.
        separable: if true, use a spatial (1, 3, 3) convolution followed by a
            temporal (3, 1, 1) convolution; otherwise a single 3x3x3 one.
        use_bias: whether the final convolution has a bias term.
    """

    def __init__(
        self, in_filters, filters, dilation_rate, separable=True, use_bias=True
    ):
        super().__init__()

        if separable:
            # (2+1)D convolution: spatial first, then dilated temporal.
            spatial_conv = nn.Conv3d(
                in_filters,
                2 * filters,
                kernel_size=(1, 3, 3),
                dilation=(1, 1, 1),
                padding=(0, 1, 1),
                bias=False,
            )
            temporal_conv = nn.Conv3d(
                2 * filters,
                filters,
                kernel_size=(3, 1, 1),
                dilation=(dilation_rate, 1, 1),
                padding=(dilation_rate, 0, 0),
                bias=use_bias,
            )
            self.layers = nn.ModuleList([spatial_conv, temporal_conv])
        else:
            conv = nn.Conv3d(
                in_filters,
                filters,
                kernel_size=3,
                dilation=(dilation_rate, 1, 1),
                padding=(dilation_rate, 1, 1),
                bias=use_bias,
            )
            self.layers = nn.ModuleList([conv])

    def forward(self, inputs):
        """Apply the configured convolution(s) in order."""
        x = inputs
        for layer in self.layers:
            x = layer(x)
        return x


class FrameSimilarity(nn.Module):
    """Learned cosine similarity between each frame and its neighbours.

    Args:
        in_filters: total number of channels across all input feature maps.
        similarity_dim: size of the projected embedding being compared.
        lookup_window: odd number of neighbouring frames compared per frame.
        output_dim: number of output features per frame.
        use_bias: whether the projection layer has a bias term.
    """

    def __init__(
        self,
        in_filters,
        similarity_dim=128,
        lookup_window=101,
        output_dim=128,
        use_bias=False,
    ):
        super().__init__()
        self.projection = nn.Linear(in_filters, similarity_dim, bias=use_bias)
        self.fc = nn.Linear(lookup_window, output_dim)
        self.lookup_window = lookup_window
        assert lookup_window % 2 == 1, "`lookup_window` must be odd integer"

    def forward(self, inputs):
        """Compute similarity features.

        Args:
            inputs: iterable of feature maps, each of shape [B, C_i, T, H_i, W_i].

        Returns:
            Tensor of shape [B, T, output_dim].
        """
        # Spatially average every feature map and stack channels: [B, sum(C_i), T].
        x = torch.cat([torch.mean(feat, dim=[3, 4]) for feat in inputs], dim=1)
        x = torch.transpose(x, 1, 2)  # [B, T, sum(C_i)]

        x = self.projection(x)
        x = F.normalize(x, p=2, dim=2)

        # Unit-norm embeddings, so the product is the cosine similarity: [B, T, T].
        similarities = torch.bmm(x, x.transpose(1, 2))
        similarities = _windowed_similarities(similarities, self.lookup_window)
        return F.relu(self.fc(similarities))


class ColorHistograms(nn.Module):
    """Similarity of 512-bin colour histograms between neighbouring frames.

    Args:
        lookup_window: odd number of neighbouring frames compared per frame.
        output_dim: number of output features per frame; ``None`` returns the
            raw similarities instead of passing them through a linear layer.
    """

    def __init__(self, lookup_window=101, output_dim=None):
        super().__init__()
        self.fc = (
            nn.Linear(lookup_window, output_dim) if output_dim is not None else None
        )
        self.lookup_window = lookup_window
        assert lookup_window % 2 == 1, "`lookup_window` must be odd integer"

    @staticmethod
    def compute_color_histograms(frames):
        """Return L2-normalised colour histograms of shape [B, T, 512].

        Args:
            frames: tensor of shape [B, T, H, W, 3]; values are expected to be
                in 0..255 so that every bin index falls in 0..511.
        """
        frames = frames.int()
        batch_size, time_window, height, width, no_channels = frames.shape
        assert no_channels == 3
        n_frames = batch_size * time_window

        # ``reshape`` (not ``view``) so non-contiguous inputs are accepted.
        pixels = frames.reshape(n_frames, height * width, 3)

        # Keep the top 3 bits of every channel -> 8 * 8 * 8 = 512 bins.
        red = pixels[:, :, 0] >> 5
        green = pixels[:, :, 1] >> 5
        blue = pixels[:, :, 2] >> 5
        binned_values = (red << 6) + (green << 3) + blue

        # Offset every frame by 512 so that one flat scatter-add builds all
        # histograms at once.
        frame_bin_prefix = (
            torch.arange(0, n_frames, device=frames.device) << 9
        ).view(-1, 1)
        binned_values = (binned_values + frame_bin_prefix).view(-1)

        histograms = torch.zeros(
            n_frames * 512, dtype=torch.int32, device=frames.device
        )
        histograms.scatter_add_(
            0,
            binned_values,
            torch.ones(len(binned_values), dtype=torch.int32, device=frames.device),
        )

        histograms = histograms.view(batch_size, time_window, 512).float()
        return F.normalize(histograms, p=2, dim=2)

    def forward(self, inputs):
        """Compute histogram-similarity features for [B, T, H, W, 3] frames.

        Returns:
            Tensor of shape [B, T, output_dim], or the raw
            [B, T, lookup_window] similarities when no ``output_dim`` was given.
        """
        x = self.compute_color_histograms(inputs)

        similarities = torch.bmm(x, x.transpose(1, 2))  # [B, T, T]
        similarities = _windowed_similarities(similarities, self.lookup_window)

        if self.fc is not None:
            return F.relu(self.fc(similarities))
        return similarities