Upload 11 files
- lipreading/dataloaders.py +64 -0
- lipreading/dataset.py +176 -0
- lipreading/mixup.py +28 -0
- lipreading/model.py +153 -0
- lipreading/models/resnet.py +135 -0
- lipreading/models/resnet1D.py +143 -0
- lipreading/models/shufflenetv2.py +178 -0
- lipreading/models/tcn.py +255 -0
- lipreading/optim_utils.py +31 -0
- lipreading/preprocess.py +188 -0
- lipreading/utils.py +203 -0
lipreading/dataloaders.py
ADDED
@@ -0,0 +1,64 @@
import torch
import numpy as np
from lipreading.preprocess import *
from lipreading.dataset import MyDataset, pad_packed_collate


def get_preprocessing_pipelines(modality='video'):
    # -- preprocess for the video stream
    preprocessing = {}
    # -- LRW config
    if modality == 'video':
        crop_size = (88, 88)
        (mean, std) = (0.421, 0.165)
        # train:
        preprocessing['train'] = Compose([  # Compose() chains several preprocess steps; defined in preprocess.py
                                    Normalize(0.0, 255.0),
                                    RandomCrop(crop_size),
                                    HorizontalFlip(0.5),
                                    Normalize(mean, std)])

        preprocessing['val'] = Compose([
                                    Normalize(0.0, 255.0),
                                    CenterCrop(crop_size),
                                    Normalize(mean, std)])

        preprocessing['test'] = preprocessing['val']  # test and val share the same pipeline

    elif modality == 'raw_audio':

        preprocessing['train'] = Compose([
                                    AddNoise(noise=np.load('./data/babbleNoise_resample_16K.npy')),  # noise is added only for training
                                    NormalizeUtterance()])

        preprocessing['val'] = NormalizeUtterance()  # z-score normalization
        preprocessing['test'] = NormalizeUtterance()

    return preprocessing


def get_data_loaders(args):
    preprocessing = get_preprocessing_pipelines(args.modality)

    # create dataset object for each partition
    dsets = {partition: MyDataset(
                modality=args.modality,
                data_partition=partition,
                data_dir=args.data_dir,
                label_fp=args.label_path,
                annonation_direc=args.annonation_direc,
                preprocessing_func=preprocessing[partition],
                data_suffix='.npz'
                ) for partition in ['train', 'val', 'test']}

    dset_loaders = {x: torch.utils.data.DataLoader(
                        dsets[x],
                        batch_size=args.batch_size,
                        shuffle=True,
                        collate_fn=pad_packed_collate,
                        pin_memory=True,
                        num_workers=args.workers,
                        # note: this seeds numpy once at loader construction; the
                        # return value (None) is what is passed as worker_init_fn
                        worker_init_fn=np.random.seed(1)) for x in ['train', 'val', 'test']}

    return dset_loaders
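A minimal usage sketch, not part of the commit: the SimpleNamespace stand-in and the paths are hypothetical, but the field names mirror those read by get_data_loaders(), and it assumes an LRW-style directory layout exists under data_dir.

from types import SimpleNamespace
from lipreading.dataloaders import get_data_loaders

# Hypothetical argument object; field names match those accessed above.
args = SimpleNamespace(modality='video', data_dir='./datasets/visual_data',
                       label_path='./labels/500WordsSortedList.txt',
                       annonation_direc='./landmarks', batch_size=32, workers=8)
loaders = get_data_loaders(args)
for data, lengths, labels in loaders['train']:
    print(data.shape, labels.shape)  # e.g. (32, T_max, 88, 88) and (32,)
    break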
lipreading/dataset.py
ADDED
@@ -0,0 +1,176 @@
import os
import glob
import torch
import random
import librosa
import numpy as np
import sys
from lipreading.utils import read_txt_lines


# MyDataset as used in dataloaders.py:
# dsets = {partition: MyDataset(
#             modality=args.modality,
#             data_partition=partition,
#             data_dir=args.data_dir,
#             label_fp=args.label_path,
#             annonation_direc=args.annonation_direc,
#             preprocessing_func=preprocessing[partition],
#             data_suffix='.npz'
#             ) for partition in ['train', 'val', 'test']}


class MyDataset(object):

    def __init__(self, modality, data_partition, data_dir, label_fp, annonation_direc=None,
                 preprocessing_func=None, data_suffix='.npz'):
        assert os.path.isfile(label_fp), "File path provided for the labels does not exist. Path input: {}".format(label_fp)
        self._data_partition = data_partition
        self._data_dir = data_dir
        self._data_suffix = data_suffix

        self._label_fp = label_fp
        self._annonation_direc = annonation_direc

        self.fps = 25 if modality == "video" else 16000
        self.is_var_length = True
        self.label_idx = -3

        self.preprocessing_func = preprocessing_func

        self._data_files = []

        self.load_dataset()

    def load_dataset(self):

        # -- read the labels file
        self._labels = read_txt_lines(self._label_fp)

        # -- add examples to self._data_files
        self._get_files_for_partition()

        # -- from self._data_files to self.list
        self.list = dict()
        self.instance_ids = dict()

        for i, x in enumerate(self._data_files):
            label = self._get_label_from_path(x)
            self.list[i] = [x, self._labels.index(label)]
            self.instance_ids[i] = self._get_instance_id_from_path(x)

        print('Partition {} loaded'.format(self._data_partition))

    def _get_instance_id_from_path(self, x):
        # for now this works for npz/npys, might break for image folders
        instance_id = x.split('/')[-1]
        return os.path.splitext(instance_id)[0]

    def _get_label_from_path(self, x):
        return x.split('/')[self.label_idx]

    def _get_files_for_partition(self):  ##### check here!!
        # get rgb/mfcc file paths

        dir_fp = self._data_dir
        if not dir_fp:
            return

        # get npy/npz/mp4 files
        search_str_npz = os.path.join(dir_fp, '*', self._data_partition, '*.npz')  # npz: stores several arrays in one file
        search_str_npy = os.path.join(dir_fp, '*', self._data_partition, '*.npy')  # npy: stores a single numpy array
        search_str_mp4 = os.path.join(dir_fp, '*', self._data_partition, '*.mp4')
        self._data_files.extend(glob.glob(search_str_npz))  # list.extend(): append the npz file names to _data_files
        self._data_files.extend(glob.glob(search_str_npy))  # list.extend(): append the npy file names to _data_files
        self._data_files.extend(glob.glob(search_str_mp4))  # list.extend(): append the mp4 file names to _data_files

        # If we are not using the full set of labels, remove examples for labels not used
        self._data_files = [f for f in self._data_files if f.split('/')[self.label_idx] in self._labels]

    def load_data(self, filename):

        try:
            if filename.endswith('npz'):  # endswith(s): returns True/False whether the name ends with s
                # return np.load(filename, allow_pickle=True)['data']
                return np.load(filename)['data']
            elif filename.endswith('mp4'):
                return librosa.load(filename, sr=16000)[0][-19456:]
                # librosa.load() reads the waveform and normalizes it to [-1, 1].
                # sr is the sampling rate: for audio we speak of samples (in Hz)
                # rather than frames, and a higher sr means higher fidelity.
                # https://wiserloner.tistory.com/1194
                # 16,000 Hz is wideband speech, above the 8,000 Hz telephone narrowband (e.g. VoIP).
            else:
                return np.load(filename)
        except IOError:
            print("Error when reading file: {}".format(filename))
            sys.exit()

    def _apply_variable_length_aug(self, filename, raw_data):
        # read info txt file (to see duration of word, to be used to do temporal cropping)
        info_txt = os.path.join(self._annonation_direc, *filename.split('/')[self.label_idx:])  # swap base folder
        info_txt = os.path.splitext(info_txt)[0] + '.txt'  # swap extension
        info = read_txt_lines(info_txt)

        utterance_duration = float(info[4].split(' ')[1])
        half_interval = int(utterance_duration / 2.0 * self.fps)  # num frames of utterance / 2

        n_frames = raw_data.shape[0]
        mid_idx = (n_frames - 1) // 2  # video has n frames, mid point is (n-1)//2 as count starts with 0
        left_idx = random.randint(0, max(0, mid_idx - half_interval - 1))  # random.randint(a,b) chooses in [a,b]
        right_idx = random.randint(min(mid_idx + half_interval + 1, n_frames), n_frames)

        return raw_data[left_idx:right_idx]

    def __getitem__(self, idx):

        raw_data = self.load_data(self.list[idx][0])

        # -- perform variable length augmentation on the training set
        if (self._data_partition == 'train') and self.is_var_length:
            data = self._apply_variable_length_aug(self.list[idx][0], raw_data)
        else:
            data = raw_data

        preprocess_data = self.preprocessing_func(data)
        label = self.list[idx][1]

        return preprocess_data, label

    def __len__(self):
        return len(self._data_files)


def pad_packed_collate(batch):

    batch = np.array(batch, dtype=object)  # the samples have different lengths, so dtype=object is needed

    if len(batch) == 1:
        data, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])
        data = torch.FloatTensor(data)
        lengths = [data.size(1)]

    if len(batch) > 1:
        data_list, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])

        data_np = 0  # initialize data_np before the branches below

        if data_list[0].ndim == 3:
            max_len, h, w = data_list[0].shape  # since it is sorted, the longest video is the first one
            data_np = np.zeros((len(data_list), max_len, h, w))
        elif data_list[0].ndim == 1:
            max_len = data_list[0].shape[0]
            data_np = np.zeros((len(data_list), max_len))
        for idx in range(len(data_np)):
            data_np[idx][:data_list[idx].shape[0]] = data_list[idx]
        data = torch.FloatTensor(data_np)

    labels = torch.LongTensor(labels_np)

    return data, lengths, labels
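A quick check of pad_packed_collate on two variable-length clips (a sketch; the shapes are illustrative): the batch is sorted longest-first and the shorter clip is zero-padded to the longest length.

import numpy as np
from lipreading.dataset import pad_packed_collate

# Two video clips of different lengths (T, H, W), with labels 3 and 7.
batch = [(np.random.rand(29, 88, 88).astype(np.float32), 3),
         (np.random.rand(21, 88, 88).astype(np.float32), 7)]
data, lengths, labels = pad_packed_collate(batch)
print(data.shape)   # torch.Size([2, 29, 88, 88])
print(lengths)      # (29, 21), sorted longest-first
print(labels)       # tensor([3, 7])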
lipreading/mixup.py
ADDED
@@ -0,0 +1,28 @@
import torch
import numpy as np


# -- mixup data augmentation
# from https://github.com/hongyi-zhang/mixup/blob/master/cifar/utils.py
def mixup_data(x, y, alpha=1.0, soft_labels=None, use_cuda=False):
    '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda'''

    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)  # sample lambda from a Beta distribution
    else:
        lam = 1.

    batch_size = x.size()[0]
    if use_cuda:
        index = torch.randperm(batch_size).cuda()  # random permutation of the batch indices, placed on the GPU
    else:
        index = torch.randperm(batch_size)  # random permutation of the batch indices

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


# build the mixup loss from the two target sets
def mixup_criterion(y_a, y_b, lam):
    return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
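A training-step sketch showing how the two helpers combine; the inline linear layer and the random batch are hypothetical stand-ins for the real network and data.

import torch
import torch.nn as nn
from lipreading.mixup import mixup_data, mixup_criterion

criterion = nn.CrossEntropyLoss()
x, y = torch.randn(8, 512), torch.randint(0, 30, (8,))   # hypothetical batch
mixed_x, y_a, y_b, lam = mixup_data(x, y, alpha=0.4)
loss_fn = mixup_criterion(y_a, y_b, lam)
logits = nn.Linear(512, 30)(mixed_x)                     # stand-in for the model's forward pass
loss = loss_fn(criterion, logits)                        # lam-weighted mix of the two losses
loss.backward()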
lipreading/model.py
ADDED
@@ -0,0 +1,153 @@
import torch
import torch.nn as nn
import math
import numpy as np
from lipreading.models.resnet import ResNet, BasicBlock
from lipreading.models.resnet1D import ResNet1D, BasicBlock1D
from lipreading.models.shufflenetv2 import ShuffleNetV2
from lipreading.models.tcn import MultibranchTemporalConvNet, TemporalConvNet


# -- auxiliary functions
def threeD_to_2D_tensor(x):
    n_batch, n_channels, s_time, sx, sy = x.shape
    x = x.transpose(1, 2)
    return x.reshape(n_batch*s_time, n_channels, sx, sy)


def _average_batch(x, lengths, B):
    return torch.stack([torch.mean(x[index][:, 0:i], 1) for index, i in enumerate(lengths)], 0)


class MultiscaleMultibranchTCN(nn.Module):
    def __init__(self, input_size, num_channels, num_classes, tcn_options, dropout, relu_type, dwpw=False):
        super(MultiscaleMultibranchTCN, self).__init__()

        self.kernel_sizes = tcn_options['kernel_size']
        self.num_kernels = len(self.kernel_sizes)

        self.mb_ms_tcn = MultibranchTemporalConvNet(input_size, num_channels, tcn_options, dropout=dropout, relu_type=relu_type, dwpw=dwpw)
        self.tcn_output = nn.Linear(num_channels[-1], num_classes)

        self.consensus_func = _average_batch

    def forward(self, x, lengths, B):
        # x needs to have dimension (N, C, L) in order to be passed into CNN
        xtrans = x.transpose(1, 2)
        out = self.mb_ms_tcn(xtrans)
        out = self.consensus_func(out, lengths, B)
        return self.tcn_output(out)


class TCN(nn.Module):
    """Implements Temporal Convolutional Network (TCN)
    __https://arxiv.org/pdf/1803.01271.pdf
    """

    def __init__(self, input_size, num_channels, num_classes, tcn_options, dropout, relu_type, dwpw=False):
        super(TCN, self).__init__()
        self.tcn_trunk = TemporalConvNet(input_size, num_channels, dropout=dropout, tcn_options=tcn_options, relu_type=relu_type, dwpw=dwpw)
        self.tcn_output = nn.Linear(num_channels[-1], num_classes)

        self.consensus_func = _average_batch

        self.has_aux_losses = False

    def forward(self, x, lengths, B):
        # x needs to have dimension (N, C, L) in order to be passed into CNN
        x = self.tcn_trunk(x.transpose(1, 2))
        x = self.consensus_func(x, lengths, B)
        return self.tcn_output(x)


class Lipreading(nn.Module):
    def __init__(self, modality='video', hidden_dim=256, backbone_type='resnet', num_classes=30,
                 relu_type='prelu', tcn_options={}, width_mult=1.0, extract_feats=False):
        super(Lipreading, self).__init__()
        self.extract_feats = extract_feats
        self.backbone_type = backbone_type
        self.modality = modality

        if self.modality == 'raw_audio':
            self.frontend_nout = 1
            self.backend_out = 512
            self.trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type=relu_type)
        elif self.modality == 'video':
            if self.backbone_type == 'resnet':
                self.frontend_nout = 64
                self.backend_out = 512
                self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type)
            elif self.backbone_type == 'shufflenet':
                assert width_mult in [0.5, 1.0, 1.5, 2.0], "Width multiplier not correct"
                shufflenet = ShuffleNetV2(input_size=96, width_mult=width_mult)
                self.trunk = nn.Sequential(shufflenet.features, shufflenet.conv_last, shufflenet.globalpool)
                self.frontend_nout = 24
                self.backend_out = 1024 if width_mult != 2.0 else 2048
                self.stage_out_channels = shufflenet.stage_out_channels[-1]

            frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU()
            self.frontend3D = nn.Sequential(
                        nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
                        nn.BatchNorm3d(self.frontend_nout),
                        frontend_relu,
                        nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
        else:
            raise NotImplementedError

        tcn_class = TCN if len(tcn_options['kernel_size']) == 1 else MultiscaleMultibranchTCN
        self.tcn = tcn_class(input_size=self.backend_out,
                             num_channels=[hidden_dim*len(tcn_options['kernel_size'])*tcn_options['width_mult']]*tcn_options['num_layers'],
                             num_classes=num_classes,
                             tcn_options=tcn_options,
                             dropout=tcn_options['dropout'],
                             relu_type=relu_type,
                             dwpw=tcn_options['dwpw'],
                             )
        # -- initialize
        self._initialize_weights_randomly()

    def forward(self, x, lengths):
        if self.modality == 'video':
            B, C, T, H, W = x.size()
            x = self.frontend3D(x)
            Tnew = x.shape[2]  # output should be B x C2 x Tnew x H x W
            x = threeD_to_2D_tensor(x)
            x = self.trunk(x)
            if self.backbone_type == 'shufflenet':
                x = x.view(-1, self.stage_out_channels)
            x = x.view(B, Tnew, x.size(1))
        elif self.modality == 'raw_audio':
            B, C, T = x.size()
            x = self.trunk(x)
            x = x.transpose(1, 2)
            lengths = [_//640 for _ in lengths]

        return x if self.extract_feats else self.tcn(x, lengths, B)

    def _initialize_weights_randomly(self):

        use_sqrt = True

        if use_sqrt:
            def f(n):
                return math.sqrt(2.0/float(n))
        else:
            def f(n):
                return 2.0/float(n)

        for m in self.modules():
            if isinstance(m, nn.Conv3d) or isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d):
                n = np.prod(m.kernel_size) * m.out_channels
                m.weight.data.normal_(0, f(n))
                if m.bias is not None:
                    m.bias.data.zero_()

            elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

            elif isinstance(m, nn.Linear):
                n = float(m.weight.data[0].nelement())
                m.weight.data = m.weight.data.normal_(0, f(n))
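An instantiation sketch; the tcn_options values below are illustrative assumptions, not the repository's defaults. Three kernel sizes select the multibranch TCN path.

import torch
from lipreading.model import Lipreading

tcn_options = {'kernel_size': [3, 5, 7], 'num_layers': 4, 'dropout': 0.2,
               'dwpw': False, 'width_mult': 1}
net = Lipreading(modality='video', hidden_dim=256, backbone_type='resnet',
                 num_classes=500, relu_type='prelu', tcn_options=tcn_options)
clip = torch.randn(2, 1, 29, 88, 88)       # (B, C, T, H, W) grayscale mouth crops
logits = net(clip, lengths=[29, 29])
print(logits.shape)                        # torch.Size([2, 500])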
lipreading/models/resnet.py
ADDED
@@ -0,0 +1,135 @@
import math
import torch.nn as nn
import pdb  # Python debugger


# Conv2D (3,3)
def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


# Conv2D (1,1) + BatchNorm2D
def downsample_basic_block(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outplanes),
            )

# AvgPool2D + Conv2D (1,1) + BatchNorm2D
def downsample_basic_block_v2(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False),
                nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(outplanes),
            )


# basic 2D block
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type='relu'):
        super(BasicBlock, self).__init__()

        # guard the relu_type option: anything other than 'relu'/'prelu' raises an AssertionError
        assert relu_type in ['relu', 'prelu']

        self.conv1 = conv3x3(inplanes, planes, stride)  # Conv2D (3,3)
        self.bn1 = nn.BatchNorm2d(planes)               # BatchNorm2D

        # type of ReLU is an input option
        if relu_type == 'relu':
            self.relu1 = nn.ReLU(inplace=True)
            self.relu2 = nn.ReLU(inplace=True)
        elif relu_type == 'prelu':
            self.relu1 = nn.PReLU(num_parameters=planes)
            self.relu2 = nn.PReLU(num_parameters=planes)
        else:
            raise Exception('relu type not implemented')
        # --------

        self.conv2 = conv3x3(planes, planes)  # Conv2D (3,3)
        self.bn2 = nn.BatchNorm2d(planes)     # BatchNorm2D

        self.downsample = downsample
        self.stride = stride

    # forward propagation over a training batch
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu2(out)

        return out


# 2D ResNet
class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, relu_type='relu', gamma_zero=False, avg_pool_downsample=False):
        self.inplanes = 64
        self.relu_type = relu_type
        self.gamma_zero = gamma_zero
        self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block  # v2 adds AvgPool2D before the 1x1 conv

        super(ResNet, self).__init__()
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(1)

        # default init
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
                #nn.init.ones_(m.weight)
                #nn.init.zeros_(m.bias)

        if self.gamma_zero:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    m.bn2.weight.data.zero_()

    # build one stage of blocks
    def _make_layer(self, block, planes, blocks, stride=1):

        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = self.downsample_block(inplanes=self.inplanes,
                                               outplanes=planes * block.expansion,
                                               stride=stride)  # (AvgPool2D) + Conv2D (1,1) + BatchNorm2D

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, relu_type=self.relu_type))

        return nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return x
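A shape sanity check for the trunk (a sketch): this ResNet has no stem, so its input must already have 64 channels; with 88x88 mouth crops, the 3D frontend in model.py emits 64 channels at 22x22 per frame.

import torch
from lipreading.models.resnet import ResNet, BasicBlock

trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type='prelu')
feats = trunk(torch.randn(4, 64, 22, 22))   # (B*T, 64, 22, 22) as produced by frontend3D
print(feats.shape)                          # torch.Size([4, 512])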
lipreading/models/resnet1D.py
ADDED
@@ -0,0 +1,143 @@
import math
import torch.nn as nn
import pdb  # Python debugger


# Conv1D, kernel size 3
def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv1d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


# Conv1D (1) + BatchNorm1D
def downsample_basic_block(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.Conv1d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(outplanes),
            )

# AvgPool1D + Conv1D (1) + BatchNorm1D
def downsample_basic_block_v2(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.AvgPool1d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False),
                nn.Conv1d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm1d(outplanes),
            )


# basic 1D block
class BasicBlock1D(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type='relu'):
        super(BasicBlock1D, self).__init__()

        # guard the relu_type option: anything other than 'relu'/'prelu' raises an AssertionError
        assert relu_type in ['relu', 'prelu']

        self.conv1 = conv3x3(inplanes, planes, stride)  # Conv1D, kernel size 3
        self.bn1 = nn.BatchNorm1d(planes)               # BatchNorm1D

        # type of ReLU is an input option
        if relu_type == 'relu':
            self.relu1 = nn.ReLU(inplace=True)
            self.relu2 = nn.ReLU(inplace=True)
        elif relu_type == 'prelu':
            self.relu1 = nn.PReLU(num_parameters=planes)
            self.relu2 = nn.PReLU(num_parameters=planes)
        else:
            raise Exception('relu type not implemented')
        # --------

        self.conv2 = conv3x3(planes, planes)  # Conv1D, kernel size 3
        self.bn2 = nn.BatchNorm1d(planes)     # BatchNorm1D

        self.downsample = downsample
        self.stride = stride

    # forward propagation over a training batch
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu2(out)

        return out


# 1D ResNet
class ResNet1D(nn.Module):

    def __init__(self, block, layers, relu_type='relu'):
        super(ResNet1D, self).__init__()
        self.inplanes = 64
        self.relu_type = relu_type
        self.downsample_block = downsample_basic_block

        self.conv1 = nn.Conv1d(1, self.inplanes, kernel_size=80, stride=4, padding=38,
                               bias=False)  # Conv1D stem over the raw waveform
        self.bn1 = nn.BatchNorm1d(self.inplanes)
        # type of ReLU is an input option
        if relu_type == 'relu':
            self.relu = nn.ReLU(inplace=True)
        elif relu_type == 'prelu':
            self.relu = nn.PReLU(num_parameters=self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # For LRW, we downsample the sampling rate to 25fps
        self.avgpool = nn.AvgPool1d(kernel_size=21, padding=1)
        '''
        # The following pooling setting is the general configuration
        self.avgpool = nn.AvgPool1d(kernel_size=20, stride=20)
        '''

        # default init
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                n = m.kernel_size[0] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    # build one stage of blocks
    def _make_layer(self, block, planes, blocks, stride=1):

        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = self.downsample_block(inplanes=self.inplanes,
                                               outplanes=planes * block.expansion,
                                               stride=stride)  # Conv1D (1) + BatchNorm1D

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, relu_type=self.relu_type))

        return nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        return x
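A temporal-stride sketch (an observation, not part of the commit): the stride-4 stem and three stride-2 stages give an overall stride of 32 before pooling, and the final AvgPool1d brings the 16 kHz waveform down to roughly 25 feature frames per second, which is what the lengths = [_ // 640 for _ in lengths] rescaling in model.py assumes (16000 / 640 = 25).

import torch
from lipreading.models.resnet1D import ResNet1D, BasicBlock1D

trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type='prelu')
wav = torch.randn(1, 1, 19456)   # ~1.2 s of 16 kHz audio, as loaded in dataset.py
out = trunk(wav)
print(out.shape)                 # torch.Size([1, 512, 29]) -- about 25 frames per second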
lipreading/models/shufflenetv2.py
ADDED
@@ -0,0 +1,178 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from collections import OrderedDict
from torch.nn import init
import math

import pdb  # Python debugger


# Conv2D (3,3) + BatchNorm2D + ReLU
def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True)
    )


# Conv2D (1,1) + BatchNorm2D + ReLU
def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True)
    )


# reshape -> transpose -> flatten back
def channel_shuffle(x, groups):
    batchsize, num_channels, height, width = x.data.size()

    channels_per_group = num_channels // groups

    # reshape: split the channel dimension into (groups, channels_per_group)
    x = x.view(batchsize, groups,
               channels_per_group, height, width)

    x = torch.transpose(x, 1, 2).contiguous()  # transpose(): swap the two channel dims; contiguous(): copy to new memory

    # flatten back to (batchsize, num_channels, height, width)
    x = x.view(batchsize, -1, height, width)

    return x


# Inverted residual block - related model: MobileNetV2
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, benchmodel):
        super(InvertedResidual, self).__init__()
        self.benchmodel = benchmodel
        self.stride = stride

        # guard the stride option: only 1 and 2 are supported
        assert stride in [1, 2]

        oup_inc = oup//2

        if self.benchmodel == 1:
            #assert inp == oup_inc
            self.banch2 = nn.Sequential(
                # pw
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
                nn.BatchNorm2d(oup_inc),
                # pw-linear
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )
        else:
            self.banch1 = nn.Sequential(
                # dw
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                # pw-linear
                nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )

            self.banch2 = nn.Sequential(
                # pw
                nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
                nn.BatchNorm2d(oup_inc),
                # pw-linear
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )

    @staticmethod
    def _concat(x, out):
        # concatenate along channel axis
        return torch.cat((x, out), 1)

    # forward propagation over a training batch
    def forward(self, x):
        if 1 == self.benchmodel:
            x1 = x[:, :(x.shape[1]//2), :, :]
            x2 = x[:, (x.shape[1]//2):, :, :]
            out = self._concat(x1, self.banch2(x2))
        elif 2 == self.benchmodel:
            out = self._concat(self.banch1(x), self.banch2(x))

        return channel_shuffle(out, 2)


# ShuffleNet V2
class ShuffleNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=2.):
        super(ShuffleNetV2, self).__init__()

        # the input size must be divisible by 32, otherwise an AssertionError is raised
        assert input_size % 32 == 0, "Input size needs to be divisible by 32"

        self.stage_repeats = [4, 8, 4]
        # index 0 is invalid and should never be called.
        # only used for indexing convenience.
        if width_mult == 0.5:
            self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
        elif width_mult == 1.0:
            self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]
        elif width_mult == 1.5:
            self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
        elif width_mult == 2.0:
            self.stage_out_channels = [-1, 24, 244, 488, 976, 2048]
        else:
            raise ValueError(
                """Width multiplier should be in [0.5, 1.0, 1.5, 2.0]. Current value: {}""".format(width_mult))

        # building first layer
        input_channel = self.stage_out_channels[1]
        self.conv1 = conv_bn(3, input_channel, 2)  # Conv2D (3,3) + BatchNorm2D + ReLU
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.features = []
        # building inverted residual blocks
        for idxstage in range(len(self.stage_repeats)):
            numrepeat = self.stage_repeats[idxstage]
            output_channel = self.stage_out_channels[idxstage+2]
            for i in range(numrepeat):
                if i == 0:
                    # inp, oup, stride, benchmodel
                    self.features.append(InvertedResidual(input_channel, output_channel, 2, 2))
                else:
                    self.features.append(InvertedResidual(input_channel, output_channel, 1, 1))
                input_channel = output_channel

        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building last several layers
        self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1])  # Conv2D (1,1) + BatchNorm2D + ReLU
        self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32)))

        # building classifier (a single linear layer)
        self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class))

    # forward propagation over a training batch
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.features(x)
        x = self.conv_last(x)
        x = self.globalpool(x)
        x = x.view(-1, self.stage_out_channels[-1])
        x = self.classifier(x)
        return x
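A small check of channel_shuffle (a sketch): with groups=2, channels [0, 1, 2, 3] are interleaved to [0, 2, 1, 3], so information flows between the two branches after concatenation.

import torch
from lipreading.models.shufflenetv2 import channel_shuffle

x = torch.arange(4).float().view(1, 4, 1, 1)          # channels 0..3
print(channel_shuffle(x, 2).flatten().tolist())       # [0.0, 2.0, 1.0, 3.0]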
lipreading/models/tcn.py
ADDED
@@ -0,0 +1,255 @@
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
import pdb


"""Implements Temporal Convolutional Network (TCN)

__https://arxiv.org/pdf/1803.01271.pdf
"""

# trims the extra padded timesteps so the dilated convolution behaves causally
class Chomp1d(nn.Module):
    def __init__(self, chomp_size, symm_chomp):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size
        self.symm_chomp = symm_chomp
        if self.symm_chomp:
            assert self.chomp_size % 2 == 0, "If symmetric chomp, chomp size needs to be even"

    # forward propagation over a training batch
    def forward(self, x):
        if self.chomp_size == 0:
            return x
        if self.symm_chomp:
            return x[:, :, self.chomp_size//2:-self.chomp_size//2].contiguous()
        else:
            return x[:, :, :-self.chomp_size].contiguous()


# Conv1D + BatchNorm1D + Chomp1d + ReLU
class ConvBatchChompRelu(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, relu_type, dwpw=False):
        super(ConvBatchChompRelu, self).__init__()
        self.dwpw = dwpw
        if dwpw:
            self.conv = nn.Sequential(
                # -- dw
                nn.Conv1d(n_inputs, n_inputs, kernel_size, stride=stride,
                          padding=padding, dilation=dilation, groups=n_inputs, bias=False),
                nn.BatchNorm1d(n_inputs),
                Chomp1d(padding, True),
                nn.PReLU(num_parameters=n_inputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                # -- pw
                nn.Conv1d(n_inputs, n_outputs, 1, 1, 0, bias=False),
                nn.BatchNorm1d(n_outputs),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True)
            )
        else:
            self.conv = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                  stride=stride, padding=padding, dilation=dilation)
            self.batchnorm = nn.BatchNorm1d(n_outputs)
            self.chomp = Chomp1d(padding, True)
            self.non_lin = nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU()

    # forward propagation over a training batch
    def forward(self, x):
        if self.dwpw:
            return self.conv(x)
        else:
            out = self.conv(x)
            out = self.batchnorm(out)
            out = self.chomp(out)
            return self.non_lin(out)


# --------- MULTI-BRANCH VERSION ---------------
class MultibranchTemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_sizes, stride, dilation, padding, dropout=0.2,
                 relu_type='relu', dwpw=False):
        super(MultibranchTemporalBlock, self).__init__()

        self.kernel_sizes = kernel_sizes
        self.num_kernels = len(kernel_sizes)
        self.n_outputs_branch = n_outputs // self.num_kernels
        assert n_outputs % self.num_kernels == 0, "Number of output channels needs to be divisible by number of kernels"

        for k_idx, k in enumerate(self.kernel_sizes):
            cbcr = ConvBatchChompRelu(n_inputs, self.n_outputs_branch, k, stride, dilation, padding[k_idx], relu_type, dwpw=dwpw)
            setattr(self, 'cbcr0_{}'.format(k_idx), cbcr)  # register each branch as a module attribute
        self.dropout0 = nn.Dropout(dropout)

        for k_idx, k in enumerate(self.kernel_sizes):
            cbcr = ConvBatchChompRelu(n_outputs, self.n_outputs_branch, k, stride, dilation, padding[k_idx], relu_type, dwpw=dwpw)
            setattr(self, 'cbcr1_{}'.format(k_idx), cbcr)  # register each branch as a module attribute
        self.dropout1 = nn.Dropout(dropout)

        # downsample?
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if (n_inputs//self.num_kernels) != n_outputs else None

        # final relu
        if relu_type == 'relu':
            self.relu_final = nn.ReLU()
        elif relu_type == 'prelu':
            self.relu_final = nn.PReLU(num_parameters=n_outputs)

    # forward propagation over a training batch
    def forward(self, x):

        # first multi-branch set of convolutions
        outputs = []
        for k_idx in range(self.num_kernels):
            branch_convs = getattr(self, 'cbcr0_{}'.format(k_idx))
            outputs.append(branch_convs(x))
        out0 = torch.cat(outputs, 1)
        out0 = self.dropout0(out0)

        # second multi-branch set of convolutions
        outputs = []
        for k_idx in range(self.num_kernels):
            branch_convs = getattr(self, 'cbcr1_{}'.format(k_idx))
            outputs.append(branch_convs(out0))
        out1 = torch.cat(outputs, 1)
        out1 = self.dropout1(out1)

        # downsample?
        res = x if self.downsample is None else self.downsample(x)

        return self.relu_final(out1 + res)


class MultibranchTemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, tcn_options, dropout=0.2, relu_type='relu', dwpw=False):
        super(MultibranchTemporalConvNet, self).__init__()

        self.ksizes = tcn_options['kernel_size']

        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]

            padding = [(s-1)*dilation_size for s in self.ksizes]
            layers.append(MultibranchTemporalBlock(in_channels, out_channels, self.ksizes,
                          stride=1, dilation=dilation_size, padding=padding, dropout=dropout, relu_type=relu_type,
                          dwpw=dwpw))

        self.network = nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        return self.network(x)
# --------------------------------


# --------------- STANDARD VERSION (SINGLE BRANCH) ------------------------
class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2,
                 symm_chomp=False, no_padding=False, relu_type='relu', dwpw=False):
        super(TemporalBlock, self).__init__()

        self.no_padding = no_padding
        if self.no_padding:
            downsample_chomp_size = 2*padding-4
            padding = 1  # hack-ish thing so that we can use 3 layers

        if dwpw:
            self.net = nn.Sequential(
                # -- first conv set within block
                # -- dw
                nn.Conv1d(n_inputs, n_inputs, kernel_size, stride=stride,
                          padding=padding, dilation=dilation, groups=n_inputs, bias=False),
                nn.BatchNorm1d(n_inputs),
                Chomp1d(padding, True),
                nn.PReLU(num_parameters=n_inputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                # -- pw
                nn.Conv1d(n_inputs, n_outputs, 1, 1, 0, bias=False),
                nn.BatchNorm1d(n_outputs),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                nn.Dropout(dropout),
                # -- second conv set within block
                # -- dw
                nn.Conv1d(n_outputs, n_outputs, kernel_size, stride=stride,
                          padding=padding, dilation=dilation, groups=n_outputs, bias=False),
                nn.BatchNorm1d(n_outputs),
                Chomp1d(padding, True),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                # -- pw
                nn.Conv1d(n_outputs, n_outputs, 1, 1, 0, bias=False),
                nn.BatchNorm1d(n_outputs),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                nn.Dropout(dropout),
            )
        else:
            self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                   stride=stride, padding=padding, dilation=dilation)
            self.batchnorm1 = nn.BatchNorm1d(n_outputs)
            self.chomp1 = Chomp1d(padding, symm_chomp) if not self.no_padding else None
            if relu_type == 'relu':
                self.relu1 = nn.ReLU()
            elif relu_type == 'prelu':
                self.relu1 = nn.PReLU(num_parameters=n_outputs)
            self.dropout1 = nn.Dropout(dropout)

            self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                   stride=stride, padding=padding, dilation=dilation)
            self.batchnorm2 = nn.BatchNorm1d(n_outputs)
            self.chomp2 = Chomp1d(padding, symm_chomp) if not self.no_padding else None
            if relu_type == 'relu':
                self.relu2 = nn.ReLU()
            elif relu_type == 'prelu':
                self.relu2 = nn.PReLU(num_parameters=n_outputs)
            self.dropout2 = nn.Dropout(dropout)

            if self.no_padding:
                self.net = nn.Sequential(self.conv1, self.batchnorm1, self.relu1, self.dropout1,
                                         self.conv2, self.batchnorm2, self.relu2, self.dropout2)
            else:
                self.net = nn.Sequential(self.conv1, self.batchnorm1, self.chomp1, self.relu1, self.dropout1,
                                         self.conv2, self.batchnorm2, self.chomp2, self.relu2, self.dropout2)

        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        if self.no_padding:
            self.downsample_chomp = Chomp1d(downsample_chomp_size, True)
        if relu_type == 'relu':
            self.relu = nn.ReLU()
        elif relu_type == 'prelu':
            self.relu = nn.PReLU(num_parameters=n_outputs)

    # forward propagation over a training batch
    def forward(self, x):
        out = self.net(x)
        if self.no_padding:
            x = self.downsample_chomp(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


# the TCN model
class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, tcn_options, dropout=0.2, relu_type='relu', dwpw=False):
        super(TemporalConvNet, self).__init__()
        self.ksize = tcn_options['kernel_size'][0] if isinstance(tcn_options['kernel_size'], list) else tcn_options['kernel_size']
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers.append(TemporalBlock(in_channels, out_channels, self.ksize, stride=1, dilation=dilation_size,
                          padding=(self.ksize-1) * dilation_size, dropout=dropout, symm_chomp=True,
                          no_padding=False, relu_type=relu_type, dwpw=dwpw))

        self.network = nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        return self.network(x)
# --------------------------------
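A quick shape check plus the receptive-field intuition (a sketch, with illustrative options): with kernel size 3 and dilations 1, 2, 4, 8 (two convolutions per block), each output step sees 1 + 2*(3-1)*(1+2+4+8) = 61 input timesteps; the sequence length is preserved by the padding/chomp pairing.

import torch
from lipreading.models.tcn import TemporalConvNet

opts = {'kernel_size': [3]}                      # single-branch configuration
tcn = TemporalConvNet(num_inputs=512, num_channels=[256]*4, tcn_options=opts)
y = tcn(torch.randn(2, 512, 29))                 # (N, C, L) in
print(y.shape)                                   # torch.Size([2, 256, 29])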
lipreading/optim_utils.py
ADDED
@@ -0,0 +1,31 @@
import math
import torch
import torch.optim as optim


def change_lr_on_optimizer(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


class CosineScheduler:
    def __init__(self, lr_ori, epochs):
        self.lr_ori = lr_ori
        self.epochs = epochs

    def adjust_lr(self, optimizer, epoch):
        reduction_ratio = 0.5 * (1 + math.cos(math.pi * epoch / self.epochs))
        change_lr_on_optimizer(optimizer, self.lr_ori*reduction_ratio)


def get_optimizer(args, optim_policies):
    # -- define optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(optim_policies, lr=args.lr, weight_decay=1e-4)
    elif args.optimizer == 'adamw':
        optimizer = optim.AdamW(optim_policies, lr=args.lr, weight_decay=1e-2)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(optim_policies, lr=args.lr, weight_decay=1e-4, momentum=0.9)
    else:
        raise NotImplementedError
    return optimizer
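A usage sketch of the scheduler (the linear layer and hyperparameters are illustrative): the learning rate is annealed from lr_ori down to 0 over `epochs` following a half cosine, with adjust_lr called once per epoch.

import torch
from lipreading.optim_utils import CosineScheduler

model = torch.nn.Linear(10, 2)                       # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
scheduler = CosineScheduler(lr_ori=0.03, epochs=80)
for epoch in range(80):
    scheduler.adjust_lr(optimizer, epoch)
    # ... run one training epoch ...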
lipreading/preprocess.py
ADDED
@@ -0,0 +1,188 @@
1 |
+
import cv2
|
2 |
+
import random
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
__all__ = ['Compose', 'Normalize', 'CenterCrop', 'RgbToGray', 'RandomCrop',
|
6 |
+
'HorizontalFlip', 'AddNoise', 'NormalizeUtterance']
|
7 |
+
|
8 |
+
|
9 |
+
class Compose(object):
|
10 |
+
"""Compose several preprocess together.
|
11 |
+
Args:
|
12 |
+
preprocess (list of ``Preprocess`` objects): list of preprocess to compose.
|
13 |
+
"""
|
14 |
+
# preprecess ([preprocess]) : dataloaders.py에서 사용됨
|
15 |
+
# preprocessing['train'] = Compose([
|
16 |
+
# Normalize( 0.0,255.0 ),
|
17 |
+
# RandomCrop(crop_size),
|
18 |
+
# HorizontalFlip(0.5),
|
19 |
+
# Normalize(mean, std) ])
|
20 |
+
|
21 |
+
def __init__(self, preprocess):
|
22 |
+
self.preprocess = preprocess
|
23 |
+
|
24 |
+
def __call__(self, sample):
|
25 |
+
for t in self.preprocess:
|
26 |
+
sample = t(sample)
|
27 |
+
return sample # preprocess에 담긴 각 augmentation 전처리가 sample에 담겨 반환된다.
|
28 |
+
|
29 |
+
def __repr__(self): # __repr__() : 괄호 안에 있는 것을 문자열로 반환
|
30 |
+
format_string = self.__class__.__name__ + '('
|
31 |
+
for t in self.preprocess:
|
32 |
+
format_string += '\n'
|
33 |
+
format_string += ' {0}'.format(t)
|
34 |
+
format_string += '\n)'
|
35 |
+
return format_string # 클래스명, 전처리명 등을 괄호 안에 출력
|
36 |
+
|
37 |
+
|
38 |
+
class RgbToGray(object):
|
39 |
+
"""Convert image to grayscale.
|
40 |
+
Converts a numpy.ndarray (H x W x C) in the range
|
41 |
+
[0, 255] to a numpy.ndarray of shape (H x W x C) in the range [0.0, 1.0].
|
42 |
+
"""
|
43 |
+
|
44 |
+
def __call__(self, frames):
|
45 |
+
"""
|
46 |
+
Args:
|
47 |
+
img (numpy.ndarray): Image to be converted to gray.
|
48 |
+
Returns:
|
49 |
+
numpy.ndarray: grey image
|
50 |
+
"""
|
51 |
+
frames = np.stack([cv2.cvtColor(_, cv2.COLOR_RGB2GRAY) for _ in frames], axis=0)
|
52 |
+
return frames
|
53 |
+
|
54 |
+
def __repr__(self):
|
55 |
+
return self.__class__.__name__ + '()'
|
56 |
+
|
57 |
+
|
58 |
+
class Normalize(object):
|
59 |
+
"""Normalize a ndarray image with mean and standard deviation.
|
60 |
+
"""
|
61 |
+
|
62 |
+
def __init__(self, mean, std):
|
63 |
+
self.mean = mean
|
64 |
+
self.std = std
|
65 |
+
|
66 |
+
def __call__(self, frames):
|
67 |
+
"""
|
68 |
+
Args:
|
69 |
+
tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
|
70 |
+
Returns:
|
71 |
+
Tensor: Normalized Tensor image.
|
72 |
+
"""
|
73 |
+
frames = (frames - self.mean) / self.std # 편차를 표준 편차로 나눈 값 : z-score normalization
|
74 |
+
return frames
|
75 |
+
|
76 |
+
def __repr__(self):
|
77 |
+
return self.__class__.__name__+'(mean={0}, std={1})'.format(self.mean, self.std)
|
78 |
+
|
79 |
+
|
class CenterCrop(object):
    """Crop the given frames at the center.
    """
    def __init__(self, size):
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): frames to be cropped.
        Returns:
            numpy.ndarray: cropped frames.
        """
        t, h, w = frames.shape
        th, tw = self.size  # target crop height and width
        delta_w = int(round((w - tw) / 2.))
        delta_h = int(round((h - th) / 2.))
        frames = frames[:, delta_h:delta_h+th, delta_w:delta_w+tw]
        return frames  # center-cropped frames (np.ndarray)

class RandomCrop(object):
    """Crop the given frames at a random location.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): frames to be cropped.
        Returns:
            numpy.ndarray: cropped frames.
        """
        t, h, w = frames.shape  # e.g. 96x96 input frames
        th, tw = self.size
        delta_w = random.randint(0, w - tw)
        delta_h = random.randint(0, h - th)
        frames = frames[:, delta_h:delta_h+th, delta_w:delta_w+tw]
        return frames  # randomly cropped frames (np.ndarray)

    def __repr__(self):
        return self.__class__.__name__ + '(size={0})'.format(self.size)  # reports the configured crop size

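To make the two crop policies concrete: with the 96 -> 88 setting above, CenterCrop always uses offset (4, 4), while RandomCrop draws each offset uniformly from [0, 8]. A small sketch; the zero-filled frames are a stand-in:

import numpy as np

frames = np.zeros((29, 96, 96), dtype=np.float32)
center = CenterCrop((88, 88))(frames)  # deterministic window, rows/cols 4..91
rand = RandomCrop((88, 88))(frames)    # stochastic window, offsets in {0, ..., 8}
assert center.shape == rand.shape == (29, 88, 88)
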
class HorizontalFlip(object):  # HorizontalFlip(flip probability)
    """Flip frames horizontally.
    """

    def __init__(self, flip_ratio):
        self.flip_ratio = flip_ratio

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): frames to be flipped with probability flip_ratio.
        Returns:
            numpy.ndarray: flipped frames.
        """
        t, h, w = frames.shape
        if random.random() < self.flip_ratio:
            for index in range(t):
                frames[index] = cv2.flip(frames[index], 1)  # 1 = flip around the vertical axis
        return frames

class NormalizeUtterance():
    """Normalize a raw audio utterance by removing the mean and dividing by the standard deviation
    """
    # performs z-score normalization per utterance

    def __call__(self, signal):
        # guard against division by zero for an all-constant signal
        signal_std = 1. if np.std(signal) == 0. else np.std(signal)
        signal_mean = np.mean(signal)
        return (signal - signal_mean) / signal_std

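A quick sanity check of the utterance normalization; the Gaussian waveform is synthetic:

import numpy as np

signal = np.random.randn(16000).astype(np.float32) * 3.0 + 5.0  # mean ~5, std ~3
norm = NormalizeUtterance()(signal)
print(round(float(norm.mean()), 3), round(float(norm.std()), 3))  # ~0.0, ~1.0
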
class AddNoise(object):
    """Add noise to the waveform at a randomly sampled SNR level; the signal is assumed to lie in [-1, 1].
    """
    # SNR (signal-to-noise ratio): the larger the value, the weaker the added
    # noise; the sentinel level 9999 leaves the signal clean.

    def __init__(self, noise, snr_levels=[-5, 0, 5, 10, 15, 20, 9999]):
        assert noise.dtype in [np.float32, np.float64], "noise only supports float data types"

        self.noise = noise
        self.snr_levels = snr_levels

    def get_power(self, clip):
        clip2 = clip.copy()
        clip2 = clip2 ** 2
        return np.sum(clip2) / (len(clip2) * 1.0)  # mean of the squared samples

    def __call__(self, signal):
        assert signal.dtype in [np.float32, np.float64], "signal only supports float data types"
        snr_target = random.choice(self.snr_levels)
        if snr_target == 9999:
            return signal
        else:
            # -- get noise
            start_idx = random.randint(0, len(self.noise) - len(signal))
            noise_clip = self.noise[start_idx:start_idx + len(signal)]

            sig_power = self.get_power(signal)
            noise_clip_power = self.get_power(noise_clip)
            factor = (sig_power / noise_clip_power) / (10 ** (snr_target / 10.0))
            desired_signal = (signal + noise_clip * np.sqrt(factor)).astype(np.float32)
            return desired_signal
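The scaling factor follows from the SNR definition SNR_dB = 10 * log10(P_signal / P_noise): dividing the power ratio by 10**(snr/10) and scaling the noise by sqrt(factor) makes the mix hit the target level. A short check, with synthetic white noise standing in for the babbleNoise_resample_16K.npy file; both arrays are made up for illustration:

import numpy as np

rng = np.random.default_rng(0)
signal = rng.standard_normal(16000).astype(np.float32)
noise = rng.standard_normal(48000).astype(np.float32)  # stand-in for the babble-noise file

adder = AddNoise(noise, snr_levels=[5])  # force a 5 dB mix
noisy = adder(signal)
residual = noisy - signal                # recover the injected, scaled noise
snr_db = 10 * np.log10(adder.get_power(signal) / adder.get_power(residual))
print(round(float(snr_db), 2))  # ~5.0
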
lipreading/utils.py
ADDED
@@ -0,0 +1,203 @@
import os
import json
import numpy as np

import datetime
import logging

import torch
import shutil


def calculateNorm2(model):
    # accumulate the squared L2 norm of every parameter tensor, then take the root
    para_norm = 0.
    for p in model.parameters():
        para_norm += p.data.norm(2) ** 2
    print('2-norm of the neural network: {:.4f}'.format(para_norm ** .5))


def showLR(optimizer):
    return optimizer.param_groups[0]['lr']  # learning rate of the first parameter group


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val          # most recent value
        self.sum += val * n     # running weighted sum
        self.count += n
        self.avg = self.sum / self.count


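As used in the training loop, update(val, n) treats val as a per-item average over a batch of n samples, so avg becomes the running per-item average over everything seen so far. The numbers below are made up:

meter = AverageMeter()
meter.update(0.8, n=32)  # batch 1: mean loss 0.8 over 32 samples
meter.update(0.6, n=32)  # batch 2: mean loss 0.6 over 32 samples
print(meter.val, meter.avg)  # 0.6 0.7 (up to float rounding)
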
# -- IO utils
def read_txt_lines(filepath):
    assert os.path.isfile(filepath), "Error when trying to read txt file, path does not exist: {}".format(filepath)
    with open(filepath) as myfile:
        content = myfile.read().splitlines()
    return content


def save_as_json(d, filepath):
    with open(filepath, 'w') as outfile:
        json.dump(d, outfile, indent=4, sort_keys=True)


def load_json(json_fp):
    assert os.path.isfile(json_fp), "Error loading JSON. File provided does not exist, cannot read: {}".format(json_fp)
    with open(json_fp, 'r') as f:
        json_content = json.load(f)
    return json_content


def save2npz(filename, data=None):
    assert data is not None, "data is {}".format(data)
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    np.savez_compressed(filename, data=data)


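A quick round-trip of the npz helper; the path is a placeholder. save2npz stores the array under the key 'data', which is the key a consumer must use to read it back:

import numpy as np

save2npz('./demo_out/sample.npz', data=np.zeros((29, 88, 88), dtype=np.float32))
loaded = np.load('./demo_out/sample.npz')['data']
print(loaded.shape)  # (29, 88, 88)
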
# -- checkpoints
class CheckpointSaver:
    def __init__(self, save_dir, checkpoint_fn='ckpt.pth.tar', best_fn='ckpt.best.pth.tar', best_step_fn='ckpt.best.step{}.pth.tar', save_best_step=False, lr_steps=[]):
        """
        Only save_dir is mandatory.
        The naming of checkpoint files can be configured through checkpoint_fn, best_fn and best_step_fn.
        To keep the best-performing checkpoint per LR step, set save_best_step=True and pass the lr_steps schedule.
        """

        self.save_dir = save_dir

        # checkpoint names
        self.checkpoint_fn = checkpoint_fn
        self.best_fn = best_fn
        self.best_step_fn = best_step_fn

        # save best per step?
        self.save_best_step = save_best_step
        self.lr_steps = lr_steps  # was `[]`, which silently discarded the schedule

        # init var to keep track of best performing checkpoint
        self.current_best = 0

        # save best at each step?
        if self.save_best_step:
            assert lr_steps != [], "Since save_best_step=True, need proper value for lr_steps. Current: {}".format(lr_steps)
            self.best_for_stage = [0]*(len(lr_steps)+1)

    def save(self, save_dict, current_perf, epoch=-1):
        """
        Save checkpoint and keep a copy if current perf is best overall or [optional] best for the current LR step
        """

        # save last checkpoint
        checkpoint_fp = os.path.join(self.save_dir, self.checkpoint_fn)

        # keep track of best model
        self.is_best = current_perf > self.current_best
        if self.is_best:
            self.current_best = current_perf
            best_fp = os.path.join(self.save_dir, self.best_fn)
        save_dict['best_prec'] = self.current_best

        # keep track of best-performing model per step [optional]
        if self.save_best_step:
            assert epoch >= 0, "Since save_best_step=True, need proper value for 'epoch'. Current: {}".format(epoch)
            s_idx = sum(epoch >= l for l in self.lr_steps)  # was the bare name `lr_steps`, a NameError at call time
            self.is_best_for_stage = current_perf > self.best_for_stage[s_idx]

            if self.is_best_for_stage:
                self.best_for_stage[s_idx] = current_perf
                best_stage_fp = os.path.join(self.save_dir, self.best_step_fn.format(s_idx))  # was `self.best_stage_fn`, an attribute that does not exist
            save_dict['best_prec_per_stage'] = self.best_for_stage

        # save
        torch.save(save_dict, checkpoint_fp)
        print("Checkpoint saved at {}".format(checkpoint_fp))
        if self.is_best:
            shutil.copyfile(checkpoint_fp, best_fp)
        if self.save_best_step and self.is_best_for_stage:
            shutil.copyfile(checkpoint_fp, best_stage_fp)

    def set_best_from_ckpt(self, ckpt_dict):
        self.current_best = ckpt_dict['best_prec']
        self.best_for_stage = ckpt_dict.get('best_prec_per_stage', None)


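A minimal, self-contained usage sketch with the fixes above applied; the tiny linear model, the demo directory, and the accuracy sequence are placeholders, not values from the training script:

import os
import torch

model = torch.nn.Linear(4, 2)  # stand-in for the lipreading model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
os.makedirs('./train_logs_demo', exist_ok=True)
saver = CheckpointSaver('./train_logs_demo', save_best_step=True, lr_steps=[2, 4])
for epoch, val_acc in enumerate([0.50, 0.62, 0.61, 0.70]):  # made-up accuracies
    saver.save({'epoch_idx': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()},
               current_perf=val_acc, epoch=epoch)
# ckpt.pth.tar now holds the last epoch; ckpt.best.pth.tar tracks val_acc=0.70.
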
def load_model(load_path, model, optimizer=None, allow_size_mismatch=False):
    """
    Load model from file.
    If optimizer is passed, the loaded dictionary is also expected to contain the state of the optimizer,
    and the epoch index and full checkpoint are returned alongside model and optimizer.
    If optimizer is not passed, only the model weights are loaded and the model alone is returned.
    """

    # -- load dictionary
    assert os.path.isfile(load_path), "Error when loading the model, provided path not found: {}".format(load_path)
    checkpoint = torch.load(load_path)
    loaded_state_dict = checkpoint['model_state_dict']

    if allow_size_mismatch:
        # drop any parameter whose shape differs from (or is absent in) the current model
        loaded_sizes = {k: v.shape for k, v in loaded_state_dict.items()}
        model_state_dict = model.state_dict()
        model_sizes = {k: v.shape for k, v in model_state_dict.items()}
        mismatched_params = []
        for k in loaded_sizes:
            if k not in model_sizes or loaded_sizes[k] != model_sizes[k]:
                mismatched_params.append(k)
        for k in mismatched_params:
            del loaded_state_dict[k]

    # -- copy loaded state into current model and, optionally, optimizer
    model.load_state_dict(loaded_state_dict, strict=not allow_size_mismatch)
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return model, optimizer, checkpoint['epoch_idx'], checkpoint
    return model


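Resuming from the checkpoint written in the CheckpointSaver sketch above; the path and model shape are the same assumptions, so this only runs after that sketch has:

model2 = torch.nn.Linear(4, 2)
optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.1)
model2, optimizer2, epoch_idx, ckpt = load_model(
    './train_logs_demo/ckpt.best.pth.tar', model2, optimizer=optimizer2)
print(epoch_idx, ckpt['best_prec'])  # 4 0.7
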
# -- logging utils
def get_logger(args, save_path):
    log_path = '{}/{}_{}_{}classes_log.txt'.format(save_path, args.training_mode, args.lr, args.num_classes)
    logger = logging.getLogger("mylog")
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logger.addHandler(console)
    return logger


def update_logger_batch(args, logger, dset_loader, batch_idx, running_loss, running_corrects, running_all, batch_time, data_time):
    perc_epoch = 100. * batch_idx / (len(dset_loader) - 1)
    logger.info('[{:5.0f}/{:5.0f} ({:.0f}%)]\tLoss: {:.4f}\tAcc:{:.4f}\tCost time:{:1.3f} ({:1.3f})s\tData time:{:1.3f} ({:1.3f})\tInstances per second: {:.2f}'.format(
        running_all,
        len(dset_loader.dataset),
        perc_epoch,
        running_loss / running_all,
        running_corrects / running_all,
        batch_time.val, batch_time.avg,
        data_time.val, data_time.avg,
        args.batch_size / batch_time.avg))


def get_save_folder(args):
    # create save and log folder
    save_path = '{}/{}'.format(args.logging_dir, args.training_mode)
    save_path += '/' + datetime.datetime.now().isoformat().split('.')[0]
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    return save_path
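Finally, a sketch wiring the logging helpers together; the argparse.Namespace fields are exactly the ones these functions read, with made-up values:

from argparse import Namespace

args = Namespace(logging_dir='./train_logs_demo', training_mode='tcn',
                 lr=0.0003, num_classes=500, batch_size=32)
save_path = get_save_folder(args)  # e.g. ./train_logs_demo/tcn/2024-01-01T12:00:00
logger = get_logger(args, save_path)
logger.info('epoch 0 started')  # written to both the console and the log file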