Upload 11 files
- lipreading/dataloaders.py +64 -0
- lipreading/dataset.py +176 -0
- lipreading/mixup.py +28 -0
- lipreading/model.py +153 -0
- lipreading/models/resnet.py +135 -0
- lipreading/models/resnet1D.py +143 -0
- lipreading/models/shufflenetv2.py +178 -0
- lipreading/models/tcn.py +255 -0
- lipreading/optim_utils.py +31 -0
- lipreading/preprocess.py +188 -0
- lipreading/utils.py +203 -0
lipreading/dataloaders.py
ADDED
@@ -0,0 +1,64 @@
import torch
import numpy as np
from lipreading.preprocess import *
from lipreading.dataset import MyDataset, pad_packed_collate


def get_preprocessing_pipelines(modality='video'):
    # -- preprocess for the video stream
    preprocessing = {}
    # -- LRW config
    if modality == 'video':
        crop_size = (88, 88)
        (mean, std) = (0.421, 0.165)
        # train:
        preprocessing['train'] = Compose([  # Compose() chains several preprocess steps; defined in preprocess.py
                                    Normalize(0.0, 255.0),
                                    RandomCrop(crop_size),
                                    HorizontalFlip(0.5),
                                    Normalize(mean, std)])

        preprocessing['val'] = Compose([
                                    Normalize(0.0, 255.0),
                                    CenterCrop(crop_size),
                                    Normalize(mean, std)])

        preprocessing['test'] = preprocessing['val']  # test and val share the same pipeline

    elif modality == 'raw_audio':

        preprocessing['train'] = Compose([
                                    AddNoise(noise=np.load('./data/babbleNoise_resample_16K.npy')),  # noise is added only for training
                                    NormalizeUtterance()])

        preprocessing['val'] = NormalizeUtterance()  # z-score normalization
        preprocessing['test'] = NormalizeUtterance()

    return preprocessing


def get_data_loaders(args):
    preprocessing = get_preprocessing_pipelines(args.modality)

    # create dataset object for each partition
    dsets = {partition: MyDataset(
                modality=args.modality,
                data_partition=partition,
                data_dir=args.data_dir,
                label_fp=args.label_path,
                annonation_direc=args.annonation_direc,
                preprocessing_func=preprocessing[partition],
                data_suffix='.npz'
                ) for partition in ['train', 'val', 'test']}

    dset_loaders = {x: torch.utils.data.DataLoader(
                        dsets[x],
                        batch_size=args.batch_size,
                        shuffle=True,
                        collate_fn=pad_packed_collate,
                        pin_memory=True,
                        num_workers=args.workers,
                        # note: this seeds numpy once at loader construction; the
                        # return value (None) is what is passed as worker_init_fn
                        worker_init_fn=np.random.seed(1)) for x in ['train', 'val', 'test']}

    return dset_loaders
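A minimal usage sketch, not part of the commit: the SimpleNamespace stand-in and the paths are hypothetical, but the field names mirror those read by get_data_loaders(), and it assumes an LRW-style directory layout exists under data_dir.

from types import SimpleNamespace
from lipreading.dataloaders import get_data_loaders

# Hypothetical argument object; field names match those accessed above.
args = SimpleNamespace(modality='video', data_dir='./datasets/visual_data',
                       label_path='./labels/500WordsSortedList.txt',
                       annonation_direc='./landmarks', batch_size=32, workers=8)
loaders = get_data_loaders(args)
for data, lengths, labels in loaders['train']:
    print(data.shape, labels.shape)  # e.g. (32, T_max, 88, 88) and (32,)
    break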
lipreading/dataset.py
ADDED
@@ -0,0 +1,176 @@
import os
import glob
import torch
import random
import librosa
import numpy as np
import sys
from lipreading.utils import read_txt_lines


# MyDataset as used in dataloaders.py:
# dsets = {partition: MyDataset(
#             modality=args.modality,
#             data_partition=partition,
#             data_dir=args.data_dir,
#             label_fp=args.label_path,
#             annonation_direc=args.annonation_direc,
#             preprocessing_func=preprocessing[partition],
#             data_suffix='.npz'
#             ) for partition in ['train', 'val', 'test']}


class MyDataset(object):

    def __init__(self, modality, data_partition, data_dir, label_fp, annonation_direc=None,
                 preprocessing_func=None, data_suffix='.npz'):
        assert os.path.isfile(label_fp), "File path provided for the labels does not exist. Path input: {}".format(label_fp)
        self._data_partition = data_partition
        self._data_dir = data_dir
        self._data_suffix = data_suffix

        self._label_fp = label_fp
        self._annonation_direc = annonation_direc

        self.fps = 25 if modality == "video" else 16000
        self.is_var_length = True
        self.label_idx = -3

        self.preprocessing_func = preprocessing_func

        self._data_files = []

        self.load_dataset()

    def load_dataset(self):

        # -- read the labels file
        self._labels = read_txt_lines(self._label_fp)

        # -- add examples to self._data_files
        self._get_files_for_partition()

        # -- from self._data_files to self.list
        self.list = dict()
        self.instance_ids = dict()

        for i, x in enumerate(self._data_files):
            label = self._get_label_from_path(x)
            self.list[i] = [x, self._labels.index(label)]
            self.instance_ids[i] = self._get_instance_id_from_path(x)

        print('Partition {} loaded'.format(self._data_partition))

    def _get_instance_id_from_path(self, x):
        # for now this works for npz/npys, might break for image folders
        instance_id = x.split('/')[-1]
        return os.path.splitext(instance_id)[0]

    def _get_label_from_path(self, x):
        return x.split('/')[self.label_idx]

    def _get_files_for_partition(self):  ##### check here!!
        # get rgb/mfcc file paths

        dir_fp = self._data_dir
        if not dir_fp:
            return

        # get npy/npz/mp4 files
        search_str_npz = os.path.join(dir_fp, '*', self._data_partition, '*.npz')  # npz: stores several arrays in one file
        search_str_npy = os.path.join(dir_fp, '*', self._data_partition, '*.npy')  # npy: stores a single numpy array
        search_str_mp4 = os.path.join(dir_fp, '*', self._data_partition, '*.mp4')
        self._data_files.extend(glob.glob(search_str_npz))  # list.extend(): append the npz file names to _data_files
        self._data_files.extend(glob.glob(search_str_npy))  # list.extend(): append the npy file names to _data_files
        self._data_files.extend(glob.glob(search_str_mp4))  # list.extend(): append the mp4 file names to _data_files

        # If we are not using the full set of labels, remove examples for labels not used
        self._data_files = [f for f in self._data_files if f.split('/')[self.label_idx] in self._labels]

    def load_data(self, filename):

        try:
            if filename.endswith('npz'):  # endswith(s): returns True/False whether the name ends with s
                # return np.load(filename, allow_pickle=True)['data']
                return np.load(filename)['data']
            elif filename.endswith('mp4'):
                return librosa.load(filename, sr=16000)[0][-19456:]
                # librosa.load() reads the waveform and normalizes it to [-1, 1].
                # sr is the sampling rate: for audio we speak of samples (in Hz)
                # rather than frames, and a higher sr means higher fidelity.
                # https://wiserloner.tistory.com/1194
                # 16,000 Hz is wideband speech, above the 8,000 Hz telephone narrowband (e.g. VoIP).
            else:
                return np.load(filename)
        except IOError:
            print("Error when reading file: {}".format(filename))
            sys.exit()

    def _apply_variable_length_aug(self, filename, raw_data):
        # read info txt file (to see duration of word, to be used to do temporal cropping)
        info_txt = os.path.join(self._annonation_direc, *filename.split('/')[self.label_idx:])  # swap base folder
        info_txt = os.path.splitext(info_txt)[0] + '.txt'  # swap extension
        info = read_txt_lines(info_txt)

        utterance_duration = float(info[4].split(' ')[1])
        half_interval = int(utterance_duration / 2.0 * self.fps)  # num frames of utterance / 2

        n_frames = raw_data.shape[0]
        mid_idx = (n_frames - 1) // 2  # video has n frames, mid point is (n-1)//2 as count starts with 0
        left_idx = random.randint(0, max(0, mid_idx - half_interval - 1))  # random.randint(a,b) chooses in [a,b]
        right_idx = random.randint(min(mid_idx + half_interval + 1, n_frames), n_frames)

        return raw_data[left_idx:right_idx]

    def __getitem__(self, idx):

        raw_data = self.load_data(self.list[idx][0])

        # -- perform variable length augmentation on the training set
        if (self._data_partition == 'train') and self.is_var_length:
            data = self._apply_variable_length_aug(self.list[idx][0], raw_data)
        else:
            data = raw_data

        preprocess_data = self.preprocessing_func(data)
        label = self.list[idx][1]

        return preprocess_data, label

    def __len__(self):
        return len(self._data_files)


def pad_packed_collate(batch):

    batch = np.array(batch, dtype=object)  # the samples have different lengths, so dtype=object is needed

    if len(batch) == 1:
        data, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])
        data = torch.FloatTensor(data)
        lengths = [data.size(1)]

    if len(batch) > 1:
        data_list, lengths, labels_np = zip(*[(a, a.shape[0], b) for (a, b) in sorted(batch, key=lambda x: x[0].shape[0], reverse=True)])

        data_np = 0  # initialize data_np before the branches below

        if data_list[0].ndim == 3:
            max_len, h, w = data_list[0].shape  # since it is sorted, the longest video is the first one
            data_np = np.zeros((len(data_list), max_len, h, w))
        elif data_list[0].ndim == 1:
            max_len = data_list[0].shape[0]
            data_np = np.zeros((len(data_list), max_len))
        for idx in range(len(data_np)):
            data_np[idx][:data_list[idx].shape[0]] = data_list[idx]
        data = torch.FloatTensor(data_np)

    labels = torch.LongTensor(labels_np)

    return data, lengths, labels
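A quick check of pad_packed_collate on two variable-length clips (a sketch; the shapes are illustrative): the batch is sorted longest-first and the shorter clip is zero-padded to the longest length.

import numpy as np
from lipreading.dataset import pad_packed_collate

# Two video clips of different lengths (T, H, W), with labels 3 and 7.
batch = [(np.random.rand(29, 88, 88).astype(np.float32), 3),
         (np.random.rand(21, 88, 88).astype(np.float32), 7)]
data, lengths, labels = pad_packed_collate(batch)
print(data.shape)   # torch.Size([2, 29, 88, 88])
print(lengths)      # (29, 21), sorted longest-first
print(labels)       # tensor([3, 7])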
lipreading/mixup.py
ADDED
@@ -0,0 +1,28 @@
import torch
import numpy as np


# -- mixup data augmentation
# from https://github.com/hongyi-zhang/mixup/blob/master/cifar/utils.py
def mixup_data(x, y, alpha=1.0, soft_labels=None, use_cuda=False):
    '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda'''

    if alpha > 0.:
        lam = np.random.beta(alpha, alpha)  # sample lambda from a Beta distribution
    else:
        lam = 1.

    batch_size = x.size()[0]
    if use_cuda:
        index = torch.randperm(batch_size).cuda()  # random permutation of the batch indices, placed on the GPU
    else:
        index = torch.randperm(batch_size)  # random permutation of the batch indices

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam


# build the mixup loss from the two target sets
def mixup_criterion(y_a, y_b, lam):
    return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)
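A training-step sketch showing how the two helpers combine; the inline linear layer and the random batch are hypothetical stand-ins for the real network and data.

import torch
import torch.nn as nn
from lipreading.mixup import mixup_data, mixup_criterion

criterion = nn.CrossEntropyLoss()
x, y = torch.randn(8, 512), torch.randint(0, 30, (8,))   # hypothetical batch
mixed_x, y_a, y_b, lam = mixup_data(x, y, alpha=0.4)
loss_fn = mixup_criterion(y_a, y_b, lam)
logits = nn.Linear(512, 30)(mixed_x)                     # stand-in for the model's forward pass
loss = loss_fn(criterion, logits)                        # lam-weighted mix of the two losses
loss.backward()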
lipreading/model.py
ADDED
@@ -0,0 +1,153 @@
import torch
import torch.nn as nn
import math
import numpy as np
from lipreading.models.resnet import ResNet, BasicBlock
from lipreading.models.resnet1D import ResNet1D, BasicBlock1D
from lipreading.models.shufflenetv2 import ShuffleNetV2
from lipreading.models.tcn import MultibranchTemporalConvNet, TemporalConvNet


# -- auxiliary functions
def threeD_to_2D_tensor(x):
    n_batch, n_channels, s_time, sx, sy = x.shape
    x = x.transpose(1, 2)
    return x.reshape(n_batch*s_time, n_channels, sx, sy)


def _average_batch(x, lengths, B):
    return torch.stack([torch.mean(x[index][:, 0:i], 1) for index, i in enumerate(lengths)], 0)


class MultiscaleMultibranchTCN(nn.Module):
    def __init__(self, input_size, num_channels, num_classes, tcn_options, dropout, relu_type, dwpw=False):
        super(MultiscaleMultibranchTCN, self).__init__()

        self.kernel_sizes = tcn_options['kernel_size']
        self.num_kernels = len(self.kernel_sizes)

        self.mb_ms_tcn = MultibranchTemporalConvNet(input_size, num_channels, tcn_options, dropout=dropout, relu_type=relu_type, dwpw=dwpw)
        self.tcn_output = nn.Linear(num_channels[-1], num_classes)

        self.consensus_func = _average_batch

    def forward(self, x, lengths, B):
        # x needs to have dimension (N, C, L) in order to be passed into CNN
        xtrans = x.transpose(1, 2)
        out = self.mb_ms_tcn(xtrans)
        out = self.consensus_func(out, lengths, B)
        return self.tcn_output(out)


class TCN(nn.Module):
    """Implements Temporal Convolutional Network (TCN)
    __https://arxiv.org/pdf/1803.01271.pdf
    """

    def __init__(self, input_size, num_channels, num_classes, tcn_options, dropout, relu_type, dwpw=False):
        super(TCN, self).__init__()
        self.tcn_trunk = TemporalConvNet(input_size, num_channels, dropout=dropout, tcn_options=tcn_options, relu_type=relu_type, dwpw=dwpw)
        self.tcn_output = nn.Linear(num_channels[-1], num_classes)

        self.consensus_func = _average_batch

        self.has_aux_losses = False

    def forward(self, x, lengths, B):
        # x needs to have dimension (N, C, L) in order to be passed into CNN
        x = self.tcn_trunk(x.transpose(1, 2))
        x = self.consensus_func(x, lengths, B)
        return self.tcn_output(x)


class Lipreading(nn.Module):
    def __init__(self, modality='video', hidden_dim=256, backbone_type='resnet', num_classes=30,
                 relu_type='prelu', tcn_options={}, width_mult=1.0, extract_feats=False):
        super(Lipreading, self).__init__()
        self.extract_feats = extract_feats
        self.backbone_type = backbone_type
        self.modality = modality

        if self.modality == 'raw_audio':
            self.frontend_nout = 1
            self.backend_out = 512
            self.trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type=relu_type)
        elif self.modality == 'video':
            if self.backbone_type == 'resnet':
                self.frontend_nout = 64
                self.backend_out = 512
                self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=relu_type)
            elif self.backbone_type == 'shufflenet':
                assert width_mult in [0.5, 1.0, 1.5, 2.0], "Width multiplier not correct"
                shufflenet = ShuffleNetV2(input_size=96, width_mult=width_mult)
                self.trunk = nn.Sequential(shufflenet.features, shufflenet.conv_last, shufflenet.globalpool)
                self.frontend_nout = 24
                self.backend_out = 1024 if width_mult != 2.0 else 2048
                self.stage_out_channels = shufflenet.stage_out_channels[-1]

            frontend_relu = nn.PReLU(num_parameters=self.frontend_nout) if relu_type == 'prelu' else nn.ReLU()
            self.frontend3D = nn.Sequential(
                        nn.Conv3d(1, self.frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
                        nn.BatchNorm3d(self.frontend_nout),
                        frontend_relu,
                        nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)))
        else:
            raise NotImplementedError

        tcn_class = TCN if len(tcn_options['kernel_size']) == 1 else MultiscaleMultibranchTCN
        self.tcn = tcn_class(input_size=self.backend_out,
                             num_channels=[hidden_dim*len(tcn_options['kernel_size'])*tcn_options['width_mult']]*tcn_options['num_layers'],
                             num_classes=num_classes,
                             tcn_options=tcn_options,
                             dropout=tcn_options['dropout'],
                             relu_type=relu_type,
                             dwpw=tcn_options['dwpw'],
                             )
        # -- initialize
        self._initialize_weights_randomly()

    def forward(self, x, lengths):
        if self.modality == 'video':
            B, C, T, H, W = x.size()
            x = self.frontend3D(x)
            Tnew = x.shape[2]  # output should be B x C2 x Tnew x H x W
            x = threeD_to_2D_tensor(x)
            x = self.trunk(x)
            if self.backbone_type == 'shufflenet':
                x = x.view(-1, self.stage_out_channels)
            x = x.view(B, Tnew, x.size(1))
        elif self.modality == 'raw_audio':
            B, C, T = x.size()
            x = self.trunk(x)
            x = x.transpose(1, 2)
            lengths = [_//640 for _ in lengths]

        return x if self.extract_feats else self.tcn(x, lengths, B)

    def _initialize_weights_randomly(self):

        use_sqrt = True

        if use_sqrt:
            def f(n):
                return math.sqrt(2.0/float(n))
        else:
            def f(n):
                return 2.0/float(n)

        for m in self.modules():
            if isinstance(m, nn.Conv3d) or isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d):
                n = np.prod(m.kernel_size) * m.out_channels
                m.weight.data.normal_(0, f(n))
                if m.bias is not None:
                    m.bias.data.zero_()

            elif isinstance(m, nn.BatchNorm3d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

            elif isinstance(m, nn.Linear):
                n = float(m.weight.data[0].nelement())
                m.weight.data = m.weight.data.normal_(0, f(n))
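An instantiation sketch; the tcn_options values below are illustrative assumptions, not the repository's defaults. Three kernel sizes select the multibranch TCN path.

import torch
from lipreading.model import Lipreading

tcn_options = {'kernel_size': [3, 5, 7], 'num_layers': 4, 'dropout': 0.2,
               'dwpw': False, 'width_mult': 1}
net = Lipreading(modality='video', hidden_dim=256, backbone_type='resnet',
                 num_classes=500, relu_type='prelu', tcn_options=tcn_options)
clip = torch.randn(2, 1, 29, 88, 88)       # (B, C, T, H, W) grayscale mouth crops
logits = net(clip, lengths=[29, 29])
print(logits.shape)                        # torch.Size([2, 500])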
lipreading/models/resnet.py
ADDED
@@ -0,0 +1,135 @@
import math
import torch.nn as nn
import pdb  # Python debugger


# Conv2D (3,3)
def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


# Conv2D (1,1) + BatchNorm2D
def downsample_basic_block(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outplanes),
            )

# AvgPool2D + Conv2D (1,1) + BatchNorm2D
def downsample_basic_block_v2(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False),
                nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm2d(outplanes),
            )


# basic 2D block
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type='relu'):
        super(BasicBlock, self).__init__()

        # guard the relu_type option: anything other than 'relu'/'prelu' raises an AssertionError
        assert relu_type in ['relu', 'prelu']

        self.conv1 = conv3x3(inplanes, planes, stride)  # Conv2D (3,3)
        self.bn1 = nn.BatchNorm2d(planes)               # BatchNorm2D

        # type of ReLU is an input option
        if relu_type == 'relu':
            self.relu1 = nn.ReLU(inplace=True)
            self.relu2 = nn.ReLU(inplace=True)
        elif relu_type == 'prelu':
            self.relu1 = nn.PReLU(num_parameters=planes)
            self.relu2 = nn.PReLU(num_parameters=planes)
        else:
            raise Exception('relu type not implemented')
        # --------

        self.conv2 = conv3x3(planes, planes)  # Conv2D (3,3)
        self.bn2 = nn.BatchNorm2d(planes)     # BatchNorm2D

        self.downsample = downsample
        self.stride = stride

    # forward propagation over a training batch
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu2(out)

        return out


# 2D ResNet
class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, relu_type='relu', gamma_zero=False, avg_pool_downsample=False):
        self.inplanes = 64
        self.relu_type = relu_type
        self.gamma_zero = gamma_zero
        self.downsample_block = downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block  # v2 adds AvgPool2D before the 1x1 conv

        super(ResNet, self).__init__()
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d(1)

        # default init
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
                #nn.init.ones_(m.weight)
                #nn.init.zeros_(m.bias)

        if self.gamma_zero:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    m.bn2.weight.data.zero_()

    # build one stage of blocks
    def _make_layer(self, block, planes, blocks, stride=1):

        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = self.downsample_block(inplanes=self.inplanes,
                                               outplanes=planes * block.expansion,
                                               stride=stride)  # (AvgPool2D) + Conv2D (1,1) + BatchNorm2D

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, relu_type=self.relu_type))

        return nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return x
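A shape sanity check for the trunk (a sketch): this ResNet has no stem, so its input must already have 64 channels; with 88x88 mouth crops, the 3D frontend in model.py emits 64 channels at 22x22 per frame.

import torch
from lipreading.models.resnet import ResNet, BasicBlock

trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type='prelu')
feats = trunk(torch.randn(4, 64, 22, 22))   # (B*T, 64, 22, 22) as produced by frontend3D
print(feats.shape)                          # torch.Size([4, 512])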
lipreading/models/resnet1D.py
ADDED
@@ -0,0 +1,143 @@
import math
import torch.nn as nn
import pdb  # Python debugger


# Conv1D, kernel size 3
def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv1d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)


# Conv1D (1) + BatchNorm1D
def downsample_basic_block(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.Conv1d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(outplanes),
            )

# AvgPool1D + Conv1D (1) + BatchNorm1D
def downsample_basic_block_v2(inplanes, outplanes, stride):
    return nn.Sequential(
                nn.AvgPool1d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False),
                nn.Conv1d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
                nn.BatchNorm1d(outplanes),
            )


# basic 1D block
class BasicBlock1D(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type='relu'):
        super(BasicBlock1D, self).__init__()

        # guard the relu_type option: anything other than 'relu'/'prelu' raises an AssertionError
        assert relu_type in ['relu', 'prelu']

        self.conv1 = conv3x3(inplanes, planes, stride)  # Conv1D, kernel size 3
        self.bn1 = nn.BatchNorm1d(planes)               # BatchNorm1D

        # type of ReLU is an input option
        if relu_type == 'relu':
            self.relu1 = nn.ReLU(inplace=True)
            self.relu2 = nn.ReLU(inplace=True)
        elif relu_type == 'prelu':
            self.relu1 = nn.PReLU(num_parameters=planes)
            self.relu2 = nn.PReLU(num_parameters=planes)
        else:
            raise Exception('relu type not implemented')
        # --------

        self.conv2 = conv3x3(planes, planes)  # Conv1D, kernel size 3
        self.bn2 = nn.BatchNorm1d(planes)     # BatchNorm1D

        self.downsample = downsample
        self.stride = stride

    # forward propagation over a training batch
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu2(out)

        return out


# 1D ResNet
class ResNet1D(nn.Module):

    def __init__(self, block, layers, relu_type='relu'):
        super(ResNet1D, self).__init__()
        self.inplanes = 64
        self.relu_type = relu_type
        self.downsample_block = downsample_basic_block

        self.conv1 = nn.Conv1d(1, self.inplanes, kernel_size=80, stride=4, padding=38,
                               bias=False)  # Conv1D stem over the raw waveform
        self.bn1 = nn.BatchNorm1d(self.inplanes)
        # type of ReLU is an input option
        if relu_type == 'relu':
            self.relu = nn.ReLU(inplace=True)
        elif relu_type == 'prelu':
            self.relu = nn.PReLU(num_parameters=self.inplanes)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        # For LRW, we downsample the sampling rate to 25fps
        self.avgpool = nn.AvgPool1d(kernel_size=21, padding=1)
        '''
        # The following pooling setting is the general configuration
        self.avgpool = nn.AvgPool1d(kernel_size=20, stride=20)
        '''

        # default init
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                n = m.kernel_size[0] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    # build one stage of blocks
    def _make_layer(self, block, planes, blocks, stride=1):

        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = self.downsample_block(inplanes=self.inplanes,
                                               outplanes=planes * block.expansion,
                                               stride=stride)  # Conv1D (1) + BatchNorm1D

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, relu_type=self.relu_type))

        return nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        return x
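A temporal-stride sketch (an observation, not part of the commit): the stride-4 stem and three stride-2 stages give an overall stride of 32 before pooling, and the final AvgPool1d brings the 16 kHz waveform down to roughly 25 feature frames per second, which is what the lengths = [_ // 640 for _ in lengths] rescaling in model.py assumes (16000 / 640 = 25).

import torch
from lipreading.models.resnet1D import ResNet1D, BasicBlock1D

trunk = ResNet1D(BasicBlock1D, [2, 2, 2, 2], relu_type='prelu')
wav = torch.randn(1, 1, 19456)   # ~1.2 s of 16 kHz audio, as loaded in dataset.py
out = trunk(wav)
print(out.shape)                 # torch.Size([1, 512, 29]) -- about 25 frames per second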
lipreading/models/shufflenetv2.py
ADDED
@@ -0,0 +1,178 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from collections import OrderedDict
from torch.nn import init
import math

import pdb  # Python debugger


# Conv2D (3,3) + BatchNorm2D + ReLU
def conv_bn(inp, oup, stride):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True)
    )


# Conv2D (1,1) + BatchNorm2D + ReLU
def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU(inplace=True)
    )


# reshape -> transpose -> flatten back
def channel_shuffle(x, groups):
    batchsize, num_channels, height, width = x.data.size()

    channels_per_group = num_channels // groups

    # reshape: split the channel dimension into (groups, channels_per_group)
    x = x.view(batchsize, groups,
               channels_per_group, height, width)

    x = torch.transpose(x, 1, 2).contiguous()  # transpose(): swap the two channel dims; contiguous(): copy to new memory

    # flatten back to (batchsize, num_channels, height, width)
    x = x.view(batchsize, -1, height, width)

    return x


# Inverted residual block - related model: MobileNetV2
class InvertedResidual(nn.Module):
    def __init__(self, inp, oup, stride, benchmodel):
        super(InvertedResidual, self).__init__()
        self.benchmodel = benchmodel
        self.stride = stride

        # guard the stride option: only 1 and 2 are supported
        assert stride in [1, 2]

        oup_inc = oup//2

        if self.benchmodel == 1:
            #assert inp == oup_inc
            self.banch2 = nn.Sequential(
                # pw
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
                nn.BatchNorm2d(oup_inc),
                # pw-linear
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )
        else:
            self.banch1 = nn.Sequential(
                # dw
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                # pw-linear
                nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )

            self.banch2 = nn.Sequential(
                # pw
                nn.Conv2d(inp, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
                # dw
                nn.Conv2d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False),
                nn.BatchNorm2d(oup_inc),
                # pw-linear
                nn.Conv2d(oup_inc, oup_inc, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup_inc),
                nn.ReLU(inplace=True),
            )

    @staticmethod
    def _concat(x, out):
        # concatenate along channel axis
        return torch.cat((x, out), 1)

    # forward propagation over a training batch
    def forward(self, x):
        if 1 == self.benchmodel:
            x1 = x[:, :(x.shape[1]//2), :, :]
            x2 = x[:, (x.shape[1]//2):, :, :]
            out = self._concat(x1, self.banch2(x2))
        elif 2 == self.benchmodel:
            out = self._concat(self.banch1(x), self.banch2(x))

        return channel_shuffle(out, 2)


# ShuffleNet V2
class ShuffleNetV2(nn.Module):
    def __init__(self, n_class=1000, input_size=224, width_mult=2.):
        super(ShuffleNetV2, self).__init__()

        # the input size must be divisible by 32, otherwise an AssertionError is raised
        assert input_size % 32 == 0, "Input size needs to be divisible by 32"

        self.stage_repeats = [4, 8, 4]
        # index 0 is invalid and should never be called.
        # only used for indexing convenience.
        if width_mult == 0.5:
            self.stage_out_channels = [-1, 24, 48, 96, 192, 1024]
        elif width_mult == 1.0:
            self.stage_out_channels = [-1, 24, 116, 232, 464, 1024]
        elif width_mult == 1.5:
            self.stage_out_channels = [-1, 24, 176, 352, 704, 1024]
        elif width_mult == 2.0:
            self.stage_out_channels = [-1, 24, 244, 488, 976, 2048]
        else:
            raise ValueError(
                """Width multiplier should be in [0.5, 1.0, 1.5, 2.0]. Current value: {}""".format(width_mult))

        # building first layer
        input_channel = self.stage_out_channels[1]
        self.conv1 = conv_bn(3, input_channel, 2)  # Conv2D (3,3) + BatchNorm2D + ReLU
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.features = []
        # building inverted residual blocks
        for idxstage in range(len(self.stage_repeats)):
            numrepeat = self.stage_repeats[idxstage]
            output_channel = self.stage_out_channels[idxstage+2]
            for i in range(numrepeat):
                if i == 0:
                    # inp, oup, stride, benchmodel
                    self.features.append(InvertedResidual(input_channel, output_channel, 2, 2))
                else:
                    self.features.append(InvertedResidual(input_channel, output_channel, 1, 1))
                input_channel = output_channel

        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building last several layers
        self.conv_last = conv_1x1_bn(input_channel, self.stage_out_channels[-1])  # Conv2D (1,1) + BatchNorm2D + ReLU
        self.globalpool = nn.Sequential(nn.AvgPool2d(int(input_size/32)))

        # building classifier (a single linear layer)
        self.classifier = nn.Sequential(nn.Linear(self.stage_out_channels[-1], n_class))

    # forward propagation over a training batch
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.features(x)
        x = self.conv_last(x)
        x = self.globalpool(x)
        x = x.view(-1, self.stage_out_channels[-1])
        x = self.classifier(x)
        return x
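A small check of channel_shuffle (a sketch): with groups=2, channels [0, 1, 2, 3] are interleaved to [0, 2, 1, 3], so information flows between the two branches after concatenation.

import torch
from lipreading.models.shufflenetv2 import channel_shuffle

x = torch.arange(4).float().view(1, 4, 1, 1)          # channels 0..3
print(channel_shuffle(x, 2).flatten().tolist())       # [0.0, 2.0, 1.0, 3.0]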
lipreading/models/tcn.py
ADDED
@@ -0,0 +1,255 @@
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
import pdb


"""Implements Temporal Convolutional Network (TCN)

__https://arxiv.org/pdf/1803.01271.pdf
"""

# trims the extra padded timesteps so the dilated convolution behaves causally
class Chomp1d(nn.Module):
    def __init__(self, chomp_size, symm_chomp):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size
        self.symm_chomp = symm_chomp
        if self.symm_chomp:
            assert self.chomp_size % 2 == 0, "If symmetric chomp, chomp size needs to be even"

    # forward propagation over a training batch
    def forward(self, x):
        if self.chomp_size == 0:
            return x
        if self.symm_chomp:
            return x[:, :, self.chomp_size//2:-self.chomp_size//2].contiguous()
        else:
            return x[:, :, :-self.chomp_size].contiguous()


# Conv1D + BatchNorm1D + Chomp1d + ReLU
class ConvBatchChompRelu(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, relu_type, dwpw=False):
        super(ConvBatchChompRelu, self).__init__()
        self.dwpw = dwpw
        if dwpw:
            self.conv = nn.Sequential(
                # -- dw
                nn.Conv1d(n_inputs, n_inputs, kernel_size, stride=stride,
                          padding=padding, dilation=dilation, groups=n_inputs, bias=False),
                nn.BatchNorm1d(n_inputs),
                Chomp1d(padding, True),
                nn.PReLU(num_parameters=n_inputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                # -- pw
                nn.Conv1d(n_inputs, n_outputs, 1, 1, 0, bias=False),
                nn.BatchNorm1d(n_outputs),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True)
            )
        else:
            self.conv = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                  stride=stride, padding=padding, dilation=dilation)
            self.batchnorm = nn.BatchNorm1d(n_outputs)
            self.chomp = Chomp1d(padding, True)
            self.non_lin = nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU()

    # forward propagation over a training batch
    def forward(self, x):
        if self.dwpw:
            return self.conv(x)
        else:
            out = self.conv(x)
            out = self.batchnorm(out)
            out = self.chomp(out)
            return self.non_lin(out)


# --------- MULTI-BRANCH VERSION ---------------
class MultibranchTemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_sizes, stride, dilation, padding, dropout=0.2,
                 relu_type='relu', dwpw=False):
        super(MultibranchTemporalBlock, self).__init__()

        self.kernel_sizes = kernel_sizes
        self.num_kernels = len(kernel_sizes)
        self.n_outputs_branch = n_outputs // self.num_kernels
        assert n_outputs % self.num_kernels == 0, "Number of output channels needs to be divisible by number of kernels"

        for k_idx, k in enumerate(self.kernel_sizes):
            cbcr = ConvBatchChompRelu(n_inputs, self.n_outputs_branch, k, stride, dilation, padding[k_idx], relu_type, dwpw=dwpw)
            setattr(self, 'cbcr0_{}'.format(k_idx), cbcr)  # register each branch as a module attribute
        self.dropout0 = nn.Dropout(dropout)

        for k_idx, k in enumerate(self.kernel_sizes):
            cbcr = ConvBatchChompRelu(n_outputs, self.n_outputs_branch, k, stride, dilation, padding[k_idx], relu_type, dwpw=dwpw)
            setattr(self, 'cbcr1_{}'.format(k_idx), cbcr)  # register each branch as a module attribute
        self.dropout1 = nn.Dropout(dropout)

        # downsample?
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if (n_inputs//self.num_kernels) != n_outputs else None

        # final relu
        if relu_type == 'relu':
            self.relu_final = nn.ReLU()
        elif relu_type == 'prelu':
            self.relu_final = nn.PReLU(num_parameters=n_outputs)

    # forward propagation over a training batch
    def forward(self, x):

        # first multi-branch set of convolutions
        outputs = []
        for k_idx in range(self.num_kernels):
            branch_convs = getattr(self, 'cbcr0_{}'.format(k_idx))
            outputs.append(branch_convs(x))
        out0 = torch.cat(outputs, 1)
        out0 = self.dropout0(out0)

        # second multi-branch set of convolutions
        outputs = []
        for k_idx in range(self.num_kernels):
            branch_convs = getattr(self, 'cbcr1_{}'.format(k_idx))
            outputs.append(branch_convs(out0))
        out1 = torch.cat(outputs, 1)
        out1 = self.dropout1(out1)

        # downsample?
        res = x if self.downsample is None else self.downsample(x)

        return self.relu_final(out1 + res)


class MultibranchTemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, tcn_options, dropout=0.2, relu_type='relu', dwpw=False):
        super(MultibranchTemporalConvNet, self).__init__()

        self.ksizes = tcn_options['kernel_size']

        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]

            padding = [(s-1)*dilation_size for s in self.ksizes]
            layers.append(MultibranchTemporalBlock(in_channels, out_channels, self.ksizes,
                          stride=1, dilation=dilation_size, padding=padding, dropout=dropout, relu_type=relu_type,
                          dwpw=dwpw))

        self.network = nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        return self.network(x)
# --------------------------------


# --------------- STANDARD VERSION (SINGLE BRANCH) ------------------------
class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2,
                 symm_chomp=False, no_padding=False, relu_type='relu', dwpw=False):
        super(TemporalBlock, self).__init__()

        self.no_padding = no_padding
        if self.no_padding:
            downsample_chomp_size = 2*padding-4
            padding = 1  # hack-ish thing so that we can use 3 layers

        if dwpw:
            self.net = nn.Sequential(
                # -- first conv set within block
                # -- dw
                nn.Conv1d(n_inputs, n_inputs, kernel_size, stride=stride,
                          padding=padding, dilation=dilation, groups=n_inputs, bias=False),
                nn.BatchNorm1d(n_inputs),
                Chomp1d(padding, True),
                nn.PReLU(num_parameters=n_inputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                # -- pw
                nn.Conv1d(n_inputs, n_outputs, 1, 1, 0, bias=False),
                nn.BatchNorm1d(n_outputs),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                nn.Dropout(dropout),
                # -- second conv set within block
                # -- dw
                nn.Conv1d(n_outputs, n_outputs, kernel_size, stride=stride,
                          padding=padding, dilation=dilation, groups=n_outputs, bias=False),
                nn.BatchNorm1d(n_outputs),
                Chomp1d(padding, True),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                # -- pw
                nn.Conv1d(n_outputs, n_outputs, 1, 1, 0, bias=False),
                nn.BatchNorm1d(n_outputs),
                nn.PReLU(num_parameters=n_outputs) if relu_type == 'prelu' else nn.ReLU(inplace=True),
                nn.Dropout(dropout),
            )
        else:
            self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                   stride=stride, padding=padding, dilation=dilation)
            self.batchnorm1 = nn.BatchNorm1d(n_outputs)
            self.chomp1 = Chomp1d(padding, symm_chomp) if not self.no_padding else None
            if relu_type == 'relu':
                self.relu1 = nn.ReLU()
            elif relu_type == 'prelu':
                self.relu1 = nn.PReLU(num_parameters=n_outputs)
            self.dropout1 = nn.Dropout(dropout)

            self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                   stride=stride, padding=padding, dilation=dilation)
            self.batchnorm2 = nn.BatchNorm1d(n_outputs)
            self.chomp2 = Chomp1d(padding, symm_chomp) if not self.no_padding else None
            if relu_type == 'relu':
                self.relu2 = nn.ReLU()
            elif relu_type == 'prelu':
                self.relu2 = nn.PReLU(num_parameters=n_outputs)
            self.dropout2 = nn.Dropout(dropout)

            if self.no_padding:
                self.net = nn.Sequential(self.conv1, self.batchnorm1, self.relu1, self.dropout1,
                                         self.conv2, self.batchnorm2, self.relu2, self.dropout2)
            else:
                self.net = nn.Sequential(self.conv1, self.batchnorm1, self.chomp1, self.relu1, self.dropout1,
                                         self.conv2, self.batchnorm2, self.chomp2, self.relu2, self.dropout2)

        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        if self.no_padding:
            self.downsample_chomp = Chomp1d(downsample_chomp_size, True)
        if relu_type == 'relu':
            self.relu = nn.ReLU()
        elif relu_type == 'prelu':
            self.relu = nn.PReLU(num_parameters=n_outputs)

    # forward propagation over a training batch
    def forward(self, x):
        out = self.net(x)
        if self.no_padding:
            x = self.downsample_chomp(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


# the TCN model
class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, tcn_options, dropout=0.2, relu_type='relu', dwpw=False):
        super(TemporalConvNet, self).__init__()
        self.ksize = tcn_options['kernel_size'][0] if isinstance(tcn_options['kernel_size'], list) else tcn_options['kernel_size']
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers.append(TemporalBlock(in_channels, out_channels, self.ksize, stride=1, dilation=dilation_size,
                          padding=(self.ksize-1) * dilation_size, dropout=dropout, symm_chomp=True,
                          no_padding=False, relu_type=relu_type, dwpw=dwpw))

        self.network = nn.Sequential(*layers)

    # forward propagation over a training batch
    def forward(self, x):
        return self.network(x)
# --------------------------------
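A quick shape check plus the receptive-field intuition (a sketch, with illustrative options): with kernel size 3 and dilations 1, 2, 4, 8 (two convolutions per block), each output step sees 1 + 2*(3-1)*(1+2+4+8) = 61 input timesteps; the sequence length is preserved by the padding/chomp pairing.

import torch
from lipreading.models.tcn import TemporalConvNet

opts = {'kernel_size': [3]}                      # single-branch configuration
tcn = TemporalConvNet(num_inputs=512, num_channels=[256]*4, tcn_options=opts)
y = tcn(torch.randn(2, 512, 29))                 # (N, C, L) in
print(y.shape)                                   # torch.Size([2, 256, 29])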
lipreading/optim_utils.py
ADDED
@@ -0,0 +1,31 @@
import math
import torch
import torch.optim as optim


def change_lr_on_optimizer(optimizer, lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


class CosineScheduler:
    def __init__(self, lr_ori, epochs):
        self.lr_ori = lr_ori
        self.epochs = epochs

    def adjust_lr(self, optimizer, epoch):
        reduction_ratio = 0.5 * (1 + math.cos(math.pi * epoch / self.epochs))
        change_lr_on_optimizer(optimizer, self.lr_ori*reduction_ratio)


def get_optimizer(args, optim_policies):
    # -- define optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(optim_policies, lr=args.lr, weight_decay=1e-4)
    elif args.optimizer == 'adamw':
        optimizer = optim.AdamW(optim_policies, lr=args.lr, weight_decay=1e-2)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(optim_policies, lr=args.lr, weight_decay=1e-4, momentum=0.9)
    else:
        raise NotImplementedError
    return optimizer
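A usage sketch of the scheduler (the linear layer and hyperparameters are illustrative): the learning rate is annealed from lr_ori down to 0 over `epochs` following a half cosine, with adjust_lr called once per epoch.

import torch
from lipreading.optim_utils import CosineScheduler

model = torch.nn.Linear(10, 2)                       # stand-in model
optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)
scheduler = CosineScheduler(lr_ori=0.03, epochs=80)
for epoch in range(80):
    scheduler.adjust_lr(optimizer, epoch)
    # ... run one training epoch ...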
lipreading/preprocess.py
ADDED
@@ -0,0 +1,188 @@
1 |
+
import cv2
|
2 |
+
import random
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
__all__ = ['Compose', 'Normalize', 'CenterCrop', 'RgbToGray', 'RandomCrop',
|
6 |
+
'HorizontalFlip', 'AddNoise', 'NormalizeUtterance']
|
7 |
+
|
8 |
+
|
9 |
+
class Compose(object):
|
10 |
+
"""Compose several preprocess together.
|
11 |
+
Args:
|
12 |
+
preprocess (list of ``Preprocess`` objects): list of preprocess to compose.
|
13 |
+
"""
|
14 |
+
# preprecess ([preprocess]) : dataloaders.py에서 사용됨
|
15 |
+
# preprocessing['train'] = Compose([
|
16 |
+
# Normalize( 0.0,255.0 ),
|
17 |
+
# RandomCrop(crop_size),
|
18 |
+
# HorizontalFlip(0.5),
|
19 |
+
# Normalize(mean, std) ])
|
20 |
+
|
21 |
+
def __init__(self, preprocess):
|
22 |
+
self.preprocess = preprocess
|
23 |
+
|
24 |
+
def __call__(self, sample):
|
25 |
+
for t in self.preprocess:
|
26 |
+
sample = t(sample)
|
27 |
+
return sample # preprocess에 담긴 각 augmentation 전처리가 sample에 담겨 반환된다.
|
28 |
+
|
29 |
+
def __repr__(self): # __repr__() : 괄호 안에 있는 것을 문자열로 반환
|
30 |
+
format_string = self.__class__.__name__ + '('
|
31 |
+
for t in self.preprocess:
|
32 |
+
format_string += '\n'
|
33 |
+
format_string += ' {0}'.format(t)
|
34 |
+
format_string += '\n)'
|
35 |
+
return format_string # 클래스명, 전처리명 등을 괄호 안에 출력
|
36 |
+
|
37 |
+
|
38 |
+
class RgbToGray(object):
|
39 |
+
"""Convert image to grayscale.
|
40 |
+
Converts a numpy.ndarray (H x W x C) in the range
|
41 |
+
[0, 255] to a numpy.ndarray of shape (H x W x C) in the range [0.0, 1.0].
|
42 |
+
"""
|
43 |
+
|
44 |
+
def __call__(self, frames):
|
45 |
+
"""
|
46 |
+
Args:
|
47 |
+
img (numpy.ndarray): Image to be converted to gray.
|
48 |
+
Returns:
|
49 |
+
numpy.ndarray: grey image
|
50 |
+
"""
|
51 |
+
frames = np.stack([cv2.cvtColor(_, cv2.COLOR_RGB2GRAY) for _ in frames], axis=0)
|
52 |
+
return frames
|
53 |
+
|
54 |
+
def __repr__(self):
|
55 |
+
return self.__class__.__name__ + '()'
|
56 |
+
|
57 |
+
|
58 |
+
class Normalize(object):
|
59 |
+
"""Normalize a ndarray image with mean and standard deviation.
|
60 |
+
"""
|
61 |
+
|
62 |
+
def __init__(self, mean, std):
|
63 |
+
self.mean = mean
|
64 |
+
self.std = std
|
65 |
+
|
66 |
+
def __call__(self, frames):
|
67 |
+
"""
|
68 |
+
Args:
|
69 |
+
tensor (Tensor): Tensor image of size (C, H, W) to be normalized.
|
70 |
+
Returns:
|
71 |
+
Tensor: Normalized Tensor image.
|
72 |
+
"""
|
73 |
+
frames = (frames - self.mean) / self.std # 편차를 표준 편차로 나눈 값 : z-score normalization
|
74 |
+
return frames
|
75 |
+
|
76 |
+
def __repr__(self):
|
77 |
+
return self.__class__.__name__+'(mean={0}, std={1})'.format(self.mean, self.std)
|
78 |
+
|
79 |
+
|
class CenterCrop(object):
    """Crop the given frames at the center.
    """
    def __init__(self, size):
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): frames to be cropped.
        Returns:
            numpy.ndarray: cropped frames.
        """
        t, h, w = frames.shape
        th, tw = self.size  # target crop height and width
        delta_w = int(round((w - tw) / 2.))
        delta_h = int(round((h - th) / 2.))
        frames = frames[:, delta_h:delta_h+th, delta_w:delta_w+tw]
        return frames  # center-cropped frames (np.ndarray)

class RandomCrop(object):
    """Crop the given frames at a random location.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): frames to be cropped.
        Returns:
            numpy.ndarray: cropped frames.
        """
        t, h, w = frames.shape  # e.g. 96x96 input frames
        th, tw = self.size
        delta_w = random.randint(0, w - tw)
        delta_h = random.randint(0, h - th)
        frames = frames[:, delta_h:delta_h+th, delta_w:delta_w+tw]
        return frames  # randomly cropped frames (np.ndarray)

    def __repr__(self):
        return self.__class__.__name__ + '(size={0})'.format(self.size)  # reports the configured crop size

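To make the two crop policies concrete: with the 96 -> 88 setting above, CenterCrop always uses offset (4, 4), while RandomCrop draws each offset uniformly from [0, 8]. A small sketch; the zero-filled frames are a stand-in:

import numpy as np

frames = np.zeros((29, 96, 96), dtype=np.float32)
center = CenterCrop((88, 88))(frames)  # deterministic window, rows/cols 4..91
rand = RandomCrop((88, 88))(frames)    # stochastic window, offsets in {0, ..., 8}
assert center.shape == rand.shape == (29, 88, 88)
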
class HorizontalFlip(object):  # HorizontalFlip(flip probability)
    """Flip frames horizontally.
    """

    def __init__(self, flip_ratio):
        self.flip_ratio = flip_ratio

    def __call__(self, frames):
        """
        Args:
            frames (numpy.ndarray): frames to be flipped with probability flip_ratio.
        Returns:
            numpy.ndarray: flipped frames.
        """
        t, h, w = frames.shape
        if random.random() < self.flip_ratio:
            for index in range(t):
                frames[index] = cv2.flip(frames[index], 1)  # 1 = flip around the vertical axis
        return frames

class NormalizeUtterance():
    """Normalize a raw audio utterance by removing the mean and dividing by the standard deviation
    """
    # performs z-score normalization per utterance

    def __call__(self, signal):
        # guard against division by zero for an all-constant signal
        signal_std = 1. if np.std(signal) == 0. else np.std(signal)
        signal_mean = np.mean(signal)
        return (signal - signal_mean) / signal_std

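A quick sanity check of the utterance normalization; the Gaussian waveform is synthetic:

import numpy as np

signal = np.random.randn(16000).astype(np.float32) * 3.0 + 5.0  # mean ~5, std ~3
norm = NormalizeUtterance()(signal)
print(round(float(norm.mean()), 3), round(float(norm.std()), 3))  # ~0.0, ~1.0
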
class AddNoise(object):
    """Add noise to the waveform at a randomly sampled SNR level; the signal is assumed to lie in [-1, 1].
    """
    # SNR (signal-to-noise ratio): the larger the value, the weaker the added
    # noise; the sentinel level 9999 leaves the signal clean.

    def __init__(self, noise, snr_levels=[-5, 0, 5, 10, 15, 20, 9999]):
        assert noise.dtype in [np.float32, np.float64], "noise only supports float data types"

        self.noise = noise
        self.snr_levels = snr_levels

    def get_power(self, clip):
        clip2 = clip.copy()
        clip2 = clip2 ** 2
        return np.sum(clip2) / (len(clip2) * 1.0)  # mean of the squared samples

    def __call__(self, signal):
        assert signal.dtype in [np.float32, np.float64], "signal only supports float data types"
        snr_target = random.choice(self.snr_levels)
        if snr_target == 9999:
            return signal
        else:
            # -- get noise
            start_idx = random.randint(0, len(self.noise) - len(signal))
            noise_clip = self.noise[start_idx:start_idx + len(signal)]

            sig_power = self.get_power(signal)
            noise_clip_power = self.get_power(noise_clip)
            factor = (sig_power / noise_clip_power) / (10 ** (snr_target / 10.0))
            desired_signal = (signal + noise_clip * np.sqrt(factor)).astype(np.float32)
            return desired_signal
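The scaling factor follows from the SNR definition SNR_dB = 10 * log10(P_signal / P_noise): dividing the power ratio by 10**(snr/10) and scaling the noise by sqrt(factor) makes the mix hit the target level. A short check, with synthetic white noise standing in for the babbleNoise_resample_16K.npy file; both arrays are made up for illustration:

import numpy as np

rng = np.random.default_rng(0)
signal = rng.standard_normal(16000).astype(np.float32)
noise = rng.standard_normal(48000).astype(np.float32)  # stand-in for the babble-noise file

adder = AddNoise(noise, snr_levels=[5])  # force a 5 dB mix
noisy = adder(signal)
residual = noisy - signal                # recover the injected, scaled noise
snr_db = 10 * np.log10(adder.get_power(signal) / adder.get_power(residual))
print(round(float(snr_db), 2))  # ~5.0
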
lipreading/utils.py
ADDED
@@ -0,0 +1,203 @@
import os
import json
import numpy as np

import datetime
import logging

import torch
import shutil


def calculateNorm2(model):
    # accumulate the squared L2 norm of every parameter tensor, then take the root
    para_norm = 0.
    for p in model.parameters():
        para_norm += p.data.norm(2) ** 2
    print('2-norm of the neural network: {:.4f}'.format(para_norm ** .5))


def showLR(optimizer):
    return optimizer.param_groups[0]['lr']  # learning rate of the first parameter group


class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val          # most recent value
        self.sum += val * n     # running weighted sum
        self.count += n
        self.avg = self.sum / self.count


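As used in the training loop, update(val, n) treats val as a per-item average over a batch of n samples, so avg becomes the running per-item average over everything seen so far. The numbers below are made up:

meter = AverageMeter()
meter.update(0.8, n=32)  # batch 1: mean loss 0.8 over 32 samples
meter.update(0.6, n=32)  # batch 2: mean loss 0.6 over 32 samples
print(meter.val, meter.avg)  # 0.6 0.7 (up to float rounding)
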
# -- IO utils
def read_txt_lines(filepath):
    assert os.path.isfile(filepath), "Error when trying to read txt file, path does not exist: {}".format(filepath)
    with open(filepath) as myfile:
        content = myfile.read().splitlines()
    return content


def save_as_json(d, filepath):
    with open(filepath, 'w') as outfile:
        json.dump(d, outfile, indent=4, sort_keys=True)


def load_json(json_fp):
    assert os.path.isfile(json_fp), "Error loading JSON. File provided does not exist, cannot read: {}".format(json_fp)
    with open(json_fp, 'r') as f:
        json_content = json.load(f)
    return json_content


def save2npz(filename, data=None):
    assert data is not None, "data is {}".format(data)
    if not os.path.exists(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))
    np.savez_compressed(filename, data=data)


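A quick round-trip of the npz helper; the path is a placeholder. save2npz stores the array under the key 'data', which is the key a consumer must use to read it back:

import numpy as np

save2npz('./demo_out/sample.npz', data=np.zeros((29, 88, 88), dtype=np.float32))
loaded = np.load('./demo_out/sample.npz')['data']
print(loaded.shape)  # (29, 88, 88)
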
# -- checkpoints
class CheckpointSaver:
    def __init__(self, save_dir, checkpoint_fn='ckpt.pth.tar', best_fn='ckpt.best.pth.tar', best_step_fn='ckpt.best.step{}.pth.tar', save_best_step=False, lr_steps=[]):
        """
        Only save_dir is mandatory.
        The naming of checkpoint files can be configured through checkpoint_fn, best_fn and best_step_fn.
        To keep the best-performing checkpoint per LR step, set save_best_step=True and pass the lr_steps schedule.
        """

        self.save_dir = save_dir

        # checkpoint names
        self.checkpoint_fn = checkpoint_fn
        self.best_fn = best_fn
        self.best_step_fn = best_step_fn

        # save best per step?
        self.save_best_step = save_best_step
        self.lr_steps = lr_steps  # was `[]`, which silently discarded the schedule

        # init var to keep track of best performing checkpoint
        self.current_best = 0

        # save best at each step?
        if self.save_best_step:
            assert lr_steps != [], "Since save_best_step=True, need proper value for lr_steps. Current: {}".format(lr_steps)
            self.best_for_stage = [0]*(len(lr_steps)+1)

    def save(self, save_dict, current_perf, epoch=-1):
        """
        Save checkpoint and keep a copy if current perf is best overall or [optional] best for the current LR step
        """

        # save last checkpoint
        checkpoint_fp = os.path.join(self.save_dir, self.checkpoint_fn)

        # keep track of best model
        self.is_best = current_perf > self.current_best
        if self.is_best:
            self.current_best = current_perf
            best_fp = os.path.join(self.save_dir, self.best_fn)
        save_dict['best_prec'] = self.current_best

        # keep track of best-performing model per step [optional]
        if self.save_best_step:
            assert epoch >= 0, "Since save_best_step=True, need proper value for 'epoch'. Current: {}".format(epoch)
            s_idx = sum(epoch >= l for l in self.lr_steps)  # was the bare name `lr_steps`, a NameError at call time
            self.is_best_for_stage = current_perf > self.best_for_stage[s_idx]

            if self.is_best_for_stage:
                self.best_for_stage[s_idx] = current_perf
                best_stage_fp = os.path.join(self.save_dir, self.best_step_fn.format(s_idx))  # was `self.best_stage_fn`, an attribute that does not exist
            save_dict['best_prec_per_stage'] = self.best_for_stage

        # save
        torch.save(save_dict, checkpoint_fp)
        print("Checkpoint saved at {}".format(checkpoint_fp))
        if self.is_best:
            shutil.copyfile(checkpoint_fp, best_fp)
        if self.save_best_step and self.is_best_for_stage:
            shutil.copyfile(checkpoint_fp, best_stage_fp)

    def set_best_from_ckpt(self, ckpt_dict):
        self.current_best = ckpt_dict['best_prec']
        self.best_for_stage = ckpt_dict.get('best_prec_per_stage', None)


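A minimal, self-contained usage sketch with the fixes above applied; the tiny linear model, the demo directory, and the accuracy sequence are placeholders, not values from the training script:

import os
import torch

model = torch.nn.Linear(4, 2)  # stand-in for the lipreading model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
os.makedirs('./train_logs_demo', exist_ok=True)
saver = CheckpointSaver('./train_logs_demo', save_best_step=True, lr_steps=[2, 4])
for epoch, val_acc in enumerate([0.50, 0.62, 0.61, 0.70]):  # made-up accuracies
    saver.save({'epoch_idx': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()},
               current_perf=val_acc, epoch=epoch)
# ckpt.pth.tar now holds the last epoch; ckpt.best.pth.tar tracks val_acc=0.70.
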
def load_model(load_path, model, optimizer=None, allow_size_mismatch=False):
    """
    Load model from file.
    If optimizer is passed, the loaded dictionary is also expected to contain the state of the optimizer,
    and the epoch index and full checkpoint are returned alongside model and optimizer.
    If optimizer is not passed, only the model weights are loaded and the model alone is returned.
    """

    # -- load dictionary
    assert os.path.isfile(load_path), "Error when loading the model, provided path not found: {}".format(load_path)
    checkpoint = torch.load(load_path)
    loaded_state_dict = checkpoint['model_state_dict']

    if allow_size_mismatch:
        # drop any parameter whose shape differs from (or is absent in) the current model
        loaded_sizes = {k: v.shape for k, v in loaded_state_dict.items()}
        model_state_dict = model.state_dict()
        model_sizes = {k: v.shape for k, v in model_state_dict.items()}
        mismatched_params = []
        for k in loaded_sizes:
            if k not in model_sizes or loaded_sizes[k] != model_sizes[k]:
                mismatched_params.append(k)
        for k in mismatched_params:
            del loaded_state_dict[k]

    # -- copy loaded state into current model and, optionally, optimizer
    model.load_state_dict(loaded_state_dict, strict=not allow_size_mismatch)
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return model, optimizer, checkpoint['epoch_idx'], checkpoint
    return model


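Resuming from the checkpoint written in the CheckpointSaver sketch above; the path and model shape are the same assumptions, so this only runs after that sketch has:

model2 = torch.nn.Linear(4, 2)
optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.1)
model2, optimizer2, epoch_idx, ckpt = load_model(
    './train_logs_demo/ckpt.best.pth.tar', model2, optimizer=optimizer2)
print(epoch_idx, ckpt['best_prec'])  # 4 0.7
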
# -- logging utils
def get_logger(args, save_path):
    log_path = '{}/{}_{}_{}classes_log.txt'.format(save_path, args.training_mode, args.lr, args.num_classes)
    logger = logging.getLogger("mylog")
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler(log_path)
    fh.setLevel(logging.INFO)
    logger.addHandler(fh)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logger.addHandler(console)
    return logger


def update_logger_batch(args, logger, dset_loader, batch_idx, running_loss, running_corrects, running_all, batch_time, data_time):
    perc_epoch = 100. * batch_idx / (len(dset_loader) - 1)
    logger.info('[{:5.0f}/{:5.0f} ({:.0f}%)]\tLoss: {:.4f}\tAcc:{:.4f}\tCost time:{:1.3f} ({:1.3f})s\tData time:{:1.3f} ({:1.3f})\tInstances per second: {:.2f}'.format(
        running_all,
        len(dset_loader.dataset),
        perc_epoch,
        running_loss / running_all,
        running_corrects / running_all,
        batch_time.val, batch_time.avg,
        data_time.val, data_time.avg,
        args.batch_size / batch_time.avg))


def get_save_folder(args):
    # create save and log folder
    save_path = '{}/{}'.format(args.logging_dir, args.training_mode)
    save_path += '/' + datetime.datetime.now().isoformat().split('.')[0]
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    return save_path
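Finally, a sketch wiring the logging helpers together; the argparse.Namespace fields are exactly the ones these functions read, with made-up values:

from argparse import Namespace

args = Namespace(logging_dir='./train_logs_demo', training_mode='tcn',
                 lr=0.0003, num_classes=500, batch_size=32)
save_path = get_save_folder(args)  # e.g. ./train_logs_demo/tcn/2024-01-01T12:00:00
logger = get_logger(args, save_path)
logger.info('epoch 0 started')  # written to both the console and the log file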