feifeifeiliu committed on
Commit 9b6fdf7 · 1 Parent(s): 865fd8a

Upload 25 files

Files changed (2)
  1. data_utils/utils.py +16 -37
  2. visualise/rendering.py +1 -1
data_utils/utils.py CHANGED
@@ -1,6 +1,5 @@
  import numpy as np
  # import librosa #has to do this cause librosa is not supported on my server
- import python_speech_features
  from scipy.io import wavfile
  from scipy import signal
  import librosa
@@ -79,9 +78,9 @@ def get_melspec(audio_fn, eps=1e-6, fps = 25, sr=16000, n_mels=64):
  '''

  def extract_mfcc(audio,sample_rate=16000):
- mfcc = zip(*python_speech_features.mfcc(audio,sample_rate, numcep=64, nfilt=64, nfft=2048, winstep=0.04))
- mfcc = np.stack([np.array(i) for i in mfcc])
- return mfcc
+ # mfcc = zip(*python_speech_features.mfcc(audio,sample_rate, numcep=64, nfilt=64, nfft=2048, winstep=0.04))
+ # mfcc = np.stack([np.array(i) for i in mfcc])
+ return None

  def get_mfcc_psf(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
  y, sr = load_wav_old(audio_fn, sr=sr)
@@ -97,14 +96,14 @@ def get_mfcc_psf(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, w
  n_fft=2048

  #hard coded for 25 fps
- if not smlpx:
- C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=0.04)
- else:
- C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01/15)
+ # if not smlpx:
+ # C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=0.04)
+ # else:
+ # C = python_speech_features.mfcc(y, sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01/15)
  # if C.shape[0] == n_mfcc:
  # C = C.transpose(1, 0)

- return C
+ return None


  def get_mfcc_psf_min(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=64, win_size=None):
@@ -119,13 +118,13 @@ def get_mfcc_psf_min(audio_fn, eps=1e-6, fps=25, smlpx=False, sr=16000, n_mfcc=6

  C = []

- for i in range(slice):
- if i != (slice - 1):
- feat = python_speech_features.mfcc(y[i*slice_len:(i+1)*slice_len], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
- else:
- feat = python_speech_features.mfcc(y[i * slice_len:], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
-
- C.append(feat)
+ # for i in range(slice):
+ # if i != (slice - 1):
+ # feat = python_speech_features.mfcc(y[i*slice_len:(i+1)*slice_len], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
+ # else:
+ # feat = python_speech_features.mfcc(y[i * slice_len:], sr, numcep=n_mfcc, nfilt=n_mfcc, nfft=n_fft, winstep=1.01 / 15)
+ #
+ # C.append(feat)

  return C

@@ -310,24 +309,4 @@ def smooth_geom(geom, mask: torch.Tensor = None, filter_size: int = 9, sigma: fl
  if mask is None:
  return smoothed
  else:
- return smoothed * mask[None, :, None] + geom * (-mask[None, :, None] + 1)
-
- if __name__ == '__main__':
- audio_fn = '../sample_audio/clip000028_tCAkv4ggPgI.wav'
-
- C = get_mfcc_psf(audio_fn)
- print(C.shape)
-
- C_2 = get_mfcc_librosa(audio_fn)
- print(C.shape)
-
- print(C)
- print(C_2)
- print((C == C_2).all())
- # print(y.shape, sr)
- # mel_spec = get_melspec(audio_fn)
- # print(mel_spec.shape)
- # mfcc = get_mfcc(audio_fn, sr = 16000)
- # print(mfcc.shape)
- # print(mel_spec.max(), mel_spec.min())
- # print(mfcc.max(), mfcc.min())
+ return smoothed * mask[None, :, None] + geom * (-mask[None, :, None] + 1)
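
This commit removes the python_speech_features dependency but leaves extract_mfcc and get_mfcc_psf stubbed out to return None rather than porting them. A minimal sketch of computing an equivalent 64-coefficient MFCC with librosa (which the file already imports) is shown below; the helper name extract_mfcc_librosa is illustrative and not part of the repository, and librosa's filterbank and scaling are not numerically identical to python_speech_features.

import numpy as np
import librosa

def extract_mfcc_librosa(audio, sample_rate=16000, n_mfcc=64, n_fft=2048, winstep=0.04):
    # winstep=0.04 s at 16 kHz is a hop of 640 samples, i.e. 25 feature frames
    # per second, matching the "hard coded for 25 fps" comment in get_mfcc_psf.
    hop_length = int(round(winstep * sample_rate))
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc,
                                n_mels=n_mfcc, n_fft=n_fft, hop_length=hop_length)
    # Shape (n_mfcc, n_frames), the same layout the removed zip/np.stack code produced.
    return mfcc.astype(np.float32)

For the smlpx branch (winstep=1.01/15, roughly 0.0673 s), the corresponding hop_length would round to 1077 samples.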
visualise/rendering.py CHANGED
@@ -226,7 +226,7 @@ class RenderTool():
  writer.write(final_img)
  writer.release()

- cmd = ('ffmpeg' + ' -i {0} -i {1} -vcodec h264 -ac 2 -channel_layout stereo -pix_fmt yuv420p result.mp4'.format(
+ cmd = ('ffmpeg' + ' -y -i {0} -i {1} -vcodec h264 -ac 2 -channel_layout stereo -pix_fmt yuv420p result.mp4'.format(
  tmp_audio_file.name, tmp_video_file.name)).split()
  # cmd = ('ffmpeg' + '-i {0} -vcodec h264 -ac 2 -channel_layout stereo -pix_fmt yuv420p {1}'.format(
  # tmp_video_file.name, video_fname)).split()
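
The only functional change in rendering.py is the added -y flag, which makes ffmpeg overwrite an existing result.mp4 instead of stopping at its interactive overwrite prompt. A minimal sketch of executing the same mux step is given below, assuming the command is launched with subprocess; the wrapper name mux_audio_video and its arguments are illustrative rather than taken from rendering.py.

import subprocess

def mux_audio_video(audio_path, video_path, out_path='result.mp4'):
    # -y auto-confirms ffmpeg's overwrite prompt, so repeated renders of the
    # same clip do not hang waiting for console input.
    cmd = ['ffmpeg', '-y',
           '-i', audio_path,   # temporary wav (tmp_audio_file in the diff)
           '-i', video_path,   # temporary video written by the frame writer
           '-vcodec', 'h264',
           '-ac', '2', '-channel_layout', 'stereo',
           '-pix_fmt', 'yuv420p',
           out_path]
    subprocess.run(cmd, check=True)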