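"""Utilities for four-voice (SATB) F0 estimation: HCQT feature extraction,
multi-pitch estimation, voice assignment, and export of the resulting
F0 trajectories to MIDI, CSV, and HDF5."""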
import os
import math
import mido
import pumpp
import librosa
import numpy as np
import pandas as pd
from copy import deepcopy
from scipy.ndimage import gaussian_filter1d
from cqfe_models import mask_voas_cnn_v2_model, late_deep_cnn_model
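# Per-voice salience thresholds (soprano, alto, tenor, bass) applied during post-processing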
SATB_THRESHOLDS = [0.23, 0.17, 0.15, 0.17]
############################################################
# Frequency (Hz) of each of the 360 CQT bins (6 octaves, 60 bins per octave, fmin ~ C1 = 32.7 Hz)
freqscale = librosa.cqt_frequencies(n_bins=360, fmin=32.7, bins_per_octave=60)
def bin_to_freq(bin_index):
    """Map a CQT bin index to its frequency in Hz."""
    return freqscale[bin_index]
vec_bin_to_freq = np.vectorize(bin_to_freq)
############################################################
def downsample_bins(voice):
    """Downsample a (time, 360)-bin matrix to 69 semitone-resolution bins by
    summing groups of five adjacent bins, then one-hot the per-frame argmax
    (bin 0 is treated as unvoiced and zeroed out)."""
    voice_0 = np.array(voice.T[0::5]).T
    voice_1 = np.array(voice.T[1::5]).T
    voice_2 = np.array(voice.T[2::5]).T
    voice_3 = np.array(voice.T[3::5]).T
    voice_4 = np.array(voice.T[4::5]).T
    voice_0 = voice_0.T[1:70].T
    voice_1 = voice_1.T[1:70].T
    voice_2 = voice_2.T[1:70].T
    voice_3 = voice_3.T[0:69].T
    voice_4 = voice_4.T[0:69].T
    voice_sums = voice_0 + voice_1 + voice_2 + voice_3 + voice_4
    # Keep only the strongest bin per frame
    voice_argmax = np.argmax(voice_sums, axis=1)
    threshold = np.zeros(voice_sums.shape)
    threshold[np.arange(voice_argmax.size), voice_argmax] = 1
    threshold[:, 0] = 0
    voice_sums = threshold
    return voice_sums
############################################################
def bin_matrix_to_freq(matrix):
    """Convert a (4, 360, T) stack of per-voice bin matrices (S, A, T, B) into a
    (4, T) matrix of frame-wise frequencies in Hz via per-frame argmax."""
    s_freqs = vec_bin_to_freq(np.argmax(matrix[0], axis=0)).reshape(-1, 1)
    a_freqs = vec_bin_to_freq(np.argmax(matrix[1], axis=0)).reshape(-1, 1)
    t_freqs = vec_bin_to_freq(np.argmax(matrix[2], axis=0)).reshape(-1, 1)
    b_freqs = vec_bin_to_freq(np.argmax(matrix[3], axis=0)).reshape(-1, 1)
    freqs = np.concatenate((s_freqs, a_freqs, t_freqs, b_freqs), axis=1).T
    return freqs
############################################################
def create_midi(freq, write_path='./midi_track.mid', ticks_per_beat=58,
                tempo=90, save_to_file=True, program=53, channel=0):
    """Render a frame-wise F0 contour (Hz) as a single-track MIDI file."""
    def freq_to_list(freq):
        # Each event is a (pitch, velocity, time) tuple, with time given in
        # ticks elapsed since the previous event.
        T = freq.shape[0]
        midi_freqs = np.round(69 + 12*np.log2(freq/440)).squeeze().astype('int')
        t_last = 0
        pitch_tm1 = 20
        list_event = []
        for t in range(T):
            pitch_t = midi_freqs[t]
            if (pitch_t != pitch_tm1):
                velocity = 127
                if(pitch_t == 24):
                    # MIDI note 24 corresponds to the lowest CQT bin (~32.7 Hz),
                    # which is used as the "unvoiced" bin: silence it.
                    pitch_t = 0
                    velocity = 0
                t_event = t - t_last
                t_last = t
                list_event.append((pitch_tm1, 0, t_event))
                list_event.append((pitch_t, velocity, 0))
                pitch_tm1 = pitch_t
        list_event.append((pitch_tm1, 0, T - t_last))
        return list_event
    # Tempo
    microseconds_per_beat = mido.bpm2tempo(tempo)
    # Write the pianoroll into a MIDI file
    mid = mido.MidiFile()
    mid.ticks_per_beat = ticks_per_beat
    # Add a new track with the instrument name to the MIDI file
    track = mid.add_track("Voice Aah")
    # Transform the contour into a list of (pitch, velocity, time) events
    events = freq_to_list(freq)
    # Tempo
    track.append(mido.MetaMessage('set_tempo', tempo=microseconds_per_beat))
    track.append(mido.MetaMessage('channel_prefix', channel=channel))
    # Add the program_change
    # General MIDI (1-indexed): Choir Aahs = 53, Voice Oohs (or Doos) = 54, Synth Voice = 55;
    # mido uses 0-indexed programs, so Choir Aahs is program=52.
    track.append(mido.Message('program_change', program=program, channel=channel))
    # Track which notes are currently sounding so that a note that is turned on,
    # has its intensity modified, and is later turned off is shut off only once.
    # Example:
    #   (60, 20, 0)
    #   (60, 40, 10)
    #   (60, 0, 15)
    notes_on_list = []
    # Write events to the MIDI file
    for event in events:
        pitch, velocity, time = event
        if velocity == 0:
            track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
            if(pitch in notes_on_list):
                notes_on_list.remove(pitch)
        else:
            if pitch in notes_on_list:
                # Re-striking a note that is already sounding: turn it off first
                track.append(mido.Message('note_off', note=pitch, velocity=0, time=time, channel=channel))
                notes_on_list.remove(pitch)
                time = 0
            track.append(mido.Message('note_on', note=pitch, velocity=velocity, time=time, channel=channel))
            notes_on_list.append(pitch)
    if save_to_file:
        mid.save(write_path)
    return mid
############################################################
def song_to_midi(sop, alto, ten, bass):
    """Write the four voice predictions to a single multi-track MIDI file
    (one track per voice, Choir Aahs on channels 0-3) and return its path."""
    savepath = './output.mid'
    bin_matrix = np.array([sop, alto, ten, bass])
    freq_matrix = bin_matrix_to_freq(bin_matrix)
    mid_sop = create_midi(freq_matrix[0], save_to_file=False, program=52, channel=0)
    mid_alto = create_midi(freq_matrix[1], save_to_file=False, program=52, channel=1)
    mid_ten = create_midi(freq_matrix[2], save_to_file=False, program=52, channel=2)
    mid_bass = create_midi(freq_matrix[3], save_to_file=False, program=52, channel=3)
    mid_mix = mido.MidiFile()
    mid_mix.ticks_per_beat = mid_sop.ticks_per_beat
    mid_mix.tracks = mid_sop.tracks + mid_alto.tracks + mid_ten.tracks + mid_bass.tracks
    mid_mix.save(savepath)
    return savepath
############################################################
def song_to_dataframe(sop, alto, ten, bass):
    """Build a DataFrame with one row per frame: timestamp (s) and the F0 (Hz)
    of each voice, taken as the per-frame argmax of its (360, T) matrix."""
    # Frame period = hop_length / sr = 256 / 22050 s
    timescale = np.arange(0, 0.011609977 * (sop.shape[1]), 0.011609977)[:sop.shape[1]]
    s_argmax = vec_bin_to_freq(np.argmax(sop, axis=0))
    a_argmax = vec_bin_to_freq(np.argmax(alto, axis=0))
    t_argmax = vec_bin_to_freq(np.argmax(ten, axis=0))
    b_argmax = vec_bin_to_freq(np.argmax(bass, axis=0))
    data = np.array([timescale, s_argmax, a_argmax, t_argmax, b_argmax], dtype=np.float32).T
    columns = ['Timestep', 'Soprano', 'Alto', 'Tenor', 'Bass']
    df = pd.DataFrame(data, columns=columns)
    return df
############################################################
def prediction_postproc(input_array, argmax_and_threshold=True,
                        gaussian_blur=True,
                        threshold_value=0):
    """Flatten batched (n, 360, 256) predictions into a (360, T) matrix; optionally
    keep only per-frame argmax bins whose salience reaches threshold_value (one-hot,
    bin 0 otherwise), then optionally blur along the frequency axis and renormalize."""
    prediction = np.moveaxis(input_array, 0, 1).reshape(360, -1)
    thres_reference = deepcopy(prediction)
    if(argmax_and_threshold):
        prediction = np.argmax(prediction, axis=0)
        # Discard argmax bins whose original salience falls below the threshold
        prediction = np.array([prediction[i] if thres_reference[prediction[i], i] >= threshold_value else 0 for i in np.arange(prediction.size)])
        threshold = np.zeros((360, prediction.shape[0]))
        threshold[prediction, np.arange(prediction.size)] = 1
        prediction = threshold
    if(gaussian_blur):
        prediction = np.array(gaussian_filter1d(prediction, 1, axis=0, mode='wrap'))
        prediction = (prediction - np.min(prediction))/(np.max(prediction)-np.min(prediction))
    return prediction
############################################################
def get_hcqt_params():
    """Return the HCQT analysis parameters used throughout the pipeline."""
    bins_per_octave = 60
    n_octaves = 6
    over_sample = 5
    harmonics = [1, 2, 3, 4, 5]
    sr = 22050
    fmin = 32.7
    hop_length = 256
    return bins_per_octave, n_octaves, harmonics, sr, fmin, hop_length, over_sample
############################################################
def create_pump_object():
    """Create a pumpp Pump that extracts HCQT magnitude and phase-differential features."""
    (bins_per_octave, n_octaves, harmonics,
     sr, f_min, hop_length, over_sample) = get_hcqt_params()
    p_phdif = pumpp.feature.HCQTPhaseDiff(name='dphase', sr=sr, hop_length=hop_length,
                                          fmin=f_min, n_octaves=n_octaves, over_sample=over_sample,
                                          harmonics=harmonics, log=True)
    pump = pumpp.Pump(p_phdif)
    return pump
############################################################
def compute_pump_features(pump, audio_fpath):
    """Run the pump on an audio file and return its feature dictionary."""
    data = pump(audio_f=audio_fpath)
    return data
############################################################
def get_mpe_prediction(model, audio_file=None):
    """Generate a multi-pitch salience prediction from a model given an input audio file.
    Parts of this function are adapted from deepsalience.
    """
    # Process the input in chunks of split_value frames to bound memory usage
    split_value = 4000
    if audio_file is not None:
        pump = create_pump_object()
        features = compute_pump_features(pump, audio_file)
        input_hcqt = features['dphase/mag'][0]
        input_dphase = features['dphase/dphase'][0]
    else:
        raise ValueError("An audio_file must be specified")
    input_hcqt = input_hcqt.transpose(1, 2, 0)[np.newaxis, :, :, :]
    input_dphase = input_dphase.transpose(1, 2, 0)[np.newaxis, :, :, :]
    n_t = input_hcqt.shape[3]
    t_slices = list(np.arange(0, n_t, split_value))
    output_list = []
    for t in t_slices:
        p = model.predict([np.transpose(input_hcqt[:, :, :, t:t+split_value], (0, 1, 3, 2)),
                           np.transpose(input_dphase[:, :, :, t:t+split_value], (0, 1, 3, 2))]
                          )[0, :, :]
        output_list.append(p)
    predicted_output = np.hstack(output_list).astype(np.float32)
    return predicted_output
############################################################
def get_va_prediction(model, f0_matrix):
    """Run the voice-assignment model on a (360, T) multi-pitch salience matrix and
    return post-processed (360, T) matrices for soprano, alto, tenor and bass."""
    # Zero-pad the time axis to a multiple of 256 frames and split into chunks
    splits = f0_matrix.shape[1]//256
    splits_diff = 256 - (f0_matrix.shape[1] - splits * 256)
    fill = np.zeros((360, splits_diff))
    mix_filled = np.concatenate((np.copy(f0_matrix), fill), axis=1)
    mix_filled = np.reshape(mix_filled, (360, -1, 256, 1)).transpose((1, 0, 2, 3))
    # Predict in batches of 24 chunks
    batches = math.ceil(mix_filled.shape[0]/24)
    s_pred_result = np.zeros((0, 360, 256))
    a_pred_result = np.zeros((0, 360, 256))
    t_pred_result = np.zeros((0, 360, 256))
    b_pred_result = np.zeros((0, 360, 256))
    for i in range(batches):
        s_pred, a_pred, t_pred, b_pred = model.predict(mix_filled[i*24:(i+1)*24])
        s_pred_result = np.append(s_pred_result, s_pred, axis=0)
        a_pred_result = np.append(a_pred_result, a_pred, axis=0)
        t_pred_result = np.append(t_pred_result, t_pred, axis=0)
        b_pred_result = np.append(b_pred_result, b_pred, axis=0)
    # Post-process each voice and trim the zero-padding
    s_pred_result = prediction_postproc(s_pred_result, threshold_value=SATB_THRESHOLDS[0])[:, :f0_matrix.shape[1]]
    a_pred_result = prediction_postproc(a_pred_result, threshold_value=SATB_THRESHOLDS[1])[:, :f0_matrix.shape[1]]
    t_pred_result = prediction_postproc(t_pred_result, threshold_value=SATB_THRESHOLDS[2])[:, :f0_matrix.shape[1]]
    b_pred_result = prediction_postproc(b_pred_result, threshold_value=SATB_THRESHOLDS[3])[:, :f0_matrix.shape[1]]
    return s_pred_result, a_pred_result, t_pred_result, b_pred_result
############################################################
def cqfe(audiofile, mpe=late_deep_cnn_model(), va=mask_voas_cnn_v2_model()):
    """Full pipeline: multi-pitch estimation followed by voice assignment.
    Writes MIDI, CSV and HDF5 outputs and returns their paths together with a
    scatter plot of the four F0 trajectories."""
    savepath_csv = './output.csv'
    savepath_hdf5 = './output.hdf5'
    mpe_pred = get_mpe_prediction(mpe, audiofile)
    s_pred, a_pred, t_pred, b_pred = get_va_prediction(va, mpe_pred)
    output_midi = song_to_midi(s_pred, a_pred, t_pred, b_pred)
    output_df = song_to_dataframe(s_pred, a_pred, t_pred, b_pred)
    output_df.to_csv(savepath_csv, mode='w', header=True)
    output_df.to_hdf(savepath_hdf5, key='F0', mode='w', complevel=9, complib='blosc', append=False, format='table')
    # Plot all four voices on a single pair of axes
    ax1 = output_df.plot.scatter(x='Timestep', y='Bass', s=1, color='#2f29e3', label='Bass')
    ax2 = output_df.plot.scatter(x='Timestep', y='Tenor', s=1, color='#e36129', label='Tenor', ax=ax1)
    ax3 = output_df.plot.scatter(x='Timestep', y='Alto', s=1, color='#29e35a', label='Alto', ax=ax1)
    ax4 = output_df.plot.scatter(x='Timestep', y='Soprano', s=1, color='#d3d921', label='Soprano', ax=ax1)
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Freq (Hz)')
    fig = ax1.get_figure()
    fig.set_dpi(150)
    return [output_midi, savepath_csv, savepath_hdf5], fig
############################################################
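# Minimal usage sketch: './example_input.wav' is a hypothetical path, and the
# default MPE/VA models are instantiated by cqfe() itself.
if __name__ == "__main__":
    output_paths, fig = cqfe('./example_input.wav')
    print(output_paths)  # ['./output.mid', './output.csv', './output.hdf5']
    fig.savefig('./output_plot.png', dpi=150)
############################################################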