import torch
from resemblyzer import VoiceEncoder


class ResemblyzerVoiceEncoder:
    """Wraps Resemblyzer's VoiceEncoder so it can be called directly on torch tensors.

    Expects mono waveforms at Resemblyzer's 16 kHz sampling rate.
    """

    def __init__(self, device) -> None:
        self.model = VoiceEncoder(device)

    def __call__(self, audio: torch.Tensor) -> torch.Tensor:
        # Single utterance: (num_samples,) -> (embedding_dim,)
        # .cpu() ensures .numpy() also works for tensors living on the GPU.
        if audio.ndimension() == 1:
            return torch.tensor(self.model.embed_utterance(audio.cpu().numpy())).float()
        # Batch of utterances: embed each row, then stack to (batch, embedding_dim)
        e = torch.stack([
            torch.tensor(self.model.embed_utterance(audio[i, :].cpu().numpy())).float()
            for i in range(audio.shape[0])
        ])
        return e
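

# Minimal usage sketch (illustrative only, not part of the original module).
# The "cpu" device, the 1-second utterance length, and the random-noise input
# are assumptions made for this example.
if __name__ == "__main__":
    encoder = ResemblyzerVoiceEncoder("cpu")

    # Two fake 1-second utterances at 16 kHz; random noise stands in for real speech.
    batch = torch.rand(2, 16000)
    embeddings = encoder(batch)
    print(embeddings.shape)  # Resemblyzer embeddings are 256-dim, so: torch.Size([2, 256])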