Shivam Mehta committed
Commit: 3c10b34 · 1 Parent(s): f5a235a
Adding code
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- Makefile +30 -0
- app.py +253 -0
- diff_ttsg/__init__.py +0 -0
- diff_ttsg/__pycache__/__init__.cpython-310.pyc +0 -0
- diff_ttsg/data/__init__.py +0 -0
- diff_ttsg/data/__pycache__/__init__.cpython-310.pyc +0 -0
- diff_ttsg/data/__pycache__/cormac_datamodule.cpython-310.pyc +0 -0
- diff_ttsg/data/components/__init__.py +0 -0
- diff_ttsg/data/cormac_datamodule.py +214 -0
- diff_ttsg/data/mnist_datamodule.py +130 -0
- diff_ttsg/eval.py +93 -0
- diff_ttsg/hifigan/LICENSE +21 -0
- diff_ttsg/hifigan/README.md +105 -0
- diff_ttsg/hifigan/__init__.py +0 -0
- diff_ttsg/hifigan/__pycache__/__init__.cpython-310.pyc +0 -0
- diff_ttsg/hifigan/__pycache__/config.cpython-310.pyc +0 -0
- diff_ttsg/hifigan/__pycache__/denoiser.cpython-310.pyc +0 -0
- diff_ttsg/hifigan/__pycache__/env.cpython-310.pyc +0 -0
- diff_ttsg/hifigan/__pycache__/models.cpython-310.pyc +0 -0
- diff_ttsg/hifigan/__pycache__/xutils.cpython-310.pyc +0 -0
- diff_ttsg/hifigan/config.py +38 -0
- diff_ttsg/hifigan/denoiser.py +64 -0
- diff_ttsg/hifigan/env.py +17 -0
- diff_ttsg/hifigan/meldataset.py +171 -0
- diff_ttsg/hifigan/models.py +286 -0
- diff_ttsg/hifigan/xutils.py +60 -0
- diff_ttsg/models/__init__.py +0 -0
- diff_ttsg/models/__pycache__/__init__.cpython-310.pyc +0 -0
- diff_ttsg/models/__pycache__/diff_ttsg.cpython-310.pyc +0 -0
- diff_ttsg/models/components/__init__.py +0 -0
- diff_ttsg/models/components/__pycache__/__init__.cpython-310.pyc +0 -0
- diff_ttsg/models/components/__pycache__/diffusion.cpython-310.pyc +0 -0
- diff_ttsg/models/components/__pycache__/text_encoder.cpython-310.pyc +0 -0
- diff_ttsg/models/components/__pycache__/transformer.cpython-310.pyc +0 -0
- diff_ttsg/models/components/diffusion.py +376 -0
- diff_ttsg/models/components/text_encoder.py +384 -0
- diff_ttsg/models/components/transformer.py +250 -0
- diff_ttsg/models/diff_ttsg.py +376 -0
- diff_ttsg/models/mnist_module.py +137 -0
- diff_ttsg/resources/cmu_dictionary +0 -0
- diff_ttsg/text/LICENSE +30 -0
- diff_ttsg/text/__init__.py +96 -0
- diff_ttsg/text/__pycache__/__init__.cpython-310.pyc +0 -0
- diff_ttsg/text/__pycache__/cleaners.cpython-310.pyc +0 -0
- diff_ttsg/text/__pycache__/cmudict.cpython-310.pyc +0 -0
- diff_ttsg/text/__pycache__/numbers.cpython-310.pyc +0 -0
- diff_ttsg/text/__pycache__/symbols.cpython-310.pyc +0 -0
- diff_ttsg/text/cleaners.py +73 -0
- diff_ttsg/text/cmudict.py +60 -0
- diff_ttsg/text/numbers.py +72 -0
Makefile
ADDED
@@ -0,0 +1,30 @@
help: ## Show help
	@grep -E '^[.a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

clean: ## Clean autogenerated files
	rm -rf dist
	find . -type f -name "*.DS_Store" -ls -delete
	find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf
	find . | grep -E ".pytest_cache" | xargs rm -rf
	find . | grep -E ".ipynb_checkpoints" | xargs rm -rf
	rm -f .coverage

clean-logs: ## Clean logs
	rm -rf logs/**

format: ## Run pre-commit hooks
	pre-commit run -a

sync: ## Merge changes from main branch to your current branch
	git pull
	git pull origin main

test: ## Run not slow tests
	pytest -k "not slow"

test-full: ## Run all tests
	pytest

train: ## Train the model
	python diff_ttsg/train.py run_name=dev
app.py
ADDED
@@ -0,0 +1,253 @@
import argparse
import datetime as dt
import warnings
from pathlib import Path

import ffmpeg
import gradio as gr
import IPython.display as ipd
import joblib as jl
import numpy as np
import soundfile as sf
import torch
from tqdm.auto import tqdm

from diff_ttsg.hifigan.config import v1
from diff_ttsg.hifigan.denoiser import Denoiser
from diff_ttsg.hifigan.env import AttrDict
from diff_ttsg.hifigan.models import Generator as HiFiGAN
from diff_ttsg.models.diff_ttsg import Diff_TTSG
from diff_ttsg.text import cmudict, sequence_to_text, text_to_sequence
from diff_ttsg.text.symbols import symbols
from diff_ttsg.utils.model import denormalize
from diff_ttsg.utils.utils import intersperse, plot_tensor
from pymo.preprocessing import MocapParameterizer
from pymo.viz_tools import render_mp4
from pymo.writers import BVHWriter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DIFF_TTSG_CHECKPOINT = "diff_ttsg_checkpoint.ckpt"
HIFIGAN_CHECKPOINT = "g_02500000"
MOTION_PIPELINE = "diff_ttsg/resources/data_pipe.expmap_86.1328125fps.sav"
CMU_DICT_PATH = "diff_ttsg/resources/cmu_dictionary"

OUTPUT_FOLDER = "synth_output"

# Model loading tools
def load_model(checkpoint_path):
    model = Diff_TTSG.load_from_checkpoint(checkpoint_path, map_location=device)
    model.eval()
    return model

# Vocoder loading tools
def load_vocoder(checkpoint_path):
    h = AttrDict(v1)
    hifigan = HiFiGAN(h).to(device)
    hifigan.load_state_dict(torch.load(checkpoint_path, map_location=device)['generator'])
    _ = hifigan.eval()
    hifigan.remove_weight_norm()
    return hifigan

# Setup text preprocessing
cmu = cmudict.CMUDict(CMU_DICT_PATH)
def process_text(text: str):
    x = torch.LongTensor(intersperse(text_to_sequence(text, dictionary=cmu), len(symbols))).to(device)[None]
    x_lengths = torch.LongTensor([x.shape[-1]]).cuda()
    x_phones = sequence_to_text(x.squeeze(0).tolist())
    return {
        'x_orig': text,
        'x': x,
        'x_lengths': x_lengths,
        'x_phones': x_phones
    }

# Setup motion visualisation
motion_pipeline = jl.load(MOTION_PIPELINE)
bvh_writer = BVHWriter()
mocap_params = MocapParameterizer("position")


## Load models

model = load_model(DIFF_TTSG_CHECKPOINT)
vocoder = load_vocoder(HIFIGAN_CHECKPOINT)
denoiser = Denoiser(vocoder, mode='zeros')


# Synthesis functions

@torch.inference_mode()
def synthesise(text, mel_timestep, motion_timestep, length_scale, mel_temp, motion_temp):

    ## Number of timesteps to run the reverse denoising process
    n_timesteps = {
        'mel': mel_timestep,
        'motion': motion_timestep,
    }

    ## Sampling temperature
    temperature = {
        'mel': mel_temp,
        'motion': motion_temp
    }
    text_processed = process_text(text)
    t = dt.datetime.now()
    output = model.synthesise(
        text_processed['x'],
        text_processed['x_lengths'],
        n_timesteps=n_timesteps,
        temperature=temperature,
        stoc=False,
        spk=None,
        length_scale=length_scale
    )

    t = (dt.datetime.now() - t).total_seconds()
    print(f'RTF: {t * 22050 / (output["mel"].shape[-1] * 256)}')

    output.update(text_processed)  # merge everything to one dict
    return output

@torch.inference_mode()
def to_waveform(mel, vocoder):
    audio = vocoder(mel).clamp(-1, 1)
    audio = denoiser(audio.squeeze(0)).cpu().squeeze()
    return audio


def to_bvh(motion):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return motion_pipeline.inverse_transform([motion.cpu().squeeze(0).T])


def save_to_folder(filename: str, output: dict, folder: str):
    folder = Path(folder)
    folder.mkdir(exist_ok=True, parents=True)
    np.save(folder / f'{filename}', output['mel'].cpu().numpy())
    sf.write(folder / f'{filename}.wav', output['waveform'], 22050, 'PCM_24')
    with open(folder / f'{filename}.bvh', 'w') as f:
        bvh_writer.write(output['bvh'], f)


def to_stick_video(filename, bvh, folder):
    folder = Path(folder)
    folder.mkdir(exist_ok=True, parents=True)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X_pos = mocap_params.fit_transform([bvh])
    print(f"rendering {filename} ...")
    render_mp4(X_pos[0], folder / f'{filename}.mp4', axis_scale=200)


def combine_audio_video(filename: str, folder: str):
    print("Combining audio and video")
    folder = Path(folder)
    folder.mkdir(exist_ok=True, parents=True)

    input_video = ffmpeg.input(str(folder / f'{filename}.mp4'))
    input_audio = ffmpeg.input(str(folder / f'{filename}.wav'))
    output_filename = folder / f'{filename}_audio.mp4'
    ffmpeg.concat(input_video, input_audio, v=1, a=1).output(str(output_filename)).run(overwrite_output=True)
    print(f"Final output with audio: {output_filename}")


def run(text, output, mel_timestep, motion_timestep, length_scale, mel_temp, motion_temp):
    print("Running synthesis")
    output = synthesise(text, mel_timestep, motion_timestep, length_scale, mel_temp, motion_temp)
    output['waveform'] = to_waveform(output['mel'], vocoder)
    output['bvh'] = to_bvh(output['motion'])[0]
    save_to_folder('temp', output, OUTPUT_FOLDER)
    return (
        output,
        output['x_phones'],
        plot_tensor(output['mel'].squeeze().cpu().numpy()),
        plot_tensor(output['motion'].squeeze().cpu().numpy()),
        str(Path(OUTPUT_FOLDER) / f'temp.wav'),
        gr.update(interactive=True)
    )

def visualize_it(output):
    to_stick_video('temp', output['bvh'], OUTPUT_FOLDER)
    combine_audio_video('temp', OUTPUT_FOLDER)
    return str(Path(OUTPUT_FOLDER) / 'temp_audio.mp4')


with gr.Blocks() as demo:

    output = gr.State(value=None)

    with gr.Row():
        gr.Markdown("# Text Input")
    with gr.Row():
        text = gr.Textbox(label="Text Input")

    with gr.Box():
        with gr.Row():
            gr.Markdown("### Hyper parameters")
        with gr.Row():
            mel_timestep = gr.Slider(label="Number of timesteps (mel)", minimum=0, maximum=1000, step=1, value=50, interactive=True)
            motion_timestep = gr.Slider(label="Number of timesteps (motion)", minimum=0, maximum=1000, step=1, value=500, interactive=True)
            length_scale = gr.Slider(label="Length scale (Speaking rate)", minimum=0.01, maximum=3.0, step=0.05, value=1.15, interactive=True)
            mel_temp = gr.Slider(label="Sampling temperature (mel)", minimum=0.01, maximum=5.0, step=0.05, value=1.3, interactive=True)
            motion_temp = gr.Slider(label="Sampling temperature (motion)", minimum=0.01, maximum=5.0, step=0.05, value=1.5, interactive=True)

    synth_btn = gr.Button("Synthesise")

    with gr.Box():
        with gr.Row():
            gr.Markdown("### Phonetised text")
        with gr.Row():
            phonetised_text = gr.Textbox(label="Phonetised text", interactive=False)

    with gr.Box():
        with gr.Row():
            mel_spectrogram = gr.Image(interactive=False, label="mel spectrogram")
            motion_representation = gr.Image(interactive=False, label="Motion representation")

    with gr.Row():
        audio = gr.Audio(interactive=False, label="Audio")

    with gr.Box():
        with gr.Row():
            gr.Markdown("### Generate stick figure visualisation")
        with gr.Row():
            gr.Markdown("(This will take a while)")
        with gr.Row():
            visualize = gr.Button("Visualize", interactive=False)

    with gr.Row():
        video = gr.Video(label="Video", interactive=False)

    synth_btn.click(
        fn=run,
        inputs=[
            text,
            output,
            mel_timestep,
            motion_timestep,
            length_scale,
            mel_temp,
            motion_temp
        ],
        outputs=[
            output,
            phonetised_text,
            mel_spectrogram,
            motion_representation,
            audio,
            # video,
            visualize
        ], api_name="diff_ttsg")

    visualize.click(
        fn=visualize_it,
        inputs=[output],
        outputs=[video],
    )

demo.queue(1)
demo.launch()
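For orientation, here is a brief sketch (not part of the commit) of driving the same pipeline with the functions defined in app.py above, without the Gradio UI; the input sentence and output name are placeholders:

# Sketch only: programmatic use of the app.py functions; sentence and filename are hypothetical.
sentence = "Hello there, how are you?"
out = synthesise(sentence, mel_timestep=50, motion_timestep=500,
                 length_scale=1.15, mel_temp=1.3, motion_temp=1.5)
out['waveform'] = to_waveform(out['mel'], vocoder)   # HiFi-GAN + denoiser -> audio tensor
out['bvh'] = to_bvh(out['motion'])[0]                # motion features -> BVH via the pymo pipeline
save_to_folder('example', out, OUTPUT_FOLDER)        # writes example.npy, example.wav, example.bvh

The argument values match the slider defaults in the UI (50/500 timesteps, length scale 1.15, temperatures 1.3/1.5).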
diff_ttsg/__init__.py
ADDED
File without changes
diff_ttsg/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (152 Bytes).
diff_ttsg/data/__init__.py
ADDED
File without changes
diff_ttsg/data/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (157 Bytes).
diff_ttsg/data/__pycache__/cormac_datamodule.cpython-310.pyc
ADDED
Binary file (7.29 kB).
diff_ttsg/data/components/__init__.py
ADDED
File without changes
diff_ttsg/data/cormac_datamodule.py
ADDED
@@ -0,0 +1,214 @@
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchaudio as ta
from einops import pack
from lightning import LightningDataModule
from torch.utils.data.dataloader import DataLoader

from diff_ttsg.text import cmudict, text_to_sequence
from diff_ttsg.text.symbols import symbols
from diff_ttsg.utils.audio import mel_spectrogram
from diff_ttsg.utils.model import fix_len_compatibility, normalize
from diff_ttsg.utils.utils import intersperse, parse_filelist


class CormacDataModule(LightningDataModule):

    def __init__(
        self,
        train_filelist_path,
        valid_filelist_path,
        batch_size,
        num_workers,
        pin_memory,
        cmudict_path,
        motion_folder,
        add_blank,
        n_fft,
        n_feats,
        sample_rate,
        hop_length,
        win_length,
        f_min,
        f_max,
        data_statistics,
        motion_pipeline_filename,
        seed
    ):
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

    def setup(self, stage: Optional[str] = None):
        """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.

        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute things like random split twice!
        """
        # load and split datasets only if not loaded already

        self.trainset = TextMelDataset(
            self.hparams.train_filelist_path,
            self.hparams.cmudict_path,
            self.hparams.motion_folder,
            self.hparams.add_blank,
            self.hparams.n_fft,
            self.hparams.n_feats,
            self.hparams.sample_rate,
            self.hparams.hop_length,
            self.hparams.win_length,
            self.hparams.f_min,
            self.hparams.f_max,
            self.hparams.data_statistics,
            self.hparams.seed
        )
        self.validset = TextMelDataset(
            self.hparams.valid_filelist_path,
            self.hparams.cmudict_path,
            self.hparams.motion_folder,
            self.hparams.add_blank,
            self.hparams.n_fft,
            self.hparams.n_feats,
            self.hparams.sample_rate,
            self.hparams.hop_length,
            self.hparams.win_length,
            self.hparams.f_min,
            self.hparams.f_max,
            self.hparams.data_statistics,
            self.hparams.seed
        )

    def train_dataloader(self):
        return DataLoader(
            dataset=self.trainset,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            shuffle=True,
            collate_fn=TextMelBatchCollate()
        )

    def val_dataloader(self):
        return DataLoader(
            dataset=self.validset,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            shuffle=False,
            collate_fn=TextMelBatchCollate()
        )

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass


class TextMelDataset(torch.utils.data.Dataset):
    def __init__(self, filelist_path, cmudict_path, motion_folder, add_blank=True,
                 n_fft=1024, n_mels=80, sample_rate=22050,
                 hop_length=256, win_length=1024, f_min=0., f_max=8000, data_parameters=None, seed=None):
        self.filepaths_and_text = parse_filelist(filelist_path)
        self.motion_fileloc = Path(motion_folder)
        self.cmudict = cmudict.CMUDict(cmudict_path)
        self.add_blank = add_blank
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.win_length = win_length
        self.f_min = f_min
        self.f_max = f_max
        if data_parameters is not None:
            self.data_parameters = data_parameters
        else:
            self.data_parameters = { 'mel_mean': 0, 'mel_std': 1, 'motion_mean': 0, 'motion_std': 1 }
        random.seed(seed)
        random.shuffle(self.filepaths_and_text)

    def get_pair(self, filepath_and_text):
        filepath, text = filepath_and_text[0], filepath_and_text[1]
        text = self.get_text(text, add_blank=self.add_blank)
        mel = self.get_mel(filepath)
        motion = self.get_motion(filepath, mel.shape[1])
        return (text, mel, motion)

    def get_motion(self, filename, mel_shape, ext=".expmap_86.1328125fps.pkl"):
        file_loc = self.motion_fileloc / Path(Path(filename).name).with_suffix(ext)
        motion = torch.from_numpy(pd.read_pickle(file_loc).to_numpy())
        motion = F.interpolate(motion.T.unsqueeze(0), mel_shape).squeeze(0)
        motion = normalize(motion, self.data_parameters['motion_mean'], self.data_parameters['motion_std'])
        return motion

    def get_mel(self, filepath):
        audio, sr = ta.load(filepath)
        assert sr == self.sample_rate
        mel = mel_spectrogram(audio, self.n_fft, 80, self.sample_rate, self.hop_length,
                              self.win_length, self.f_min, self.f_max, center=False).squeeze()
        mel = normalize(mel, self.data_parameters['mel_mean'], self.data_parameters['mel_std'])
        return mel

    def get_text(self, text, add_blank=True):
        text_norm = text_to_sequence(text, dictionary=self.cmudict)
        if self.add_blank:
            text_norm = intersperse(text_norm, len(symbols))  # add a blank token, whose id number is len(symbols)
        text_norm = torch.IntTensor(text_norm)
        return text_norm

    def __getitem__(self, index):
        text, mel, motion = self.get_pair(self.filepaths_and_text[index])
        item = {'y': mel, 'x': text, 'y_motion': motion}
        return item

    def __len__(self):
        return len(self.filepaths_and_text)

    def sample_test_batch(self, size):
        idx = np.random.choice(range(len(self)), size=size, replace=False)
        test_batch = []
        for index in idx:
            test_batch.append(self.__getitem__(index))
        return test_batch


class TextMelBatchCollate(object):
    def __call__(self, batch):
        B = len(batch)
        y_max_length = max([item['y'].shape[-1] for item in batch])
        y_max_length = fix_len_compatibility(y_max_length)
        x_max_length = max([item['x'].shape[-1] for item in batch])
        n_feats = batch[0]['y'].shape[-2]
        n_motion = batch[0]['y_motion'].shape[-2]

        y = torch.zeros((B, n_feats, y_max_length), dtype=torch.float32)
        x = torch.zeros((B, x_max_length), dtype=torch.long)
        y_motion = torch.zeros((B, n_motion, y_max_length), dtype=torch.float32)
        y_lengths, x_lengths = [], []

        for i, item in enumerate(batch):
            y_, x_, y_motion_ = item['y'], item['x'], item['y_motion']
            y_lengths.append(y_.shape[-1])
            x_lengths.append(x_.shape[-1])
            y[i, :, :y_.shape[-1]] = y_
            x[i, :x_.shape[-1]] = x_
            y_motion[i, :, :y_motion_.shape[-1]] = y_motion_

        y_lengths = torch.LongTensor(y_lengths)
        x_lengths = torch.LongTensor(x_lengths)
        return {'x': x, 'x_lengths': x_lengths, 'y': y, 'y_lengths': y_lengths, 'y_motion': y_motion}
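As a quick sketch of what TextMelBatchCollate yields when paired with a DataLoader (the filelist and motion-folder paths below are hypothetical, not files in this commit):

# Sketch only: illustrates the padded batch layout produced above; paths are placeholders.
dataset = TextMelDataset("data/train_filelist.txt",
                         "diff_ttsg/resources/cmu_dictionary",
                         "data/motion")
loader = DataLoader(dataset, batch_size=4, collate_fn=TextMelBatchCollate())
batch = next(iter(loader))
# batch['x']        : (4, max_text_len)           zero-padded phoneme IDs
# batch['y']        : (4, 80, max_mel_len)        normalised mel frames, padded to fix_len_compatibility
# batch['y_motion'] : (4, n_motion, max_mel_len)  motion features interpolated to the mel length
# batch['x_lengths'] / batch['y_lengths'] keep the unpadded length of each item.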
diff_ttsg/data/mnist_datamodule.py
ADDED
@@ -0,0 +1,130 @@
from typing import Any, Dict, Optional, Tuple

import torch
from lightning import LightningDataModule
from torch.utils.data import ConcatDataset, DataLoader, Dataset, random_split
from torchvision.datasets import MNIST
from torchvision.transforms import transforms


class MNISTDataModule(LightningDataModule):
    """Example of LightningDataModule for MNIST dataset.

    A DataModule implements 6 key methods:
        def prepare_data(self):
            # things to do on 1 GPU/TPU (not on every GPU/TPU in DDP)
            # download data, pre-process, split, save to disk, etc...
        def setup(self, stage):
            # things to do on every process in DDP
            # load data, set variables, etc...
        def train_dataloader(self):
            # return train dataloader
        def val_dataloader(self):
            # return validation dataloader
        def test_dataloader(self):
            # return test dataloader
        def teardown(self):
            # called on every process in DDP
            # clean up after fit or test

    This allows you to share a full dataset without explaining how to download,
    split, transform and process the data.

    Read the docs:
        https://lightning.ai/docs/pytorch/latest/data/datamodule.html
    """

    def __init__(
        self,
        data_dir: str = "data/",
        train_val_test_split: Tuple[int, int, int] = (55_000, 5_000, 10_000),
        batch_size: int = 64,
        num_workers: int = 0,
        pin_memory: bool = False,
    ):
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        # data transformations
        self.transforms = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

        self.data_train: Optional[Dataset] = None
        self.data_val: Optional[Dataset] = None
        self.data_test: Optional[Dataset] = None

    @property
    def num_classes(self):
        return 10

    def prepare_data(self):
        """Download data if needed.

        Do not use it to assign state (self.x = y).
        """
        MNIST(self.hparams.data_dir, train=True, download=True)
        MNIST(self.hparams.data_dir, train=False, download=True)

    def setup(self, stage: Optional[str] = None):
        """Load data. Set variables: `self.data_train`, `self.data_val`, `self.data_test`.

        This method is called by lightning with both `trainer.fit()` and `trainer.test()`, so be
        careful not to execute things like random split twice!
        """
        # load and split datasets only if not loaded already
        if not self.data_train and not self.data_val and not self.data_test:
            trainset = MNIST(self.hparams.data_dir, train=True, transform=self.transforms)
            testset = MNIST(self.hparams.data_dir, train=False, transform=self.transforms)
            dataset = ConcatDataset(datasets=[trainset, testset])
            self.data_train, self.data_val, self.data_test = random_split(
                dataset=dataset,
                lengths=self.hparams.train_val_test_split,
                generator=torch.Generator().manual_seed(42),
            )

    def train_dataloader(self):
        return DataLoader(
            dataset=self.data_train,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            shuffle=True,
        )

    def val_dataloader(self):
        return DataLoader(
            dataset=self.data_val,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            shuffle=False,
        )

    def test_dataloader(self):
        return DataLoader(
            dataset=self.data_test,
            batch_size=self.hparams.batch_size,
            num_workers=self.hparams.num_workers,
            pin_memory=self.hparams.pin_memory,
            shuffle=False,
        )

    def teardown(self, stage: Optional[str] = None):
        """Clean up after fit or test."""
        pass

    def state_dict(self):
        """Extra things to save to checkpoint."""
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Things to do when loading checkpoint."""
        pass


if __name__ == "__main__":
    _ = MNISTDataModule()
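A short sketch (standard Lightning usage, not part of the commit) of exercising this datamodule by hand:

# Sketch only: manual use of MNISTDataModule without a Trainer.
dm = MNISTDataModule(data_dir="data/", batch_size=64)
dm.prepare_data()     # downloads MNIST if it is not already in data/
dm.setup()            # builds the 55k/5k/10k random split
images, labels = next(iter(dm.train_dataloader()))
# images: (64, 1, 28, 28) normalised tensors, labels: (64,) digit classes 0-9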
diff_ttsg/eval.py
ADDED
@@ -0,0 +1,93 @@
from typing import List, Tuple

import hydra
import pyrootutils
from lightning import LightningDataModule, LightningModule, Trainer
from lightning.pytorch.loggers import Logger
from omegaconf import DictConfig

pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
# ------------------------------------------------------------------------------------ #
# the setup_root above is equivalent to:
# - adding project root dir to PYTHONPATH
#       (so you don't need to force user to install project as a package)
#       (necessary before importing any local modules e.g. `from src import utils`)
# - setting up PROJECT_ROOT environment variable
#       (which is used as a base for paths in "configs/paths/default.yaml")
#       (this way all filepaths are the same no matter where you run the code)
# - loading environment variables from ".env" in root dir
#
# you can remove it if you:
# 1. either install project as a package or move entry files to project root dir
# 2. set `root_dir` to "." in "configs/paths/default.yaml"
#
# more info: https://github.com/ashleve/pyrootutils
# ------------------------------------------------------------------------------------ #

from diff_ttsg import utils

log = utils.get_pylogger(__name__)


@utils.task_wrapper
def evaluate(cfg: DictConfig) -> Tuple[dict, dict]:
    """Evaluates given checkpoint on a datamodule testset.

    This method is wrapped in optional @task_wrapper decorator, that controls the behavior during
    failure. Useful for multiruns, saving info about the crash, etc.

    Args:
        cfg (DictConfig): Configuration composed by Hydra.

    Returns:
        Tuple[dict, dict]: Dict with metrics and dict with all instantiated objects.
    """

    assert cfg.ckpt_path

    log.info(f"Instantiating datamodule <{cfg.data._target_}>")
    datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data)

    log.info(f"Instantiating model <{cfg.model._target_}>")
    model: LightningModule = hydra.utils.instantiate(cfg.model)

    log.info("Instantiating loggers...")
    logger: List[Logger] = utils.instantiate_loggers(cfg.get("logger"))

    log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
    trainer: Trainer = hydra.utils.instantiate(cfg.trainer, logger=logger)

    object_dict = {
        "cfg": cfg,
        "datamodule": datamodule,
        "model": model,
        "logger": logger,
        "trainer": trainer,
    }

    if logger:
        log.info("Logging hyperparameters!")
        utils.log_hyperparameters(object_dict)

    log.info("Starting testing!")
    trainer.test(model=model, datamodule=datamodule, ckpt_path=cfg.ckpt_path)

    # for predictions use trainer.predict(...)
    # predictions = trainer.predict(model=model, dataloaders=dataloaders, ckpt_path=cfg.ckpt_path)

    metric_dict = trainer.callback_metrics

    return metric_dict, object_dict


@hydra.main(version_base="1.3", config_path="../configs", config_name="eval.yaml")
def main(cfg: DictConfig) -> None:
    # apply extra utilities
    # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.)
    utils.extras(cfg)

    evaluate(cfg)


if __name__ == "__main__":
    main()
diff_ttsg/hifigan/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Jungil Kong

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
diff_ttsg/hifigan/README.md
ADDED
@@ -0,0 +1,105 @@
# HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis

### Jungil Kong, Jaehyeon Kim, Jaekyoung Bae

In our [paper](https://arxiv.org/abs/2010.05646),
we proposed HiFi-GAN: a GAN-based model capable of generating high fidelity speech efficiently.<br/>
We provide our implementation and pretrained models as open source in this repository.

**Abstract :**
Several recent work on speech synthesis have employed generative adversarial networks (GANs) to produce raw waveforms.
Although such methods improve the sampling efficiency and memory usage,
their sample quality has not yet reached that of autoregressive and flow-based generative models.
In this work, we propose HiFi-GAN, which achieves both efficient and high-fidelity speech synthesis.
As speech audio consists of sinusoidal signals with various periods,
we demonstrate that modeling periodic patterns of an audio is crucial for enhancing sample quality.
A subjective human evaluation (mean opinion score, MOS) of a single speaker dataset indicates that our proposed method
demonstrates similarity to human quality while generating 22.05 kHz high-fidelity audio 167.9 times faster than
real-time on a single V100 GPU. We further show the generality of HiFi-GAN to the mel-spectrogram inversion of unseen
speakers and end-to-end speech synthesis. Finally, a small footprint version of HiFi-GAN generates samples 13.4 times
faster than real-time on CPU with comparable quality to an autoregressive counterpart.

Visit our [demo website](https://jik876.github.io/hifi-gan-demo/) for audio samples.


## Pre-requisites
1. Python >= 3.6
2. Clone this repository.
3. Install python requirements. Please refer [requirements.txt](requirements.txt)
4. Download and extract the [LJ Speech dataset](https://keithito.com/LJ-Speech-Dataset/).
And move all wav files to `LJSpeech-1.1/wavs`


## Training
```
python train.py --config config_v1.json
```
To train V2 or V3 Generator, replace `config_v1.json` with `config_v2.json` or `config_v3.json`.<br>
Checkpoints and copy of the configuration file are saved in `cp_hifigan` directory by default.<br>
You can change the path by adding `--checkpoint_path` option.

Validation loss during training with V1 generator.<br>
![validation loss](./validation_loss.png)

## Pretrained Model
You can also use pretrained models we provide.<br/>
[Download pretrained models](https://drive.google.com/drive/folders/1-eEYTB5Av9jNql0WGBlRoi-WH2J7bp5Y?usp=sharing)<br/>
Details of each folder are as in follows:

|Folder Name|Generator|Dataset|Fine-Tuned|
|------|---|---|---|
|LJ_V1|V1|LJSpeech|No|
|LJ_V2|V2|LJSpeech|No|
|LJ_V3|V3|LJSpeech|No|
|LJ_FT_T2_V1|V1|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|LJ_FT_T2_V2|V2|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|LJ_FT_T2_V3|V3|LJSpeech|Yes ([Tacotron2](https://github.com/NVIDIA/tacotron2))|
|VCTK_V1|V1|VCTK|No|
|VCTK_V2|V2|VCTK|No|
|VCTK_V3|V3|VCTK|No|
|UNIVERSAL_V1|V1|Universal|No|

We provide the universal model with discriminator weights that can be used as a base for transfer learning to other datasets.

## Fine-Tuning
1. Generate mel-spectrograms in numpy format using [Tacotron2](https://github.com/NVIDIA/tacotron2) with teacher-forcing.<br/>
The file name of the generated mel-spectrogram should match the audio file and the extension should be `.npy`.<br/>
Example:
```
Audio File : LJ001-0001.wav
Mel-Spectrogram File : LJ001-0001.npy
```
2. Create `ft_dataset` folder and copy the generated mel-spectrogram files into it.<br/>
3. Run the following command.
```
python train.py --fine_tuning True --config config_v1.json
```
For other command line options, please refer to the training section.


## Inference from wav file
1. Make `test_files` directory and copy wav files into the directory.
2. Run the following command.
```
python inference.py --checkpoint_file [generator checkpoint file path]
```
Generated wav files are saved in `generated_files` by default.<br>
You can change the path by adding `--output_dir` option.


## Inference for end-to-end speech synthesis
1. Make `test_mel_files` directory and copy generated mel-spectrogram files into the directory.<br>
You can generate mel-spectrograms using [Tacotron2](https://github.com/NVIDIA/tacotron2),
[Glow-TTS](https://github.com/jaywalnut310/glow-tts) and so forth.
2. Run the following command.
```
python inference_e2e.py --checkpoint_file [generator checkpoint file path]
```
Generated wav files are saved in `generated_files_from_mel` by default.<br>
You can change the path by adding `--output_dir` option.


## Acknowledgements
We referred to [WaveGlow](https://github.com/NVIDIA/waveglow), [MelGAN](https://github.com/descriptinc/melgan-neurips)
and [Tacotron2](https://github.com/NVIDIA/tacotron2) to implement this.
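In this Space the pretrained generator is loaded by `load_vocoder` in `app.py` above; a condensed sketch of the same mel-to-waveform inversion (the checkpoint name is the one referenced there, the mel tensor is a stand-in):

# Sketch only: mirrors load_vocoder / to_waveform from app.py; mel here is a dummy placeholder.
import torch
from diff_ttsg.hifigan.config import v1
from diff_ttsg.hifigan.env import AttrDict
from diff_ttsg.hifigan.models import Generator

h = AttrDict(v1)
gen = Generator(h)
gen.load_state_dict(torch.load("g_02500000", map_location="cpu")["generator"])
gen.eval()
gen.remove_weight_norm()
mel = torch.randn(1, 80, 100)      # placeholder; normally produced by the acoustic model
with torch.inference_mode():
    wav = gen(mel)                 # (1, 1, 100 * 256) samples at 22.05 kHz (upsample factor 8*8*2*2)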
diff_ttsg/hifigan/__init__.py
ADDED
File without changes
diff_ttsg/hifigan/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (160 Bytes).
diff_ttsg/hifigan/__pycache__/config.cpython-310.pyc
ADDED
Binary file (1.02 kB).
diff_ttsg/hifigan/__pycache__/denoiser.cpython-310.pyc
ADDED
Binary file (2.56 kB).
diff_ttsg/hifigan/__pycache__/env.cpython-310.pyc
ADDED
Binary file (883 Bytes).
diff_ttsg/hifigan/__pycache__/models.cpython-310.pyc
ADDED
Binary file (8.73 kB).
diff_ttsg/hifigan/__pycache__/xutils.cpython-310.pyc
ADDED
Binary file (2.1 kB).
diff_ttsg/hifigan/config.py
ADDED
@@ -0,0 +1,38 @@
v1 = {
    "resblock": "1",
    "num_gpus": 0,
    "batch_size": 16,
    "learning_rate": 0.0004,
    "adam_b1": 0.8,
    "adam_b2": 0.99,
    "lr_decay": 0.999,
    "seed": 1234,

    "upsample_rates": [8,8,2,2],
    "upsample_kernel_sizes": [16,16,4,4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3,7,11],
    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
    "resblock_initial_channel": 256,

    "segment_size": 8192,
    "num_mels": 80,
    "num_freq": 1025,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,

    "sampling_rate": 22050,

    "fmin": 0,
    "fmax": 8000,
    "fmax_loss": None,

    "num_workers": 4,

    "dist_config": {
        "dist_backend": "nccl",
        "dist_url": "tcp://localhost:54321",
        "world_size": 1
    }
}
diff_ttsg/hifigan/denoiser.py
ADDED
@@ -0,0 +1,64 @@
### Code modified from Rafael Valle's implementation https://github.com/NVIDIA/waveglow/blob/5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1/denoiser.py

"""Waveglow style denoiser can be used to remove the artifacts from the HiFiGAN generated audio."""
import torch


class Denoiser(torch.nn.Module):
    """Removes model bias from audio produced with waveglow"""

    def __init__(self, vocoder, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"):
        super().__init__()
        self.filter_length = filter_length
        self.hop_length = int(filter_length / n_overlap)
        self.win_length = win_length

        dtype, device = next(vocoder.parameters()).dtype, next(vocoder.parameters()).device
        self.device = device
        if mode == "zeros":
            mel_input = torch.zeros((1, 80, 88), dtype=dtype, device=device)
        elif mode == "normal":
            mel_input = torch.randn((1, 80, 88), dtype=dtype, device=device)
        else:
            raise Exception(f"Mode {mode} if not supported")

        def stft_fn(audio, n_fft, hop_length, win_length, window):
            spec = torch.stft(
                audio,
                n_fft=n_fft,
                hop_length=hop_length,
                win_length=win_length,
                window=window,
                return_complex=True,
            )
            spec = torch.view_as_real(spec)
            return torch.sqrt(spec.pow(2).sum(-1)), torch.atan2(spec[..., -1], spec[..., 0])

        self.stft = lambda x: stft_fn(
            audio=x,
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=torch.hann_window(self.win_length, device=device)
        )
        self.istft = lambda x, y: torch.istft(
            torch.complex(x * torch.cos(y), x * torch.sin(y)),
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=torch.hann_window(self.win_length, device=device),
        )

        with torch.no_grad():
            bias_audio = vocoder(mel_input).float().squeeze(0)
            bias_spec, _ = self.stft(bias_audio)

        self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None])

    @torch.inference_mode()
    def forward(self, audio, strength=0.0005):
        audio_spec, audio_angles = self.stft(audio)
        audio_spec_denoised = audio_spec - self.bias_spec.to(audio.device) * strength
        audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
        audio_denoised = self.istft(audio_spec_denoised, audio_angles)
        return audio_denoised
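This denoiser is applied to the raw vocoder output exactly as `to_waveform` in `app.py` does; continuing the generator sketch under the README above:

# Sketch only: wrap the HiFi-GAN generator and subtract its bias spectrum from the audio.
denoiser = Denoiser(gen, mode="zeros")
audio = gen(mel).clamp(-1, 1)                          # (1, 1, n_samples)
clean = denoiser(audio.squeeze(0), strength=0.0005)    # larger strength removes more hiss, and more signal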
diff_ttsg/hifigan/env.py
ADDED
@@ -0,0 +1,17 @@
""" from https://github.com/jik876/hifi-gan """

import os
import shutil


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
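AttrDict simply exposes dictionary keys as attributes, which is how the `v1` config above is consumed by the generator; a tiny sketch:

# Sketch only: key and attribute access on the v1 config are interchangeable.
from diff_ttsg.hifigan.config import v1
from diff_ttsg.hifigan.env import AttrDict

h = AttrDict(v1)
assert h.n_fft == h["n_fft"] == 1024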
diff_ttsg/hifigan/meldataset.py
ADDED
@@ -0,0 +1,171 @@
""" from https://github.com/jik876/hifi-gan """

import math
import os
import random

import numpy as np
import torch
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
from librosa.util import normalize
from scipy.io.wavfile import read

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    if fmax not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.view_as_real(torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                                         center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True))

    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
    spec = spectral_normalize_torch(spec)

    return spec


def get_dataset_filelist(a):
    with open(a.input_training_file, 'r', encoding='utf-8') as fi:
        training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
                          for x in fi.read().split('\n') if len(x) > 0]

    with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
        validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
                            for x in fi.read().split('\n') if len(x) > 0]
    return training_files, validation_files


class MelDataset(torch.utils.data.Dataset):
    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                                  center=False)
        else:
            mel = np.load(
                os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        return len(self.audio_files)
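A small sketch of computing a mel spectrogram with the function above using the v1 analysis settings (the wav path is a placeholder and is assumed to be 22050 Hz):

# Sketch only: wav -> 80-bin log-mel; "example.wav" is hypothetical.
import torch
from scipy.io.wavfile import read

sr, data = read("example.wav")
audio = torch.FloatTensor(data / MAX_WAV_VALUE).unsqueeze(0)   # (1, n_samples) scaled to [-1, 1]
mel = mel_spectrogram(audio, n_fft=1024, num_mels=80, sampling_rate=22050,
                      hop_size=256, win_size=1024, fmin=0, fmax=8000, center=False)
# mel: (1, 80, n_frames) log-compressed mel magnitudes, roughly one frame per 256 samples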
diff_ttsg/hifigan/models.py
ADDED
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" from https://github.com/jik876/hifi-gan """

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm

from .xutils import get_padding, init_weights

LRELU_SLOPE = 0.1


class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
                                k, u, padding=(k-u)//2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel//(2**(i+1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorP(2),
            DiscriminatorP(3),
            DiscriminatorP(5),
            DiscriminatorP(7),
            DiscriminatorP(11),
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList([
            AvgPool1d(4, 2, padding=2),
            AvgPool1d(4, 2, padding=2)
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                y = self.meanpools[i-1](y)
                y_hat = self.meanpools[i-1](y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss*2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1-dr)**2)
        g_loss = torch.mean(dg**2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = torch.mean((1-dg)**2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses
diff_ttsg/hifigan/xutils.py
ADDED
@@ -0,0 +1,60 @@
""" from https://github.com/jik876/hifi-gan """

import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")
import matplotlib.pylab as plt


def plot_spectrogram(spectrogram):
    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)

    fig.canvas.draw()
    plt.close()

    return fig


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def apply_weight_norm(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        weight_norm(m)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size*dilation - dilation)/2)


def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict


def save_checkpoint(filepath, obj):
    print("Saving checkpoint to {}".format(filepath))
    torch.save(obj, filepath)
    print("Complete.")


def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
    return sorted(cp_list)[-1]
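A short sketch of how scan_checkpoint and load_checkpoint are meant to be combined when restoring a vocoder. The directory, the "g_" prefix, and the "generator" state-dict key are assumptions based on the upstream HiFi-GAN training layout, not paths taken from this repository:

import torch
from diff_ttsg.hifigan.xutils import load_checkpoint, scan_checkpoint

cp_g = scan_checkpoint("checkpoints/hifigan", "g_")     # newest file matching g_????????
if cp_g is not None:
    state = load_checkpoint(cp_g, torch.device("cpu"))
    # upstream HiFi-GAN checkpoints typically keep the generator weights under "generator"
    generator_state_dict = state["generator"]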
diff_ttsg/models/__init__.py
ADDED
File without changes
diff_ttsg/models/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (159 Bytes)
diff_ttsg/models/__pycache__/diff_ttsg.cpython-310.pyc
ADDED
Binary file (11.2 kB)
diff_ttsg/models/components/__init__.py
ADDED
File without changes
diff_ttsg/models/components/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (170 Bytes)
diff_ttsg/models/components/__pycache__/diffusion.cpython-310.pyc
ADDED
Binary file (12.6 kB)
diff_ttsg/models/components/__pycache__/text_encoder.cpython-310.pyc
ADDED
Binary file (12.3 kB)
diff_ttsg/models/components/__pycache__/transformer.cpython-310.pyc
ADDED
Binary file (6.03 kB)
diff_ttsg/models/components/diffusion.py
ADDED
@@ -0,0 +1,376 @@
# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the MIT License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# MIT License for more details.

import math

import torch
from diffusers import UNet1DModel
from einops import pack, rearrange


class Mish(torch.nn.Module):
    def forward(self, x):
        return x * torch.tanh(torch.nn.functional.softplus(x))


class Upsample(torch.nn.Module):
    def __init__(self, dim):
        super(Upsample, self).__init__()
        self.conv = torch.nn.ConvTranspose2d(dim, dim, 4, 2, 1)

    def forward(self, x):
        return self.conv(x)


class Downsample(torch.nn.Module):
    def __init__(self, dim):
        super(Downsample, self).__init__()
        self.conv = torch.nn.Conv2d(dim, dim, 3, 2, 1)

    def forward(self, x):
        return self.conv(x)


class Rezero(torch.nn.Module):
    def __init__(self, fn):
        super(Rezero, self).__init__()
        self.fn = fn
        self.g = torch.nn.Parameter(torch.zeros(1))

    def forward(self, x):
        return self.fn(x) * self.g


class Block(torch.nn.Module):
    def __init__(self, dim, dim_out, groups=8):
        super(Block, self).__init__()
        self.block = torch.nn.Sequential(torch.nn.Conv2d(dim, dim_out, 3,
                                         padding=1), torch.nn.GroupNorm(
                                         groups, dim_out), Mish())

    def forward(self, x, mask):
        output = self.block(x * mask)
        return output * mask


class ResnetBlock(torch.nn.Module):
    def __init__(self, dim, dim_out, time_emb_dim, groups=8):
        super(ResnetBlock, self).__init__()
        self.mlp = torch.nn.Sequential(Mish(), torch.nn.Linear(time_emb_dim,
                                                               dim_out))

        self.block1 = Block(dim, dim_out, groups=groups)
        self.block2 = Block(dim_out, dim_out, groups=groups)
        if dim != dim_out:
            self.res_conv = torch.nn.Conv2d(dim, dim_out, 1)
        else:
            self.res_conv = torch.nn.Identity()

    def forward(self, x, mask, time_emb):
        h = self.block1(x, mask)
        h += self.mlp(time_emb).unsqueeze(-1).unsqueeze(-1)
        h = self.block2(h, mask)
        output = h + self.res_conv(x * mask)
        return output


class LinearAttention(torch.nn.Module):
    def __init__(self, dim, heads=4, dim_head=32):
        super(LinearAttention, self).__init__()
        self.heads = heads
        hidden_dim = dim_head * heads
        self.to_qkv = torch.nn.Conv2d(dim, hidden_dim * 3, 1, bias=False)
        self.to_out = torch.nn.Conv2d(hidden_dim, dim, 1)

    def forward(self, x):
        b, c, h, w = x.shape
        qkv = self.to_qkv(x)
        q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)',
                            heads=self.heads, qkv=3)
        k = k.softmax(dim=-1)
        context = torch.einsum('bhdn,bhen->bhde', k, v)
        out = torch.einsum('bhde,bhdn->bhen', context, q)
        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w',
                        heads=self.heads, h=h, w=w)
        return self.to_out(out)


class Residual(torch.nn.Module):
    def __init__(self, fn):
        super(Residual, self).__init__()
        self.fn = fn

    def forward(self, x, *args, **kwargs):
        output = self.fn(x, *args, **kwargs) + x
        return output


class UNet1DDiffuser(torch.nn.Module):
    def __init__(self, in_channels=90, out_channels=45, block_out_channels=(256, 512)):
        super(UNet1DDiffuser, self).__init__()

        self.unet = UNet1DModel(
            in_channels=in_channels,
            out_channels=out_channels,
            down_block_types=("DownBlock1DNoSkip", "AttnDownBlock1D"),
            up_block_types=("AttnUpBlock1D", "UpBlock1DNoSkip"),
            mid_block_type="UNetMidBlock1D",
            block_out_channels=block_out_channels,
            use_timestep_embedding=True,
        )

    def forward(self, x, mask, mu, t, spk=None):
        x = pack([x, mu], "b * t")[0]

        return self.unet(x, t).sample * mask


class SinusoidalPosEmb(torch.nn.Module):
    def __init__(self, dim):
        super(SinusoidalPosEmb, self).__init__()
        self.dim = dim

    def forward(self, x, scale=1000):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class GradLogPEstimator2d(torch.nn.Module):
    def __init__(self, dim, dim_mults=(1, 2, 4), groups=8,
                 n_spks=None, spk_emb_dim=64, n_feats=80, pe_scale=1000):
        super(GradLogPEstimator2d, self).__init__()
        self.dim = dim
        self.dim_mults = dim_mults
        self.groups = groups
        self.n_spks = n_spks if not isinstance(n_spks, type(None)) else 1
        self.spk_emb_dim = spk_emb_dim
        self.pe_scale = pe_scale

        if n_spks > 1:
            self.spk_mlp = torch.nn.Sequential(torch.nn.Linear(spk_emb_dim, spk_emb_dim * 4), Mish(),
                                               torch.nn.Linear(spk_emb_dim * 4, n_feats))
        self.time_pos_emb = SinusoidalPosEmb(dim)
        self.mlp = torch.nn.Sequential(torch.nn.Linear(dim, dim * 4), Mish(),
                                       torch.nn.Linear(dim * 4, dim))

        dims = [2 + (1 if n_spks > 1 else 0), *map(lambda m: dim * m, dim_mults)]
        in_out = list(zip(dims[:-1], dims[1:]))
        self.downs = torch.nn.ModuleList([])
        self.ups = torch.nn.ModuleList([])
        num_resolutions = len(in_out)

        for ind, (dim_in, dim_out) in enumerate(in_out):
            is_last = ind >= (num_resolutions - 1)
            self.downs.append(torch.nn.ModuleList([
                ResnetBlock(dim_in, dim_out, time_emb_dim=dim),
                ResnetBlock(dim_out, dim_out, time_emb_dim=dim),
                Residual(Rezero(LinearAttention(dim_out))),
                Downsample(dim_out) if not is_last else torch.nn.Identity()]))

        mid_dim = dims[-1]
        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)
        self.mid_attn = Residual(Rezero(LinearAttention(mid_dim)))
        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, time_emb_dim=dim)

        for ind, (dim_in, dim_out) in enumerate(reversed(in_out[1:])):
            self.ups.append(torch.nn.ModuleList([
                ResnetBlock(dim_out * 2, dim_in, time_emb_dim=dim),
                ResnetBlock(dim_in, dim_in, time_emb_dim=dim),
                Residual(Rezero(LinearAttention(dim_in))),
                Upsample(dim_in)]))
        self.final_block = Block(dim, dim)
        self.final_conv = torch.nn.Conv2d(dim, 1, 1)

    def forward(self, x, mask, mu, t, spk=None):
        if not isinstance(spk, type(None)):
            s = self.spk_mlp(spk)

        t = self.time_pos_emb(t, scale=self.pe_scale)
        t = self.mlp(t)

        if self.n_spks < 2:
            x = torch.stack([mu, x], 1)
        else:
            s = s.unsqueeze(-1).repeat(1, 1, x.shape[-1])
            x = torch.stack([mu, x, s], 1)
        mask = mask.unsqueeze(1)

        hiddens = []
        masks = [mask]
        for resnet1, resnet2, attn, downsample in self.downs:
            mask_down = masks[-1]
            x = resnet1(x, mask_down, t)
            x = resnet2(x, mask_down, t)
            x = attn(x)
            hiddens.append(x)
            x = downsample(x * mask_down)
            masks.append(mask_down[:, :, :, ::2])
        masks = masks[:-1]
        mask_mid = masks[-1]
        x = self.mid_block1(x, mask_mid, t)
        x = self.mid_attn(x)
        x = self.mid_block2(x, mask_mid, t)

        for resnet1, resnet2, attn, upsample in self.ups:
            mask_up = masks.pop()
            x = torch.cat((x, hiddens.pop()), dim=1)
            x = resnet1(x, mask_up, t)
            x = resnet2(x, mask_up, t)
            x = attn(x)
            x = upsample(x * mask_up)

        x = self.final_block(x, mask)
        output = self.final_conv(x * mask)

        return (output * mask).squeeze(1)


def get_noise(t, beta_init, beta_term, cumulative=False):
    if cumulative:
        noise = beta_init*t + 0.5*(beta_term - beta_init)*(t**2)
    else:
        noise = beta_init + (beta_term - beta_init)*t
    return noise


class Diffusion(torch.nn.Module):
    def __init__(self, n_feats, dim,
                 n_spks=1, spk_emb_dim=64,
                 beta_min=0.05, beta_max=20, pe_scale=1000):
        super(Diffusion, self).__init__()
        self.n_feats = n_feats
        self.dim = dim
        self.n_spks = n_spks
        self.spk_emb_dim = spk_emb_dim
        self.beta_min = beta_min
        self.beta_max = beta_max
        self.pe_scale = pe_scale

        self.estimator = GradLogPEstimator2d(dim, n_spks=n_spks,
                                             spk_emb_dim=spk_emb_dim,
                                             pe_scale=pe_scale)

    def forward_diffusion(self, x0, mask, mu, t):
        time = t.unsqueeze(-1).unsqueeze(-1)
        cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
        mean = x0*torch.exp(-0.5*cum_noise) + mu*(1.0 - torch.exp(-0.5*cum_noise))
        variance = 1.0 - torch.exp(-cum_noise)
        z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device,
                        requires_grad=False)
        xt = mean + z * torch.sqrt(variance)
        return xt * mask, z * mask

    @torch.no_grad()
    def reverse_diffusion(self, z, mask, mu, n_timesteps, stoc=False, spk=None):
        h = 1.0 / n_timesteps
        xt = z * mask
        for i in range(n_timesteps):
            t = (1.0 - (i + 0.5)*h) * torch.ones(z.shape[0], dtype=z.dtype,
                                                 device=z.device)
            time = t.unsqueeze(-1).unsqueeze(-1)
            noise_t = get_noise(time, self.beta_min, self.beta_max,
                                cumulative=False)
            if stoc:  # adds stochastic term
                dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk)
                dxt_det = dxt_det * noise_t * h
                dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
                                       requires_grad=False)
                dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
                dxt = dxt_det + dxt_stoc
            else:
                dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk))
                dxt = dxt * noise_t * h
            xt = (xt - dxt) * mask
        return xt

    @torch.no_grad()
    def forward(self, z, mask, mu, n_timesteps, stoc=False, spk=None):
        return self.reverse_diffusion(z, mask, mu, n_timesteps, stoc, spk)

    def loss_t(self, x0, mask, mu, t, spk=None):
        xt, z = self.forward_diffusion(x0, mask, mu, t)
        time = t.unsqueeze(-1).unsqueeze(-1)  # t = [0.6215, 0.0191, 0.0391]
        cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
        noise_estimation = self.estimator(xt, mask, mu, t, spk)  # xt = [3, 80, 172], mask = [3, 1, 172], mu = [3, 80, 172], t = [3]
        noise_estimation *= torch.sqrt(1.0 - torch.exp(-cum_noise))
        loss = torch.sum((noise_estimation + z)**2) / (torch.sum(mask)*self.n_feats)
        return loss, xt

    def compute_loss(self, x0, mask, mu, spk=None, offset=1e-5):
        t = torch.rand(x0.shape[0], dtype=x0.dtype, device=x0.device,
                       requires_grad=False)
        t = torch.clamp(t, offset, 1.0 - offset)
        return self.loss_t(x0, mask, mu, t, spk)


class Diffusion_Motion(torch.nn.Module):
    def __init__(self, in_channels, motion_decoder_channels=(256, 256), beta_min=0.05, beta_max=20):
        super(Diffusion_Motion, self).__init__()
        self.in_channels = in_channels
        self.beta_min = beta_min
        self.beta_max = beta_max

        self.estimator = UNet1DDiffuser(block_out_channels=motion_decoder_channels)

    def forward_diffusion(self, x0, mask, mu, t):
        time = t.unsqueeze(-1).unsqueeze(-1)
        cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
        mean = x0*torch.exp(-0.5*cum_noise) + mu*(1.0 - torch.exp(-0.5*cum_noise))
        variance = 1.0 - torch.exp(-cum_noise)
        z = torch.randn(x0.shape, dtype=x0.dtype, device=x0.device,
                        requires_grad=False)
        xt = mean + z * torch.sqrt(variance)
        return xt * mask, z * mask

    @torch.no_grad()
    def reverse_diffusion(self, z, mask, mu, n_timesteps, stoc=False, spk=None):
        h = 1.0 / n_timesteps
        xt = z * mask
        for i in range(n_timesteps):
            t = (1.0 - (i + 0.5)*h) * torch.ones(z.shape[0], dtype=z.dtype,
                                                 device=z.device)
            time = t.unsqueeze(-1).unsqueeze(-1)
            noise_t = get_noise(time, self.beta_min, self.beta_max,
                                cumulative=False)
            if stoc:  # adds stochastic term
                dxt_det = 0.5 * (mu - xt) - self.estimator(xt, mask, mu, t, spk)
                dxt_det = dxt_det * noise_t * h
                dxt_stoc = torch.randn(z.shape, dtype=z.dtype, device=z.device,
                                       requires_grad=False)
                dxt_stoc = dxt_stoc * torch.sqrt(noise_t * h)
                dxt = dxt_det + dxt_stoc
            else:
                dxt = 0.5 * (mu - xt - self.estimator(xt, mask, mu, t, spk))
                dxt = dxt * noise_t * h
            xt = (xt - dxt) * mask
        return xt

    @torch.no_grad()
    def forward(self, z, mask, mu, n_timesteps, stoc=False, spk=None):
        return self.reverse_diffusion(z, mask, mu, n_timesteps, stoc, spk)

    def loss_t(self, x0, mask, mu, t, spk=None):
        xt, z = self.forward_diffusion(x0, mask, mu, t)
        time = t.unsqueeze(-1).unsqueeze(-1)  # t = [0.6215, 0.0191, 0.0391]
        cum_noise = get_noise(time, self.beta_min, self.beta_max, cumulative=True)
        noise_estimation = self.estimator(xt, mask, mu, t, spk)  # xt = [3, 80, 172], mask = [3, 1, 172], mu = [3, 80, 172], t = [3]
        noise_estimation *= torch.sqrt(1.0 - torch.exp(-cum_noise))
        loss = torch.sum((noise_estimation + z)**2) / (torch.sum(mask)*self.in_channels)
        return loss, xt

    def compute_loss(self, x0, mask, mu, spk=None, offset=1e-5):
        t = torch.rand(x0.shape[0], dtype=x0.dtype, device=x0.device,
                       requires_grad=False)
        t = torch.clamp(t, offset, 1.0 - offset)
        return self.loss_t(x0, mask, mu, t, spk)
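Diffusion and Diffusion_Motion share the same forward/reverse process: forward_diffusion perturbs x0 towards mu under the linear noise schedule beta(t) = beta_min + (beta_max - beta_min)·t (integrated in get_noise when cumulative=True), and reverse_diffusion integrates the reverse process over n_timesteps Euler steps, optionally with a stochastic term. A minimal sketch of sampling a mel-spectrogram with the decoder; mu is random here only to show the call signature and shapes, whereas in the real model it comes from the duration-upsampled text-encoder output:

import torch
from diff_ttsg.models.components.diffusion import Diffusion

decoder = Diffusion(n_feats=80, dim=64)      # dims are illustrative, not this repo's config
mu = torch.randn(1, 80, 172)                 # encoder output aligned to the output length
mask = torch.ones(1, 1, 172)                 # all frames valid (length divisible by 4)
z = mu + torch.randn_like(mu)                # terminal noise centred on mu, as in Grad-TTS
with torch.no_grad():
    mel = decoder(z, mask, mu, n_timesteps=50, stoc=False)   # (1, 80, 172)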
diff_ttsg/models/components/text_encoder.py
ADDED
@@ -0,0 +1,384 @@
""" from https://github.com/jaywalnut310/glow-tts """

import math

import torch
import torch.nn as nn
from conformer import ConformerBlock
from einops import rearrange

from diff_ttsg.models.components.transformer import FFTransformer
from diff_ttsg.utils.model import convert_pad_shape, sequence_mask


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-4):
        super(LayerNorm, self).__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = torch.nn.Parameter(torch.ones(channels))
        self.beta = torch.nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        n_dims = len(x.shape)
        mean = torch.mean(x, 1, keepdim=True)
        variance = torch.mean((x - mean)**2, 1, keepdim=True)

        x = (x - mean) * torch.rsqrt(variance + self.eps)

        shape = [1, -1] + [1] * (n_dims - 2)
        x = x * self.gamma.view(*shape) + self.beta.view(*shape)
        return x


class ConvReluNorm(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size,
                 n_layers, p_dropout):
        super(ConvReluNorm, self).__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.conv_layers = torch.nn.ModuleList()
        self.norm_layers = torch.nn.ModuleList()
        self.conv_layers.append(torch.nn.Conv1d(in_channels, hidden_channels,
                                                kernel_size, padding=kernel_size//2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(torch.nn.Conv1d(hidden_channels, hidden_channels,
                                                    kernel_size, padding=kernel_size//2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask


class DurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout):
        super(DurationPredictor, self).__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.p_dropout = p_dropout

        self.drop = torch.nn.Dropout(p_dropout)
        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels,
                                      kernel_size, padding=kernel_size//2)
        self.norm_1 = LayerNorm(filter_channels)
        self.conv_2 = torch.nn.Conv1d(filter_channels, filter_channels,
                                      kernel_size, padding=kernel_size//2)
        self.norm_2 = LayerNorm(filter_channels)
        self.proj = torch.nn.Conv1d(filter_channels, 1, 1)

    def forward(self, x, x_mask):
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)
        x = self.proj(x * x_mask)
        return x * x_mask


class MultiHeadAttention(nn.Module):
    def __init__(self, channels, out_channels, n_heads, window_size=None,
                 heads_share=True, p_dropout=0.0, proximal_bias=False,
                 proximal_init=False):
        super(MultiHeadAttention, self).__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.window_size = window_size
        self.heads_share = heads_share
        self.proximal_bias = proximal_bias
        self.p_dropout = p_dropout
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = torch.nn.Parameter(torch.randn(n_heads_rel,
                                                window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = torch.nn.Parameter(torch.randn(n_heads_rel,
                                                window_size * 2 + 1, self.k_channels) * rel_stddev)
        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
        self.drop = torch.nn.Dropout(p_dropout)

        torch.nn.init.xavier_uniform_(self.conv_q.weight)
        torch.nn.init.xavier_uniform_(self.conv_k.weight)
        if proximal_init:
            self.conv_k.weight.data.copy_(self.conv_q.weight.data)
            self.conv_k.bias.data.copy_(self.conv_q.bias.data)
        torch.nn.init.xavier_uniform_(self.conv_v.weight)

    def forward(self, x, c, attn_mask=None):
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.k_channels)
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query, key_relative_embeddings)
            rel_logits = self._relative_position_to_absolute_position(rel_logits)
            scores_local = rel_logits / math.sqrt(self.k_channels)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device,
                                                                    dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
        p_attn = torch.nn.functional.softmax(scores, dim=-1)
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights,
                                                                value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = torch.nn.functional.pad(
                relative_embeddings, convert_pad_shape([[0, 0],
                                                        [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:,
                                                              slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length-1]]))
        x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
        x_flat = x.view([batch, heads, length**2 + length*(length - 1)])
        x_flat = torch.nn.functional.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2*length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


class FFN(nn.Module):
    def __init__(self, in_channels, out_channels, filter_channels, kernel_size,
                 p_dropout=0.0):
        super(FFN, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size,
                                      padding=kernel_size//2)
        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size,
                                      padding=kernel_size//2)
        self.drop = torch.nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        return x * x_mask


class Encoder(nn.Module):
    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers,
                 kernel_size=1, p_dropout=0.0, window_size=None, **kwargs):
        super(Encoder, self).__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = torch.nn.Dropout(p_dropout)
        self.attn_layers = torch.nn.ModuleList()
        self.norm_layers_1 = torch.nn.ModuleList()
        self.ffn_layers = torch.nn.ModuleList()
        self.norm_layers_2 = torch.nn.ModuleList()
        for _ in range(self.n_layers):
            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels,
                                    n_heads, window_size=window_size, p_dropout=p_dropout))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(FFN(hidden_channels, hidden_channels,
                                   filter_channels, kernel_size, p_dropout=p_dropout))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        for i in range(self.n_layers):
            x = x * x_mask
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)
            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class TextEncoder(nn.Module):
    def __init__(self, n_vocab, n_feats, n_channels, filter_channels,
                 filter_channels_dp, n_heads, n_layers, kernel_size,
                 p_dropout, window_size=None, spk_emb_dim=64, n_spks=1, encoder_type=None):
        super(TextEncoder, self).__init__()
        self.n_vocab = n_vocab
        self.n_feats = n_feats
        self.n_channels = n_channels
        self.filter_channels = filter_channels
        self.filter_channels_dp = filter_channels_dp
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.spk_emb_dim = spk_emb_dim
        self.n_spks = n_spks

        self.emb = torch.nn.Embedding(n_vocab, n_channels)
        torch.nn.init.normal_(self.emb.weight, 0.0, n_channels**-0.5)

        self.prenet = ConvReluNorm(n_channels, n_channels, n_channels,
                                   kernel_size=5, n_layers=3, p_dropout=0.5)
        if encoder_type == "default":
            self.encoder = Encoder(n_channels + (spk_emb_dim if n_spks > 1 else 0), filter_channels, n_heads, n_layers,
                                   kernel_size, p_dropout, window_size=window_size)
        elif encoder_type == "myencoder":
            self.encoder = FFTransformer(
                n_layers, n_heads, n_channels + (spk_emb_dim if n_spks > 1 else 0), 64, 1024, kernel_size,
                p_dropout, p_dropout, rel_attention=False, rel_window_size=window_size
            )
        else:
            raise ValueError(f"Unknown encoder type: {encoder_type}")

        self.proj_m = torch.nn.Conv1d(n_channels + (spk_emb_dim if n_spks > 1 else 0), n_feats, 1)
        self.proj_w = DurationPredictor(n_channels + (spk_emb_dim if n_spks > 1 else 0), filter_channels_dp,
                                        kernel_size, p_dropout)

    def forward(self, x, x_lengths, spk=None):
        x = self.emb(x) * math.sqrt(self.n_channels)
        x = torch.transpose(x, 1, -1)
        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.prenet(x, x_mask)
        if self.n_spks > 1:
            x = torch.cat([x, spk.unsqueeze(-1).repeat(1, 1, x.shape[-1])], dim=1)
        x = self.encoder(x, x_mask)
        mu = self.proj_m(x) * x_mask

        x_dp = torch.detach(x)
        logw = self.proj_w(x_dp, x_mask)

        return mu, logw, x_mask


class MuMotionEncoder(nn.Module):
    def __init__(
        self,
        input_channels,
        output_channels,
        hidden_channels,
        d_head,
        n_layer,
        n_head,
        ff_mult,
        conv_expansion_factor,
        dropout,
        dropatt,
        dropconv,
        conv_kernel_size,
    ) -> None:
        super().__init__()

        self.in_projection = nn.Conv1d(input_channels, hidden_channels, 1)
        self.layers = nn.ModuleList()
        for _ in range(n_layer):
            self.layers.append(
                ConformerBlock(
                    dim=hidden_channels,
                    dim_head=d_head,
                    heads=n_head,
                    ff_mult=ff_mult,
                    conv_expansion_factor=conv_expansion_factor,
                    ff_dropout=dropout,
                    attn_dropout=dropatt,
                    conv_dropout=dropconv,
                    conv_kernel_size=conv_kernel_size,
                )
            )

        self.motion_projection = nn.Conv1d(hidden_channels, output_channels, 1)

    def forward(self, x, mask):
        x = self.in_projection(x)
        x = rearrange(x, "b c t -> b t c")
        mask = rearrange(mask, "b 1 t -> b (1 t)").bool()
        for layer in self.layers:
            x = layer(x, mask)
        x = rearrange(x, "b t c -> b c t")
        x = self.motion_projection(x)
        return x
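A minimal sketch of calling TextEncoder on a padded batch of symbol IDs. The hyperparameter values are illustrative (the real ones come from the training configs); mu is the frame-level prior per input token and logw the predicted log-durations:

import torch
from diff_ttsg.models.components.text_encoder import TextEncoder

encoder = TextEncoder(
    n_vocab=149, n_feats=80, n_channels=192, filter_channels=768,
    filter_channels_dp=256, n_heads=2, n_layers=6, kernel_size=3,
    p_dropout=0.1, window_size=4, encoder_type="default",
)
x = torch.randint(1, 149, (2, 40))         # two padded phoneme-ID sequences
x_lengths = torch.tensor([40, 35])         # true lengths before padding
mu, logw, x_mask = encoder(x, x_lengths)   # mu: (2, 80, 40), logw: (2, 1, 40), x_mask: (2, 1, 40)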
diff_ttsg/models/components/transformer.py
ADDED
@@ -0,0 +1,250 @@
1 |
+
# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
import torch
|
16 |
+
import torch.nn as nn
|
17 |
+
import torch.nn.functional as F
|
18 |
+
from conformer.conformer import Attention as RelAttention
|
19 |
+
from einops import rearrange
|
20 |
+
|
21 |
+
|
22 |
+
class PositionalEmbedding(nn.Module):
|
23 |
+
def __init__(self, demb):
|
24 |
+
super().__init__()
|
25 |
+
self.demb = demb
|
26 |
+
inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
|
27 |
+
self.register_buffer("inv_freq", inv_freq)
|
28 |
+
|
29 |
+
def forward(self, pos_seq, bsz=None):
|
30 |
+
sinusoid_inp = torch.matmul(torch.unsqueeze(pos_seq, -1), torch.unsqueeze(self.inv_freq, 0))
|
31 |
+
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=1)
|
32 |
+
if bsz is not None:
|
33 |
+
return pos_emb[None, :, :].expand(bsz, -1, -1)
|
34 |
+
else:
|
35 |
+
return pos_emb[None, :, :]
|
36 |
+
|
37 |
+
|
38 |
+
class PositionwiseConvFF(nn.Module):
|
39 |
+
def __init__(self, d_model, d_inner, kernel_size, dropout, pre_lnorm=False):
|
40 |
+
super().__init__()
|
41 |
+
|
42 |
+
self.d_model = d_model
|
43 |
+
self.d_inner = d_inner
|
44 |
+
self.dropout = dropout
|
45 |
+
|
46 |
+
self.CoreNet = nn.Sequential(
|
47 |
+
nn.Conv1d(d_model, d_inner, kernel_size, 1, (kernel_size // 2)),
|
48 |
+
nn.ReLU(),
|
49 |
+
# nn.Dropout(dropout), # worse convergence
|
50 |
+
nn.Conv1d(d_inner, d_model, kernel_size, 1, (kernel_size // 2)),
|
51 |
+
nn.Dropout(dropout),
|
52 |
+
)
|
53 |
+
self.layer_norm = nn.LayerNorm(d_model)
|
54 |
+
self.pre_lnorm = pre_lnorm
|
55 |
+
|
56 |
+
def forward(self, inp):
|
57 |
+
return self._forward(inp)
|
58 |
+
|
59 |
+
def _forward(self, inp):
|
60 |
+
if self.pre_lnorm:
|
61 |
+
# layer normalization + positionwise feed-forward
|
62 |
+
# core_out = inp
|
63 |
+
core_out = self.CoreNet(self.layer_norm(inp).transpose(1, 2))
|
64 |
+
core_out = core_out.transpose(1, 2)
|
65 |
+
|
66 |
+
# residual connection
|
67 |
+
output = core_out + inp
|
68 |
+
else:
|
69 |
+
# positionwise feed-forward
|
70 |
+
core_out = inp.transpose(1, 2)
|
71 |
+
core_out = self.CoreNet(core_out)
|
72 |
+
core_out = core_out.transpose(1, 2)
|
73 |
+
|
74 |
+
# residual connection + layer normalization
|
75 |
+
output = self.layer_norm(inp + core_out).to(inp.dtype)
|
76 |
+
|
77 |
+
return output
|
78 |
+
|
79 |
+
|
80 |
+
class MultiHeadAttn(nn.Module):
|
81 |
+
def __init__(
|
82 |
+
self, n_head, d_model, d_head, dropout, rel_attention, dropatt=0.1, pre_lnorm=True, rel_window_size=10
|
83 |
+
):
|
84 |
+
super().__init__()
|
85 |
+
|
86 |
+
self.n_head = n_head
|
87 |
+
self.d_model = d_model
|
88 |
+
self.d_head = d_head
|
89 |
+
self.scale = 1 / (d_head**0.5)
|
90 |
+
self.pre_lnorm = pre_lnorm
|
91 |
+
self.rel_attention = rel_attention
|
92 |
+
if rel_attention:
|
93 |
+
self.attn = RelAttention(d_model, n_head, d_head, dropout, max_pos_emb=rel_window_size)
|
94 |
+
else:
|
95 |
+
self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head)
|
96 |
+
self.drop = nn.Dropout(dropout)
|
97 |
+
self.dropatt = nn.Dropout(dropatt)
|
98 |
+
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
|
99 |
+
|
100 |
+
self.layer_norm = nn.LayerNorm(d_model)
|
101 |
+
|
102 |
+
def forward(self, inp, attn_mask=None):
|
103 |
+
return self._forward(inp, attn_mask)
|
104 |
+
|
105 |
+
def _forward(self, inp, attn_mask=None):
|
106 |
+
residual = inp
|
107 |
+
|
108 |
+
if self.pre_lnorm:
|
109 |
+
# layer normalization
|
110 |
+
inp = self.layer_norm(inp)
|
111 |
+
|
112 |
+
if not self.rel_attention:
|
113 |
+
n_head, d_head = self.n_head, self.d_head
|
114 |
+
|
115 |
+
head_q, head_k, head_v = torch.chunk(self.qkv_net(inp), 3, dim=2)
|
116 |
+
head_q = head_q.view(inp.size(0), inp.size(1), n_head, d_head)
|
117 |
+
head_k = head_k.view(inp.size(0), inp.size(1), n_head, d_head)
|
118 |
+
head_v = head_v.view(inp.size(0), inp.size(1), n_head, d_head)
|
119 |
+
|
120 |
+
q = head_q.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head)
|
121 |
+
k = head_k.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head)
|
122 |
+
v = head_v.permute(2, 0, 1, 3).reshape(-1, inp.size(1), d_head)
|
123 |
+
|
124 |
+
attn_score = torch.bmm(q, k.transpose(1, 2))
|
125 |
+
attn_score.mul_(self.scale)
|
126 |
+
|
127 |
+
if attn_mask is not None:
|
128 |
+
attn_mask = attn_mask.unsqueeze(1).to(attn_score.dtype)
|
129 |
+
attn_mask = attn_mask.repeat(n_head, attn_mask.size(2), 1)
|
130 |
+
                attn_score.masked_fill_(attn_mask.to(torch.bool), -float("inf"))

            attn_prob = F.softmax(attn_score, dim=2)
            attn_prob = self.dropatt(attn_prob)
            attn_vec = torch.bmm(attn_prob, v)

            attn_vec = attn_vec.view(n_head, inp.size(0), inp.size(1), d_head)
            attn_vec = attn_vec.permute(1, 2, 0, 3).contiguous().view(inp.size(0), inp.size(1), n_head * d_head)

            # linear projection
            attn_out = self.o_net(attn_vec)
            attn_out = self.drop(attn_out)
        else:
            attn_out = self.attn(inp, mask=attn_mask)

        if self.pre_lnorm:
            # residual connection
            output = residual + attn_out
        else:
            # residual connection + layer normalization
            output = self.layer_norm(residual + attn_out)

        output = output.to(attn_out.dtype)

        return output


class TransformerLayer(nn.Module):
    def __init__(self, n_head, d_model, d_head, d_inner, kernel_size, dropout, **kwargs):
        super().__init__()

        self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
        self.pos_ff = PositionwiseConvFF(d_model, d_inner, kernel_size, dropout, pre_lnorm=kwargs.get("pre_lnorm"))

    def forward(self, dec_inp, mask=None):
        output = self.dec_attn(dec_inp, attn_mask=~mask.squeeze(2))
        output *= mask
        output = self.pos_ff(output)
        output *= mask
        return output


class FFTransformer(nn.Module):
    def __init__(
        self,
        n_layer,
        n_head,
        hidden_channels,
        d_head,
        d_inner,
        kernel_size,
        dropout,
        dropatt,
        dropemb=0.0,
        embed_input=False,
        n_embed=None,
        d_embed=None,
        padding_idx=0,
        pre_lnorm=True,
        rel_attention=True,
        rel_window_size=10,
    ):
        super().__init__()
        self.d_model = hidden_channels
        self.n_head = n_head
        self.d_head = d_head
        self.padding_idx = padding_idx

        if embed_input:
            self.word_emb = nn.Embedding(n_embed, d_embed or hidden_channels, padding_idx=self.padding_idx)
        else:
            self.word_emb = None

        self.rel_attention = rel_attention

        if not rel_attention:
            self.pos_emb = PositionalEmbedding(self.d_model)

        self.drop = nn.Dropout(dropemb)
        self.layers = nn.ModuleList()

        for _ in range(n_layer):
            self.layers.append(
                TransformerLayer(
                    n_head,
                    hidden_channels,
                    d_head,
                    d_inner,
                    kernel_size,
                    dropout,
                    dropatt=dropatt,
                    pre_lnorm=pre_lnorm,
                    rel_attention=rel_attention,
                    rel_window_size=rel_window_size,
                )
            )

    def forward(self, dec_inp, mask=None, conditioning=0):
        inp = dec_inp.transpose(1, 2)
        mask = mask.bool().squeeze(1).unsqueeze(2)
        # if self.word_emb is None:
        #     inp = dec_inp
        #     mask = sequence_mask(seq_lens, inp.shape[1], device=seq_lens.device, dtype=seq_lens.dtype).unsqueeze(2)
        # else:
        #     inp = self.word_emb(dec_inp)
        #     # [bsz x L x 1]
        #     mask = (dec_inp != self.padding_idx).unsqueeze(2)

        if not self.rel_attention:
            pos_seq = torch.arange(inp.size(1), device=inp.device).to(inp.dtype)
            pos_emb = self.pos_emb(pos_seq) * mask
        else:
            pos_emb = 0

        out = self.drop(inp + pos_emb + conditioning)

        for layer in self.layers:
            out = layer(out, mask=mask)

        # out = self.drop(out)
        return rearrange(out, "b l h -> b h l")
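A minimal usage sketch of the FFTransformer above, assuming channel-first inputs of shape [batch, hidden_channels, length] with a [batch, 1, length] validity mask; the hyperparameter values and tensor sizes below are illustrative assumptions, not values taken from this commit.

# Hypothetical usage sketch for FFTransformer; sizes and hyperparameters are placeholders.
import torch

model = FFTransformer(n_layer=2, n_head=2, hidden_channels=192, d_head=96,
                      d_inner=768, kernel_size=3, dropout=0.1, dropatt=0.1)

x = torch.randn(4, 192, 100)      # [batch, hidden_channels, length]
mask = torch.ones(4, 1, 100)      # [batch, 1, length], 1 marks valid frames
out = model(x, mask=mask)         # -> [batch, hidden_channels, length]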
diff_ttsg/models/diff_ttsg.py
ADDED
@@ -0,0 +1,376 @@
import math
import random
from typing import Any

import torch
from lightning import LightningModule

import diff_ttsg.utils.monotonic_align as monotonic_align
from diff_ttsg import utils
from diff_ttsg.models.components.diffusion import Diffusion, Diffusion_Motion
from diff_ttsg.models.components.text_encoder import (MuMotionEncoder,
                                                      TextEncoder)
from diff_ttsg.utils.model import (denormalize, duration_loss,
                                   fix_len_compatibility, generate_path,
                                   sequence_mask)
from diff_ttsg.utils.utils import plot_tensor

log = utils.get_pylogger(__name__)


class Diff_TTSG(LightningModule):
    def __init__(
        self,
        n_vocab,
        n_spks,
        spk_emb_dim,
        n_enc_channels,
        filter_channels,
        filter_channels_dp,
        n_heads,
        n_enc_layers,
        enc_kernel,
        enc_dropout,
        window_size,
        n_feats,
        n_motions,
        dec_dim,
        beta_min,
        beta_max,
        pe_scale,
        mu_motion_encoder_params,
        motion_reduction_factor,
        motion_decoder_channels,
        data_statistics,
        out_size,
        only_speech=False,
        encoder_type="default",
        optimizer=None
    ):
        super(Diff_TTSG, self).__init__()

        self.save_hyperparameters(logger=False)

        self.n_vocab = n_vocab
        self.n_spks = n_spks
        self.spk_emb_dim = spk_emb_dim
        self.n_enc_channels = n_enc_channels
        self.filter_channels = filter_channels
        self.filter_channels_dp = filter_channels_dp
        self.n_heads = n_heads
        self.n_enc_layers = n_enc_layers
        self.enc_kernel = enc_kernel
        self.enc_dropout = enc_dropout
        self.window_size = window_size
        self.n_feats = n_feats
        self.n_motions = n_motions
        self.dec_dim = dec_dim
        self.beta_min = beta_min
        self.beta_max = beta_max
        self.pe_scale = pe_scale
        self.generate_motion = not only_speech
        self.motion_reduction_factor = motion_reduction_factor
        self.out_size = out_size
        self.mu_diffusion_channels = motion_decoder_channels

        if n_spks > 1:
            self.spk_emb = torch.nn.Embedding(n_spks, spk_emb_dim)
        self.encoder = TextEncoder(n_vocab, n_feats, n_enc_channels,
                                   filter_channels, filter_channels_dp, n_heads,
                                   n_enc_layers, enc_kernel, enc_dropout, window_size, encoder_type=encoder_type)
        self.decoder = Diffusion(n_feats, dec_dim, n_spks, spk_emb_dim, beta_min, beta_max, pe_scale)

        if self.generate_motion:
            self.motion_prior_loss = mu_motion_encoder_params.pop('prior_loss', True)
            self.mu_motion_encoder = MuMotionEncoder(
                input_channels=n_feats,
                output_channels=n_motions,
                **mu_motion_encoder_params
            )
            self.decoder_motion = Diffusion_Motion(
                in_channels=n_motions,
                motion_decoder_channels=motion_decoder_channels,
                beta_min=beta_min,
                beta_max=beta_max,
            )

        self.update_data_statistics(data_statistics)

    def update_data_statistics(self, data_statistics):
        if data_statistics is None:
            data_statistics = {
                'mel_mean': 0.0,
                'mel_std': 1.0,
                'motion_mean': 0.0,
                'motion_std': 1.0,
            }

        self.register_buffer('mel_mean', torch.tensor(data_statistics['mel_mean']))
        self.register_buffer('mel_std', torch.tensor(data_statistics['mel_std']))
        self.register_buffer('motion_mean', torch.tensor(data_statistics['motion_mean']))
        self.register_buffer('motion_std', torch.tensor(data_statistics['motion_std']))

    @torch.inference_mode()
    def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, stoc=False, spk=None, length_scale=1.0):
        """
        Generates mel-spectrogram from text. Returns:
        1. encoder outputs
        2. decoder outputs
        3. generated alignment

        Args:
            x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
            x_lengths (torch.Tensor): lengths of texts in batch.
            n_timesteps (int): number of steps to use for reverse diffusion in decoder.
            temperature (float, optional): controls variance of terminal distribution.
            stoc (bool, optional): flag that adds stochastic term to the decoder sampler.
                Usually, does not provide synthesis improvements.
            length_scale (float, optional): controls speech pace.
                Increase value to slow down generated speech and vice versa.
        """
        if isinstance(n_timesteps, dict):
            n_timestep_mel = n_timesteps['mel']
            n_timestep_motion = n_timesteps['motion']
        else:
            n_timestep_mel = n_timesteps
            n_timestep_motion = n_timesteps

        if isinstance(temperature, dict):
            temperature_mel = temperature['mel']
            temperature_motion = temperature['motion']
        else:
            temperature_mel = temperature
            temperature_motion = temperature

        if self.n_spks > 1:
            # Get speaker embedding
            spk = self.spk_emb(spk)

        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
        mu_x, logw, x_mask = self.encoder(x, x_lengths, spk)

        w = torch.exp(logw) * x_mask
        w_ceil = torch.ceil(w) * length_scale
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_max_length = int(y_lengths.max())
        y_max_length_ = fix_len_compatibility(y_max_length)

        # Using obtained durations `w` construct alignment map `attn`
        y_mask = sequence_mask(y_lengths, y_max_length_).unsqueeze(1).to(x_mask.dtype)
        attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)
        attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)

        # Align encoded text and get mu_y
        mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
        mu_y = mu_y.transpose(1, 2)
        encoder_outputs = mu_y[:, :, :y_max_length]

        # Sample latent representation from terminal distribution N(mu_y, I)
        z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature_mel
        # Generate sample by performing reverse dynamics
        decoder_outputs = self.decoder(z, y_mask, mu_y, n_timestep_mel, stoc, spk)
        decoder_outputs = decoder_outputs[:, :, :y_max_length]

        if self.generate_motion:
            mu_y_motion = mu_y[:, :, ::self.motion_reduction_factor]
            y_motion_mask = y_mask[:, :, ::self.motion_reduction_factor]
            mu_y_motion = self.mu_motion_encoder(mu_y_motion, y_motion_mask)
            encoder_outputs_motion = mu_y_motion[:, :, :y_max_length]
            # Sample latent representation from terminal distribution N(mu_y_motion, I)
            z_motion = mu_y_motion + torch.randn_like(mu_y_motion, device=mu_y_motion.device) / temperature_motion
            # Generate sample by performing reverse dynamics
            decoder_outputs_motion = self.decoder_motion(z_motion, y_motion_mask, mu_y_motion, n_timestep_motion, stoc, spk)
            decoder_outputs_motion = decoder_outputs_motion[:, :, :y_max_length]
        else:
            decoder_outputs_motion = None
            encoder_outputs_motion = None

        return {
            'encoder_outputs_mel': encoder_outputs,
            'decoder_outputs_mel': decoder_outputs,
            'encoder_outputs_motion': encoder_outputs_motion,
            'decoder_outputs_motion': decoder_outputs_motion,
            'attn': attn[:, :, :y_max_length],
            'mel': denormalize(decoder_outputs, self.mel_mean, self.mel_std),
            'motion': denormalize(decoder_outputs_motion, self.motion_mean, self.motion_std) if self.generate_motion else None,
        }

    def forward(self, x, x_lengths, y, y_lengths, y_motion, spk=None, out_size=None):
        """
        Computes 3 losses:
        1. duration loss: loss between predicted token durations and those extracted by Monotonic Alignment Search (MAS).
        2. prior loss: loss between mel-spectrogram and encoder outputs.
        3. diffusion loss: loss between gaussian noise and its reconstruction by diffusion-based decoder.

        Args:
            x (torch.Tensor): batch of texts, converted to a tensor with phoneme embedding ids.
            x_lengths (torch.Tensor): lengths of texts in batch.
            y (torch.Tensor): batch of corresponding mel-spectrograms.
            y_lengths (torch.Tensor): lengths of mel-spectrograms in batch.
            out_size (int, optional): length (in mel's sampling rate) of segment to cut, on which decoder will be trained.
                Should be divisible by 2^{num of UNet downsamplings}. Needed to increase batch size.
        """
        if self.n_spks > 1:
            # Get speaker embedding
            spk = self.spk_emb(spk)

        # Get encoder_outputs `mu_x` and log-scaled token durations `logw`
        mu_x, logw, x_mask = self.encoder(x, x_lengths, spk)
        y_max_length = y.shape[-1]

        y_mask = sequence_mask(y_lengths, y_max_length).unsqueeze(1).to(x_mask)
        attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2)

        # Use MAS to find most likely alignment `attn` between text and mel-spectrogram
        with torch.no_grad():
            const = -0.5 * math.log(2 * math.pi) * self.n_feats
            factor = -0.5 * torch.ones(mu_x.shape, dtype=mu_x.dtype, device=mu_x.device)
            y_square = torch.matmul(factor.transpose(1, 2), y ** 2)
            y_mu_double = torch.matmul(2.0 * (factor * mu_x).transpose(1, 2), y)
            mu_square = torch.sum(factor * (mu_x ** 2), 1).unsqueeze(-1)
            log_prior = y_square - y_mu_double + mu_square + const

            attn = monotonic_align.maximum_path(log_prior, attn_mask.squeeze(1))
            attn = attn.detach()

        # Compute loss between predicted log-scaled durations and those obtained from MAS
        logw_ = torch.log(1e-8 + torch.sum(attn.unsqueeze(1), -1)) * x_mask
        dur_loss = duration_loss(logw, logw_, x_lengths)

        # Cut a small segment of mel-spectrogram in order to increase batch size
        if not isinstance(out_size, type(None)):
            max_offset = (y_lengths - out_size).clamp(0)  # e.g. max_offset: [758, 160, 773]
            offset_ranges = list(zip([0] * max_offset.shape[0], max_offset.cpu().numpy()))  # offset ranges for each sample in batch, e.g. [(0, 758), (0, 160), (0, 773)]
            out_offset = torch.LongTensor([
                torch.tensor(random.choice(range(start, end)) if end > start else 0)
                for start, end in offset_ranges
            ]).to(y_lengths)
            attn_cut = torch.zeros(attn.shape[0], attn.shape[1], out_size, dtype=attn.dtype, device=attn.device)
            y_cut = torch.zeros(y.shape[0], self.n_feats, out_size, dtype=y.dtype, device=y.device)

            if self.generate_motion:
                y_motion_cut = torch.zeros(y_motion.shape[0], self.n_motions, out_size, dtype=y_motion.dtype, device=y_motion.device)

            y_cut_lengths = []
            for i, (y_, out_offset_) in enumerate(zip(y, out_offset)):
                y_cut_length = out_size + (y_lengths[i] - out_size).clamp(None, 0)
                y_cut_lengths.append(y_cut_length)
                cut_lower, cut_upper = out_offset_, out_offset_ + y_cut_length
                y_cut[i, :, :y_cut_length] = y_[:, cut_lower:cut_upper]
                if self.generate_motion:
                    y_motion_cut[i, :, :y_cut_length] = y_motion[i, :, cut_lower:cut_upper]

                attn_cut[i, :, :y_cut_length] = attn[i, :, cut_lower:cut_upper]
            y_cut_lengths = torch.LongTensor(y_cut_lengths)
            y_cut_mask = sequence_mask(y_cut_lengths).unsqueeze(1).to(y_mask)

            attn = attn_cut
            y = y_cut
            if self.generate_motion:
                y_motion = y_motion_cut

            y_mask = y_cut_mask

        # Align encoded text with mel-spectrogram and get mu_y segment
        mu_y = torch.matmul(attn.squeeze(1).transpose(1, 2), mu_x.transpose(1, 2))
        mu_y = mu_y.transpose(1, 2)

        # Compute loss of score-based decoder
        diff_loss, xt = self.decoder.compute_loss(y, y_mask, mu_y, spk)
        if self.generate_motion:
            # Reduce motion features
            mu_y_motion = mu_y[:, :, ::self.motion_reduction_factor]
            y_motion_mask = y_mask[:, :, ::self.motion_reduction_factor]
            y_motion = y_motion[:, :, ::self.motion_reduction_factor]

            mu_y_motion = self.mu_motion_encoder(mu_y_motion, y_motion_mask)
            diff_loss_motion, xt_motion = self.decoder_motion.compute_loss(y_motion, y_motion_mask, mu_y_motion, spk)
        else:
            diff_loss_motion = 0

        # Compute loss between aligned encoder outputs and mel-spectrogram
        prior_loss = torch.sum(0.5 * ((y - mu_y) ** 2 + math.log(2 * math.pi)) * y_mask)
        prior_loss = prior_loss / (torch.sum(y_mask) * self.n_feats)

        if self.generate_motion and self.motion_prior_loss:
            prior_loss_motion = torch.sum(0.5 * ((y_motion - mu_y_motion) ** 2 + math.log(2 * math.pi)) * y_motion_mask)
            prior_loss_motion = prior_loss_motion / (torch.sum(y_motion_mask) * self.n_motions)
        else:
            prior_loss_motion = 0

        return dur_loss, prior_loss + prior_loss_motion, diff_loss + diff_loss_motion

    def configure_optimizers(self) -> Any:
        optimizer = self.hparams.optimizer(params=self.parameters())
        return {'optimizer': optimizer}

    def get_losses(self, batch):
        x, x_lengths = batch['x'], batch['x_lengths']
        y, y_lengths = batch['y'], batch['y_lengths']
        y_motion = batch['y_motion']
        dur_loss, prior_loss, diff_loss = self(x, x_lengths, y, y_lengths, y_motion, out_size=self.out_size)
        return {
            'dur_loss': dur_loss,
            'prior_loss': prior_loss,
            'diff_loss': diff_loss,
        }

    def training_step(self, batch: Any, batch_idx: int):
        loss_dict = self.get_losses(batch)
        self.log('step', float(self.global_step), on_step=True, on_epoch=True, logger=True, sync_dist=True)

        self.log('sub_loss/train_dur_loss', loss_dict['dur_loss'], on_step=True, on_epoch=True, logger=True, sync_dist=True)
        self.log('sub_loss/train_prior_loss', loss_dict['prior_loss'], on_step=True, on_epoch=True, logger=True, sync_dist=True)
        self.log('sub_loss/train_diff_loss', loss_dict['diff_loss'], on_step=True, on_epoch=True, logger=True, sync_dist=True)

        total_loss = sum(loss_dict.values())
        self.log('loss/train', total_loss, on_step=True, on_epoch=True, logger=True, prog_bar=True, sync_dist=True)

        return {'loss': total_loss, 'log': loss_dict}

    def validation_step(self, batch: Any, batch_idx: int):
        loss_dict = self.get_losses(batch)
        self.log('sub_loss/val_dur_loss', loss_dict['dur_loss'], on_step=True, on_epoch=True, logger=True, sync_dist=True)
        self.log('sub_loss/val_prior_loss', loss_dict['prior_loss'], on_step=True, on_epoch=True, logger=True, sync_dist=True)
        self.log('sub_loss/val_diff_loss', loss_dict['diff_loss'], on_step=True, on_epoch=True, logger=True, sync_dist=True)

        total_loss = sum(loss_dict.values())
        self.log('loss/val', total_loss, on_step=True, on_epoch=True, logger=True, prog_bar=True, sync_dist=True)

        return total_loss

    def on_validation_end(self) -> None:
        if self.trainer.is_global_zero:
            one_batch = next(iter(self.trainer.val_dataloaders))
            if self.current_epoch == 0:
                log.debug("Plotting original samples")
                for i in range(4):
                    y = one_batch['y'][i].unsqueeze(0).to(self.device)
                    y_motion = one_batch['y_motion'][i].unsqueeze(0).to(self.device)
                    self.logger.experiment.add_image(f'original/mel_{i}', plot_tensor(y.squeeze().cpu()), self.current_epoch, dataformats='HWC')
                    if self.generate_motion:
                        self.logger.experiment.add_image(f'original/motion_{i}', plot_tensor(y_motion.squeeze().cpu()), self.current_epoch, dataformats='HWC')

            log.debug("Synthesising...")
            for i in range(4):
                x = one_batch['x'][i].unsqueeze(0).to(self.device)
                x_lengths = one_batch['x_lengths'][i].unsqueeze(0).to(self.device)
                output = self.synthesise(x, x_lengths, n_timesteps=20)
                y_enc, y_dec = output['encoder_outputs_mel'], output['decoder_outputs_mel']
                y_motion_enc, y_motion_dec, attn = output['encoder_outputs_motion'], output['decoder_outputs_motion'], output['attn']
                self.logger.experiment.add_image(f'generated_enc/{i}', plot_tensor(y_enc.squeeze().cpu()), self.current_epoch, dataformats='HWC')
                self.logger.experiment.add_image(f'generated_dec/{i}', plot_tensor(y_dec.squeeze().cpu()), self.current_epoch, dataformats='HWC')
                if self.generate_motion:
                    self.logger.experiment.add_image(f'generated_enc_motion/{i}', plot_tensor(y_motion_enc.squeeze().cpu()), self.current_epoch, dataformats='HWC')
                    self.logger.experiment.add_image(f'generated_dec_motion/{i}', plot_tensor(y_motion_dec.squeeze().cpu()), self.current_epoch, dataformats='HWC')

                self.logger.experiment.add_image(f'alignment/{i}', plot_tensor(attn.squeeze().cpu()), self.current_epoch, dataformats='HWC')
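A minimal sketch of how the `synthesise` method above could be driven at inference time, assuming a trained Lightning checkpoint and the `text_to_sequence` front end added later in this commit; the checkpoint path, input sentence, step counts and temperatures are illustrative assumptions, not values from this commit.

# Hypothetical inference sketch (paths, sentence and step counts are assumptions).
import torch
from diff_ttsg.models.diff_ttsg import Diff_TTSG
from diff_ttsg.text import text_to_sequence

model = Diff_TTSG.load_from_checkpoint("checkpoints/last.ckpt").eval()

seq = text_to_sequence("Hello world.")            # list of symbol ids
x = torch.tensor(seq, dtype=torch.long)[None]     # [1, T_text]
x_lengths = torch.tensor([x.shape[-1]])

out = model.synthesise(
    x, x_lengths,
    n_timesteps={"mel": 50, "motion": 50},        # per-stream reverse-diffusion steps
    temperature={"mel": 1.3, "motion": 1.3},
    length_scale=1.0,
)
mel = out["mel"]        # de-normalised mel-spectrogram, [1, n_feats, T_mel]
motion = out["motion"]  # de-normalised motion features, or None in speech-only mode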
diff_ttsg/models/mnist_module.py
ADDED
@@ -0,0 +1,137 @@
from typing import Any

import torch
from lightning import LightningModule
from torchmetrics import MaxMetric, MeanMetric
from torchmetrics.classification.accuracy import Accuracy


class MNISTLitModule(LightningModule):
    """Example of LightningModule for MNIST classification.

    A LightningModule organizes your PyTorch code into 6 sections:
        - Initialization (__init__)
        - Train Loop (training_step)
        - Validation loop (validation_step)
        - Test loop (test_step)
        - Prediction Loop (predict_step)
        - Optimizers and LR Schedulers (configure_optimizers)

    Docs:
        https://lightning.ai/docs/pytorch/latest/common/lightning_module.html
    """

    def __init__(
        self,
        net: torch.nn.Module,
        optimizer: torch.optim.Optimizer,
        scheduler: torch.optim.lr_scheduler,
    ):
        super().__init__()

        # this line allows to access init params with 'self.hparams' attribute
        # also ensures init params will be stored in ckpt
        self.save_hyperparameters(logger=False)

        self.net = net

        # loss function
        self.criterion = torch.nn.CrossEntropyLoss()

        # metric objects for calculating and averaging accuracy across batches
        self.train_acc = Accuracy(task="multiclass", num_classes=10)
        self.val_acc = Accuracy(task="multiclass", num_classes=10)
        self.test_acc = Accuracy(task="multiclass", num_classes=10)

        # for averaging loss across batches
        self.train_loss = MeanMetric()
        self.val_loss = MeanMetric()
        self.test_loss = MeanMetric()

        # for tracking best so far validation accuracy
        self.val_acc_best = MaxMetric()

    def forward(self, x: torch.Tensor):
        return self.net(x)

    def on_train_start(self):
        # by default lightning executes validation step sanity checks before training starts,
        # so it's worth to make sure validation metrics don't store results from these checks
        self.val_loss.reset()
        self.val_acc.reset()
        self.val_acc_best.reset()

    def model_step(self, batch: Any):
        x, y = batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        return loss, preds, y

    def training_step(self, batch: Any, batch_idx: int):
        loss, preds, targets = self.model_step(batch)

        # update and log metrics
        self.train_loss(loss)
        self.train_acc(preds, targets)
        self.log("train/loss", self.train_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("train/acc", self.train_acc, on_step=False, on_epoch=True, prog_bar=True)

        # return loss or backpropagation will fail
        return loss

    def on_train_epoch_end(self):
        pass

    def validation_step(self, batch: Any, batch_idx: int):
        loss, preds, targets = self.model_step(batch)

        # update and log metrics
        self.val_loss(loss)
        self.val_acc(preds, targets)
        self.log("val/loss", self.val_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("val/acc", self.val_acc, on_step=False, on_epoch=True, prog_bar=True)

    def on_validation_epoch_end(self):
        acc = self.val_acc.compute()  # get current val acc
        self.val_acc_best(acc)  # update best so far val acc
        # log `val_acc_best` as a value through `.compute()` method, instead of as a metric object
        # otherwise metric would be reset by lightning after each epoch
        self.log("val/acc_best", self.val_acc_best.compute(), sync_dist=True, prog_bar=True)

    def test_step(self, batch: Any, batch_idx: int):
        loss, preds, targets = self.model_step(batch)

        # update and log metrics
        self.test_loss(loss)
        self.test_acc(preds, targets)
        self.log("test/loss", self.test_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.log("test/acc", self.test_acc, on_step=False, on_epoch=True, prog_bar=True)

    def on_test_epoch_end(self):
        pass

    def configure_optimizers(self):
        """Choose what optimizers and learning-rate schedulers to use in your optimization.
        Normally you'd need one. But in the case of GANs or similar you might have multiple.

        Examples:
            https://lightning.ai/docs/pytorch/latest/common/lightning_module.html#configure-optimizers
        """
        optimizer = self.hparams.optimizer(params=self.parameters())
        if self.hparams.scheduler is not None:
            scheduler = self.hparams.scheduler(optimizer=optimizer)
            return {
                "optimizer": optimizer,
                "lr_scheduler": {
                    "scheduler": scheduler,
                    "monitor": "val/loss",
                    "interval": "epoch",
                    "frequency": 1,
                },
            }
        return {"optimizer": optimizer}


if __name__ == "__main__":
    _ = MNISTLitModule(None, None, None)
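Because configure_optimizers calls self.hparams.optimizer(params=...) and self.hparams.scheduler(optimizer=...), the module expects partially-applied constructors rather than ready-made optimizer objects (the Hydra-instantiation pattern of the template this file comes from). A minimal wiring sketch, with the network and hyperparameters as made-up placeholders:

# Hypothetical wiring sketch; the network and hyperparameters are placeholders.
from functools import partial

import torch

net = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(28 * 28, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10),
)

module = MNISTLitModule(
    net=net,
    optimizer=partial(torch.optim.Adam, lr=1e-3),                   # later called with params=...
    scheduler=partial(torch.optim.lr_scheduler.ReduceLROnPlateau),  # later called with optimizer=...
)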
diff_ttsg/resources/cmu_dictionary
ADDED
The diff for this file is too large to render.
See raw diff
diff_ttsg/text/LICENSE
ADDED
@@ -0,0 +1,30 @@
CMUdict
-------

CMUdict (the Carnegie Mellon Pronouncing Dictionary) is a free
pronouncing dictionary of English, suitable for uses in speech
technology and is maintained by the Speech Group in the School of
Computer Science at Carnegie Mellon University.

The Carnegie Mellon Speech Group does not guarantee the accuracy of
this dictionary, nor its suitability for any specific purpose. In
fact, we expect a number of errors, omissions and inconsistencies to
remain in the dictionary. We intend to continually update the
dictionary by correction existing entries and by adding new ones. From
time to time a new major version will be released.

We welcome input from users: Please send email to Alex Rudnicky
([email protected]).

The Carnegie Mellon Pronouncing Dictionary, in its current and
previous versions is Copyright (C) 1993-2014 by Carnegie Mellon
University. Use of this dictionary for any research or commercial
purpose is completely unrestricted. If you make use of or
redistribute this material we request that you acknowledge its
origin in your descriptions.

If you add words to or correct words in your version of this
dictionary, we would appreciate it if you could send these additions
and corrections to us ([email protected]) for consideration in a
subsequent version. All submissions will be reviewed and approved by
the current maintainer, Alex Rudnicky at Carnegie Mellon.
diff_ttsg/text/__init__.py
ADDED
@@ -0,0 +1,96 @@
""" from https://github.com/keithito/tacotron """

import re

from diff_ttsg.text import cleaners
from diff_ttsg.text.symbols import symbols

_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')


def get_arpabet(word, dictionary):
    word_arpabet = dictionary.lookup(word)
    if word_arpabet is not None:
        return "{" + word_arpabet[0] + "}"
    else:
        return word


def text_to_sequence(text, cleaner_names=["english_cleaners"], dictionary=None):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        dictionary: arpabet class with arpabet dictionary

    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = []
    space = _symbols_to_sequence(' ')
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
        if not m:
            clean_text = _clean_text(text, cleaner_names)
            if dictionary is not None:
                clean_text = [get_arpabet(w, dictionary) for w in clean_text.split(" ")]
                for i in range(len(clean_text)):
                    t = clean_text[i]
                    if t.startswith("{"):
                        sequence += _arpabet_to_sequence(t[1:-1])
                    else:
                        sequence += _symbols_to_sequence(t)
                    sequence += space
            else:
                sequence += _symbols_to_sequence(clean_text)
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # remove trailing space
    if dictionary is not None:
        sequence = sequence[:-1] if sequence[-1] == space[0] else sequence
    return sequence


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    result = ''
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == '@':
                s = '{%s}' % s[1:]
            result += s
    return result.replace('}{', ' ')


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(['@' + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s != '_' and s != '~'
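A short usage sketch of this front end, assuming the bundled pronouncing dictionary at diff_ttsg/resources/cmu_dictionary is used for ARPAbet lookups; the input sentence is a made-up example.

# Hypothetical usage sketch for the text front end.
from diff_ttsg.text import text_to_sequence, sequence_to_text
from diff_ttsg.text.cmudict import CMUDict

cmu = CMUDict("diff_ttsg/resources/cmu_dictionary")

ids = text_to_sequence("Dr. Smith paid $5 for the tickets.", dictionary=cmu)
print(ids)                    # list of integer symbol ids
print(sequence_to_text(ids))  # round-trips to cleaned text with {ARPAbet} words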
diff_ttsg/text/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (3.41 kB).
diff_ttsg/text/__pycache__/cleaners.cpython-310.pyc
ADDED
Binary file (1.98 kB).
diff_ttsg/text/__pycache__/cmudict.cpython-310.pyc
ADDED
Binary file (2.22 kB).
diff_ttsg/text/__pycache__/numbers.cpython-310.pyc
ADDED
Binary file (2.22 kB).
diff_ttsg/text/__pycache__/symbols.cpython-310.pyc
ADDED
Binary file (604 Bytes).
diff_ttsg/text/cleaners.py
ADDED
@@ -0,0 +1,73 @@
""" from https://github.com/keithito/tacotron """

import re
from unidecode import unidecode
from .numbers import normalize_numbers


_whitespace_re = re.compile(r'\s+')

_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
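For illustration, english_cleaners simply composes the helpers above (ASCII transliteration, lowercasing, number expansion, abbreviation expansion, whitespace collapsing); a quick sketch on a made-up sentence, with the expected output shown approximately:

# Hypothetical example of the cleaning pipeline (output shown approximately).
from diff_ttsg.text.cleaners import english_cleaners

print(english_cleaners("Mr. Müller bought 2 books for $3.50."))
# roughly: "mister muller bought two books for three dollars, fifty cents."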
diff_ttsg/text/cmudict.py
ADDED
@@ -0,0 +1,60 @@
""" from https://github.com/keithito/tacotron """

import re


valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)


class CMUDict:
    def __init__(self, file_or_path, keep_ambiguous=True):
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        return len(self._entries)

    def lookup(self, word):
        return self._entries.get(word.upper())


_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    cmudict = {}
    for line in file:
        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
            # CMUdict separates the word from its pronunciation with two spaces
            parts = line.split('  ')
            word = re.sub(_alt_re, '', parts[0])
            pronunciation = _get_pronunciation(parts[1])
            if pronunciation:
                if word in cmudict:
                    cmudict[word].append(pronunciation)
                else:
                    cmudict[word] = [pronunciation]
    return cmudict


def _get_pronunciation(s):
    parts = s.strip().split(' ')
    for part in parts:
        if part not in _valid_symbol_set:
            return None
    return ' '.join(parts)
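A small sketch of the dictionary wrapper in use; the looked-up word and the shown pronunciation are arbitrary examples.

# Hypothetical lookup example.
from diff_ttsg.text.cmudict import CMUDict

cmu = CMUDict("diff_ttsg/resources/cmu_dictionary")
print(len(cmu))             # number of parsed entries
print(cmu.lookup("hello"))  # e.g. ['HH AH0 L OW1'], or None if the word is missing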
diff_ttsg/text/numbers.py
ADDED
@@ -0,0 +1,72 @@
""" from https://github.com/keithito/tacotron """

import inflect
import re


_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')


def _remove_commas(m):
    return m.group(1).replace(',', '')


def _expand_decimal_point(m):
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh',
                                            group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text
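Finally, a tiny sketch of normalize_numbers on its own; the example strings are made up and the outputs are shown approximately.

# Hypothetical examples of number normalisation (outputs shown approximately).
from diff_ttsg.text.numbers import normalize_numbers

print(normalize_numbers("In 1975 it cost $1,000.50."))
# roughly: "In nineteen seventy-five it cost one thousand dollars, fifty cents."
print(normalize_numbers("She finished 2nd out of 15."))
# roughly: "She finished second out of fifteen."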