diff --git a/.github/workflows/code_formatter.yml b/.github/workflows/code_formatter.yml new file mode 100644 index 0000000000000000000000000000000000000000..4018b926c81be906a1ac75e25c8acb3e28cf8953 --- /dev/null +++ b/.github/workflows/code_formatter.yml @@ -0,0 +1,51 @@ +name: Code Formatter + +on: + push: + branches: + - main + +jobs: + push_format: + runs-on: ubuntu-latest + + permissions: + contents: write + pull-requests: write + + steps: + - uses: actions/checkout@v4 + with: + ref: ${{github.ref_name}} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Black + run: pip install "black[jupyter]" + + - name: Run Black + # run: black $(git ls-files '*.py') + run: black . --exclude=".*\.ipynb$" + + - name: Commit Back + continue-on-error: true + id: commitback + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add --all + git commit -m "chore(format): run black on ${{github.ref_name}}" + + - name: Create Pull Request + if: steps.commitback.outcome == 'success' + continue-on-error: true + uses: peter-evans/create-pull-request@v5 + with: + delete-branch: true + body: "Automatically apply code formatter change" + title: "chore(format): run black on ${{github.ref_name}}" + commit-message: "chore(format): run black on ${{github.ref_name}}" + branch: formatter/${{github.ref_name}} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..30c5cdf5a7ebbea06185ccd6a0fbccb4ef09c2a8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,166 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# mine +.flac +.pth +.pt \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..464c43964b7b11d5dd201929087a37a3f5a4f076 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,25 @@ + + +## KindaHex Non-Commercial Use License (HNCU) + + +This repository is licensed under the **KindaHex Non-Commercial Use License (HNCU)**. By using, modifying, or distributing any content from this repository, you agree to the terms outlined below. + +### Terms of Use: +1. **Non-Commercial Use Only**: You are permitted to use, modify, and distribute the contents of this repository **only for non-commercial purposes**. Commercial use, including selling, licensing, or distributing for profit, is strictly prohibited. + +2. **Modification and Derivative Works**: You may modify the contents of this repository and create derivative works. However, any modification or derivative work must also adhere to the non-commercial restriction and be subject to the terms of this license. + +3. **Attribution**: When using or distributing the content (either as-is or modified), you must provide proper attribution to the original creator (blane187gt) in a manner that is reasonable and customary for the medium. + +4. **No Warranty**: The content in this repository is provided "as-is," without any warranty, express or implied, including but not limited to warranties of merchantability or fitness for a particular purpose. + +5. **Compliance with Laws**: You are responsible for ensuring that your use of the content complies with all applicable laws and regulations. + +6. **Termination**: If you violate any of the terms of this license, your rights to use the repository’s content will be automatically terminated. You must cease all use and distribution of the content immediately upon termination. 
+ +### Restrictions: +- You may **not** use this repository's content for commercial gain, including but not limited to creating products, services, or tools that are sold or monetized. +- You may **not** sublicense or transfer rights to third parties for commercial purposes. +- You may not use the content in any manner that competes with the original repository or its creator. + diff --git a/README.md b/README.md index 7be5fc7f47d5db027d120b8024982df93db95b74..afd06308adafce24615c4b2ee7c3bcad5956139f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,43 @@ ---- -license: mit ---- +# HexGen RVC + + +https://colab.research.google.com/drive/1dmGS0vEWuX55Z1w1tSRD6lJDV8s2deY0?usp=sharing + +HexGen RVC is a tool designed for generating high-quality AI vocal covers using advanced source separation, vocal modeling, and audio processing techniques. This project builds on several community-driven efforts, integrating the best tools and frameworks available for music and vocal manipulation. + +## Features +- **AI-Driven Vocal Cover Generation**: Produce custom vocal covers with ease. +- **Source Separation**: Isolate instrumentals and vocals from any track. +- **Efficient Workflow**: Streamlined integration with popular tools for music processing. +- **Colab Support**: Easily deploy and test models in Google Colab environments. + +## Installation +1. Clone the repository: + ```bash + git clone https://github.com/blane187gt/hexGen-RVC.git + cd hexGen-RVC + ``` +2. Follow specific setup instructions provided in the [documentation](https://github.com/blane187gt/hexGen-RVC/wiki) (if available) or in the code comments. + +## Usage +1. Prepare your audio input file(s) and place them in the appropriate folder. +2. Run the script or Colab notebook as per the instructions. +3. Customize the output by tweaking the parameters and models used. + +## Credits +This project would not have been possible without the contributions and support of the following tools and creators: + +- [Audio Separator](https://github.com/karaokenerds/python-audio-separator) by [Andrew Beveridge](https://github.com/beveradb) +- [Applio](https://github.com/IAHispano/Applio) by [IAHispano](https://github.com/IAHispano) +- [yt-dlp](https://github.com/yt-dlp/yt-dlp) +- [Ultimate Vocal Remover GUI](https://github.com/Anjok07/ultimatevocalremovergui) by [Anjok07](https://github.com/Anjok07) +- [Music Source Separation Universal Training Code](https://github.com/ZFTurbo/Music-Source-Separation-Training) by [ZFTurbo](https://github.com/ZFTurbo) +- [AICoverGen](https://github.com/SociallyIneptWeeb/AICoverGen) by [SociallyIneptWeeb](https://github.com/SociallyIneptWeeb) +- [FullmatheusBallZ](https://www.youtube.com/@FullmatheusBallZ) for testing the Colab scripts. +- [Shirou](https://github.com/ShiromiyaG), the original project inspiration. + +## Contributing +Feel free to submit pull requests or create issues for any improvements or bugs you encounter. Contributions are always welcome! + +## License +This project is licensed under the terms specified in the `LICENSE` file. Ensure compliance with third-party dependencies when using or modifying this project. 
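
The README's Usage section above boils down to launching the Gradio interface defined in `main.py` later in this diff. A minimal launch sketch, assuming the dependencies from the installation step are present; the `--share`, `--open`, and `--port` flags and the default port 7755 mirror the argument handling in `main.py`, and 7860 below is just an illustrative alternative port:

```bash
# Start the hexGen-RVC web UI on the default port (7755)
python main.py

# Optional flags parsed in main.py:
#   --share    expose a public Gradio share link
#   --open     open the UI in the default browser
#   --port N   serve on port N instead of 7755
python main.py --port 7860 --share --open
```

If the chosen port is already in use, `main.py` retries on progressively lower port numbers (up to `MAX_PORT_ATTEMPTS` times) before giving up.
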
diff --git a/assets/config.json b/assets/config.json new file mode 100644 index 0000000000000000000000000000000000000000..abff5c4435aa39bfab87a7fdd696d1f950bb17bc --- /dev/null +++ b/assets/config.json @@ -0,0 +1,6 @@ +{ + "lang": { + "override": false, + "selected_lang": "en_US" + } +} diff --git a/assets/i18n/i18n.py b/assets/i18n/i18n.py new file mode 100644 index 0000000000000000000000000000000000000000..c980c5b24f7338aa869217e089365b39e8086af8 --- /dev/null +++ b/assets/i18n/i18n.py @@ -0,0 +1,52 @@ +import os, sys +import json +from pathlib import Path +from locale import getdefaultlocale + +now_dir = os.getcwd() +sys.path.append(now_dir) + + +class I18nAuto: + LANGUAGE_PATH = os.path.join(now_dir, "assets", "i18n", "languages") + + def __init__(self, language=None): + with open( + os.path.join(now_dir, "assets", "config.json"), "r", encoding="utf8" + ) as file: + config = json.load(file) + override = config["lang"]["override"] + lang_prefix = config["lang"]["selected_lang"] + + self.language = lang_prefix + + if override == False: + language = language or getdefaultlocale()[0] + lang_prefix = language[:2] if language is not None else "en" + available_languages = self._get_available_languages() + matching_languages = [ + lang for lang in available_languages if lang.startswith(lang_prefix) + ] + self.language = matching_languages[0] if matching_languages else "en_US" + + self.language_map = self._load_language_list() + + def _load_language_list(self): + try: + file_path = Path(self.LANGUAGE_PATH) / f"{self.language}.json" + with open(file_path, "r", encoding="utf-8") as file: + return json.load(file) + except FileNotFoundError: + raise FileNotFoundError( + f"Failed to load language file for {self.language}. Check if the correct .json file exists." 
+ ) + + def _get_available_languages(self): + language_files = [path.stem for path in Path(self.LANGUAGE_PATH).glob("*.json")] + return language_files + + def _language_exists(self, language): + return (Path(self.LANGUAGE_PATH) / f"{language}.json").exists() + + def __call__(self, key): + return self.language_map.get(key, key) diff --git a/assets/i18n/languages/en_US.json b/assets/i18n/languages/en_US.json new file mode 100644 index 0000000000000000000000000000000000000000..e424eac690d81fcbff45d28e313bf3a865531600 --- /dev/null +++ b/assets/i18n/languages/en_US.json @@ -0,0 +1,89 @@ +{ + "Voice Model": "Voice Model", + "Select the voice model to use for the conversion.": "Select the voice model to use for the conversion.", + "Index File": "Index File", + "Select the index file to use for the conversion.": "Select the index file to use for the conversion.", + "Refresh": "Refresh", + "Unload Voice": "Unload Voice", + "Upload Audio": "Upload Audio", + "Select Audio": "Select Audio", + "Select the audio to convert.": "Select the audio to convert.", + "Advanced Settings": "Advanced Settings", + "RVC Settings": "RVC Settings", + "Output Path": "Output Path", + "Enter output path": "Enter output path", + "The path where the output audio will be saved, by default in audio_files/rvc/output.wav": "The path where the output audio will be saved, by default in audio_files/rvc/output.wav", + "Clear Outputs (Deletes all audios in assets/audios)": "Clear Outputs (Deletes all audios in assets/audios)", + "Export Format": "Export Format", + "Select the format to export the audio.": "Select the format to export the audio.", + "Split Audio": "Split Audio", + "Split the audio into chunks for inference to obtain better results in some cases.": "Split the audio into chunks for inference to obtain better results in some cases.", + "Pitch Extractor": "Pitch Extractor", + "Pitch extract Algorith.": "Pitch extract Algorith.", + "Hop Length": "Hop Length", + "Hop length for pitch extraction.": "Hop length for pitch extraction.", + "Embedder Model": "Embedder Model", + "Model used for learning speaker embedding.": "Model used for learning speaker embedding.", + "Autotune": "Autotune", + "Apply a soft autotune to your inferences, recommended for singing conversions.": "Apply a soft autotune to your inferences, recommended for singing conversions.", + "Pitch": "Pitch", + "Adjust the pitch of the audio.": "Adjust the pitch of the audio.", + "Filter Radius": "Filter Radius", + "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.": "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.", + "Search Feature Ratio": "Search Feature Ratio", + "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.": "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.", + "Volume Envelope": "Volume Envelope", + "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed.": "Substitute or blend with the volume envelope of the output. 
The closer the ratio is to 1, the more the output envelope is employed.", + "Protect Voiceless Consonants": "Protect Voiceless Consonants", + "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.": "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.", + "Audio Separation Settings": "Audio Separation Settings", + "Use TTA": "Use TTA", + "Use Test Time Augmentation.": "Use Test Time Augmentation.", + "Batch Size": "Batch Size", + "Set the batch size for the separation.": "Set the batch size for the separation.", + "Vocals Model": "Vocals Model", + "Select the vocals model to use for the separation.": "Select the vocals model to use for the separation.", + "Karaoke Model": "Karaoke Model", + "Select the karaoke model to use for the separation.": "Select the karaoke model to use for the separation.", + "Dereverb Model": "Dereverb Model", + "Select the dereverb model to use for the separation.": "Select the dereverb model to use for the separation.", + "Deeecho": "Deeecho", + "Apply deeecho to the audio.": "Apply deeecho to the audio.", + "Deeecho Model": "Deeecho Model", + "Select the deeecho model to use for the separation.": "Select the deeecho model to use for the separation.", + "Denoise": "Denoise", + "Apply denoise to the audio.": "Apply denoise to the audio.", + "Denoise Model": "Denoise Model", + "Select the denoise model to use for the separation.": "Select the denoise model to use for the separation.", + "Audio post-process Settings": "Audio post-process Settings", + "Delete Audios": "Delete Audios", + "Delete the audios after the conversion.": "Delete the audios after the conversion.", + "Reverb": "Reverb", + "Apply reverb to the audio.": "Apply reverb to the audio.", + "Reverb Room Size": "Reverb Room Size", + "Set the room size of the reverb.": "Set the room size of the reverb.", + "Reverb Damping": "Reverb Damping", + "Set the damping of the reverb.": "Set the damping of the reverb.", + "Reverb Wet Gain": "Reverb Wet Gain", + "Set the wet gain of the reverb.": "Set the wet gain of the reverb.", + "Reverb Dry Gain": "Reverb Dry Gain", + "Set the dry gain of the reverb.": "Set the dry gain of the reverb.", + "Reverb Width": "Reverb Width", + "Set the width of the reverb.": "Set the width of the reverb.", + "Vocals Volume": "Vocals Volume", + "Adjust the volume of the vocals.": "Adjust the volume of the vocals.", + "Instrumentals Volume": "Instrumentals Volume", + "Adjust the volume of the Instrumentals.": "Adjust the volume of the Instrumentals.", + "Backing Vocals Volume": "Backing Vocals Volume", + "Adjust the volume of the backing vocals.": "Adjust the volume of the backing vocals.", + "Device Settings": "Device Settings", + "Device": "Device", + "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -": "Select the device to use for the conversion. 
0 to ∞ separated by - and for CPU leave only an -", + "Convert": "Convert", + "Output Information": "Output Information", + "The output information will be displayed here.": "The output information will be displayed here.", + "Export Audio": "Export Audio", + "Music URL": "Music URL", + "Download": "Download", + "Model URL": "Model URL" +} diff --git a/assets/i18n/languages/pt_BR.json b/assets/i18n/languages/pt_BR.json new file mode 100644 index 0000000000000000000000000000000000000000..eba846e135d41b76ff91cd45c7c0bbd4e23c800e --- /dev/null +++ b/assets/i18n/languages/pt_BR.json @@ -0,0 +1,89 @@ +{ + "Voice Model": "Modelo de Voz", + "Select the voice model to use for the conversion.": "Selecione o modelo de voz a ser usado para a conversão.", + "Index File": "Arquivo Index", + "Select the index file to use for the conversion.": "Selecione o arquivo Index a ser usado para a conversão.", + "Refresh": "Atualizar", + "Unload Voice": "Descarregar Voz", + "Upload Audio": "Carregar Áudio", + "Select Audio": "Selecionar Áudio", + "Select the audio to convert.": "Selecione o áudio a ser convertido.", + "Advanced Settings": "Configurações Avançadas", + "RVC Settings": "Configurações RVC", + "Output Path": "Caminho de Saída", + "Enter output path": "Insira o caminho de saída", + "The path where the output audio will be saved, by default in audio_files/rvc/output.wav": "O caminho onde o áudio de saída será salvo, por padrão em audio_files/rvc/output.wav", + "Clear Outputs (Deletes all audios in assets/audios)": "Limpar Saídas (Exclui todos os áudios em assets/audios)", + "Export Format": "Formato de Exportação", + "Select the format to export the audio.": "Selecione o formato para exportar o áudio.", + "Split Audio": "Dividir Áudio", + "Split the audio into chunks for inference to obtain better results in some cases.": "Divida o áudio em partes para inferência para obter melhores resultados em alguns casos.", + "Pitch Extractor": "Extrator de Pitch", + "Pitch extract Algorith.": "Algoritmo de Extração de Pitch", + "Hop Length": "Hop Length", + "Hop length for pitch extraction.": "Hop Length para extração de pitch.", + "Embedder Model": "Modelo de Embedding", + "Model used for learning speaker embedding.": "Modelo usado para aprendizado de embedding de locutor.", + "Autotune": "Autotune", + "Apply a soft autotune to your inferences, recommended for singing conversions.": "Aplique um autotune suave às suas inferências, recomendado para conversões de canto.", + "Pitch": "Pitch", + "Adjust the pitch of the audio.": "Ajuste o pitch do áudio.", + "Filter Radius": "Raio do Filtro", + "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration.": "Se o número for maior ou igual a três, o uso de filtragem mediana nos resultados de tom coletados tem o potencial de diminuir a respiração.", + "Search Feature Ratio": "Proporção da Função de Busca", + "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio.": "Influência exercida pelo arquivo de índice; um valor mais alto corresponde a maior influência. No entanto, optar por valores mais baixos pode ajudar a mitigar artefatos presentes no áudio.", + "Volume Envelope": "Envelope de Volume", + "Substitute or blend with the volume envelope of the output. 
The closer the ratio is to 1, the more the output envelope is employed.": "Substitua ou misture com o envelope de volume da saída. Quanto mais próximo o valor estiver de 1, mais o envelope de saída será empregado.", + "Protect Voiceless Consonants": "Proteger Consoantes Surdas", + "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect.": "Proteja consoantes distintas e sons de respiração para evitar rasgos eletroacústicos e outros artefatos. Ajustar o parâmetro para seu valor máximo de 0,5 oferece proteção abrangente. No entanto, reduzir esse valor pode diminuir a extensão da proteção enquanto potencialmente mitiga o efeito de indexação.", + "Audio Separation Settings": "Configurações de Separação de Áudio", + "Use TTA": "Usar TTA", + "Use Test Time Augmentation.": "Usar Aumento de Tempo de Teste.", + "Batch Size": "Batch Size", + "Set the batch size for the separation.": "Defina o Batch Size para a separação.", + "Vocals Model": "Modelo de Vocais", + "Select the vocals model to use for the separation.": "Selecione o modelo de vocais a ser usado para a separação.", + "Karaoke Model": "Modelo de Karaokê", + "Select the karaoke model to use for the separation.": "Selecione o modelo de karaokê a ser usado para a separação.", + "Dereverb Model": "Modelo de Dereverb", + "Select the dereverb model to use for the separation.": "Selecione o modelo de dereverb a ser usado para a separação.", + "Deeecho": "Deeecho", + "Apply deeecho to the audio.": "Aplicar deeecho ao áudio.", + "Deeecho Model": "Modelo de Deeecho", + "Select the deeecho model to use for the separation.": "Selecione o modelo de deeecho a ser usado para a separação.", + "Denoise": "Redução de Ruído", + "Apply denoise to the audio.": "Aplicar redução de ruído ao áudio.", + "Denoise Model": "Modelo de Redução de Ruído", + "Select the denoise model to use for the separation.": "Selecione o modelo de redução de ruído a ser usado para a separação.", + "Audio post-process Settings": "Configurações de Pós-processamento de Áudio", + "Delete Audios": "Excluir Áudios", + "Delete the audios after the conversion.": "Excluir os áudios após a conversão.", + "Reverb": "Reverberação", + "Apply reverb to the audio.": "Aplicar reverberação ao áudio.", + "Reverb Room Size": "Tamanho da Sala de Reverberação", + "Set the room size of the reverb.": "Definir o tamanho da sala de reverberação.", + "Reverb Damping": "Amortecimento da Reverberação", + "Set the damping of the reverb.": "Definir o amortecimento da reverberação.", + "Reverb Wet Gain": "Ganho Molhado da Reverberação", + "Set the wet gain of the reverb.": "Definir o ganho molhado da reverberação.", + "Reverb Dry Gain": "Ganho Seco da Reverberação", + "Set the dry gain of the reverb.": "Definir o ganho seco da reverberação.", + "Reverb Width": "Largura da Reverberação", + "Set the width of the reverb.": "Definir a largura da reverberação.", + "Vocals Volume": "Volume dos Vocais", + "Adjust the volume of the vocals.": "Ajustar o volume dos vocais.", + "Instrumentals Volume": "Volume dos Instrumentais", + "Adjust the volume of the Instrumentals.": "Ajustar o volume dos instrumentais.", + "Backing Vocals Volume": "Volume dos Vocais de Apoio", + "Adjust the volume of the backing vocals.": "Ajustar o volume dos vocais de apoio.", + "Device Settings": 
"Configurações do Dispositivo", + "Device": "Dispositivo", + "Select the device to use for the conversion. 0 to ∞ separated by - and for CPU leave only an -": "Selecione o dispositivo a ser usado para a conversão. 0 a ∞ separados por - e para CPU deixe apenas um -", + "Convert": "Converter", + "Output Information": "Informações de Saída", + "The output information will be displayed here.": "As informações de saída serão exibidas aqui.", + "Export Audio": "Exportar Áudio", + "Music URL": "URL da Música", + "Download": "Baixar", + "Model URL": "URL do Modelo" +} diff --git a/assets/i18n/scan.py b/assets/i18n/scan.py new file mode 100644 index 0000000000000000000000000000000000000000..7cd584fa2af8480f443c51417c442fecf5197d11 --- /dev/null +++ b/assets/i18n/scan.py @@ -0,0 +1,71 @@ +import ast +import json +from pathlib import Path +from collections import OrderedDict + + +def extract_i18n_strings(node): + i18n_strings = [] + + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "i18n" + ): + for arg in node.args: + if isinstance(arg, ast.Str): + i18n_strings.append(arg.s) + + for child_node in ast.iter_child_nodes(node): + i18n_strings.extend(extract_i18n_strings(child_node)) + + return i18n_strings + + +def process_file(file_path): + with open(file_path, "r", encoding="utf8") as file: + code = file.read() + if "I18nAuto" in code: + tree = ast.parse(code) + i18n_strings = extract_i18n_strings(tree) + print(file_path, len(i18n_strings)) + return i18n_strings + return [] + + +# Use pathlib for file handling +py_files = Path(".").rglob("*.py") + +# Use a set to store unique strings +code_keys = set() + +for py_file in py_files: + strings = process_file(py_file) + code_keys.update(strings) + +print() +print("Total unique:", len(code_keys)) + +standard_file = "languages/en_US.json" +with open(standard_file, "r", encoding="utf-8") as file: + standard_data = json.load(file, object_pairs_hook=OrderedDict) +standard_keys = set(standard_data.keys()) + +# Combine unused and missing keys sections +unused_keys = standard_keys - code_keys +missing_keys = code_keys - standard_keys + +print("Unused keys:", len(unused_keys)) +for unused_key in unused_keys: + print("\t", unused_key) + +print("Missing keys:", len(missing_keys)) +for missing_key in missing_keys: + print("\t", missing_key) + +code_keys_dict = OrderedDict((s, s) for s in code_keys) + +# Use context manager for writing back to the file +with open(standard_file, "w", encoding="utf-8") as file: + json.dump(code_keys_dict, file, ensure_ascii=False, indent=4, sort_keys=True) + file.write("\n") diff --git a/core.py b/core.py new file mode 100644 index 0000000000000000000000000000000000000000..7c66580d3733e50598e6a15178f700d05fdb6868 --- /dev/null +++ b/core.py @@ -0,0 +1,1023 @@ +import sys, os +import subprocess +import torch +from functools import lru_cache +import shutil +from pedalboard import Pedalboard, Reverb +from pedalboard.io import AudioFile +from pydub import AudioSegment +from audio_separator.separator import Separator +import logging +import yaml + +now_dir = os.getcwd() +sys.path.append(now_dir) +from programs.applio_code.rvc.infer.infer import VoiceConverter +from programs.applio_code.rvc.lib.tools.model_download import model_download_pipeline +from programs.music_separation_code.inference import proc_file + +models_vocals = [ + { + "name": "Mel-Roformer by KimberleyJSN", + "path": os.path.join(now_dir, "models", "mel-vocals"), + "model": os.path.join(now_dir, "models", "mel-vocals", 
"model.ckpt"), + "config": os.path.join(now_dir, "models", "mel-vocals", "config.yaml"), + "type": "mel_band_roformer", + "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml", + "model_url": "https://huggingface.co/KimberleyJSN/melbandroformer/resolve/main/MelBandRoformer.ckpt", + }, + { + "name": "BS-Roformer by ViperX", + "path": os.path.join(now_dir, "models", "bs-vocals"), + "model": os.path.join(now_dir, "models", "bs-vocals", "model.ckpt"), + "config": os.path.join(now_dir, "models", "bs-vocals", "config.yaml"), + "type": "bs_roformer", + "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml", + "model_url": "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt", + }, + { + "name": "MDX23C", + "path": os.path.join(now_dir, "models", "mdx23c-vocals"), + "model": os.path.join(now_dir, "models", "mdx23c-vocals", "model.ckpt"), + "config": os.path.join(now_dir, "models", "mdx23c-vocals", "config.yaml"), + "type": "mdx23c", + "config_url": "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/config_vocals_mdx23c.yaml", + "model_url": "https://github.com/ZFTurbo/Music-Source-Separation-Training/releases/download/v1.0.0/model_vocals_mdx23c_sdr_10.17.ckpt", + }, +] + +karaoke_models = [ + { + "name": "Mel-Roformer Karaoke by aufr33 and viperx", + "path": os.path.join(now_dir, "models", "mel-kara"), + "model": os.path.join(now_dir, "models", "mel-kara", "model.ckpt"), + "config": os.path.join(now_dir, "models", "mel-kara", "config.yaml"), + "type": "mel_band_roformer", + "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel_band_roformer_karaoke_aufr33_viperx/config_mel_band_roformer_karaoke.yaml", + "model_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel_band_roformer_karaoke_aufr33_viperx/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt", + }, + { + "name": "UVR-BVE", + "full_name": "UVR-BVE-4B_SN-44100-1.pth", + "arch": "vr", + }, +] + +denoise_models = [ + { + "name": "Mel-Roformer Denoise Normal by aufr33", + "path": os.path.join(now_dir, "models", "mel-denoise"), + "model": os.path.join(now_dir, "models", "mel-denoise", "model.ckpt"), + "config": os.path.join(now_dir, "models", "mel-denoise", "config.yaml"), + "type": "mel_band_roformer", + "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel-denoise/model_mel_band_roformer_denoise.yaml", + "model_url": "https://huggingface.co/jarredou/aufr33_MelBand_Denoise/resolve/main/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt", + }, + { + "name": "Mel-Roformer Denoise Aggressive by aufr33", + "path": os.path.join(now_dir, "models", "mel-denoise-aggr"), + "model": os.path.join(now_dir, "models", "mel-denoise-aggr", "model.ckpt"), + "config": os.path.join(now_dir, "models", "mel-denoise-aggr", "config.yaml"), + "type": "mel_band_roformer", + "config_url": "https://huggingface.co/shiromiya/audio-separation-models/resolve/main/mel-denoise/model_mel_band_roformer_denoise.yaml", + "model_url": "https://huggingface.co/jarredou/aufr33_MelBand_Denoise/resolve/main/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt", + }, + { + "name": "UVR Denoise", + "full_name": "UVR-DeNoise.pth", + "arch": "vr", + }, +] + +dereverb_models = [ + { + 
"name": "MDX23C DeReverb by aufr33 and jarredou", + "path": os.path.join(now_dir, "models", "mdx23c-dereveb"), + "model": os.path.join(now_dir, "models", "mdx23c-dereveb", "model.ckpt"), + "config": os.path.join(now_dir, "models", "mdx23c-dereveb", "config.yaml"), + "type": "mdx23c", + "config_url": "https://huggingface.co/jarredou/aufr33_jarredou_MDXv3_DeReverb/resolve/main/config_dereverb_mdx23c.yaml", + "model_url": "https://huggingface.co/jarredou/aufr33_jarredou_MDXv3_DeReverb/resolve/main/dereverb_mdx23c_sdr_6.9096.ckpt", + }, + { + "name": "BS-Roformer Dereverb by anvuew", + "path": os.path.join(now_dir, "models", "mdx23c-dereveb"), + "model": os.path.join(now_dir, "models", "mdx23c-dereveb", "model.ckpt"), + "config": os.path.join(now_dir, "models", "mdx23c-dereveb", "config.yaml"), + "type": "bs_roformer", + "config_url": "https://huggingface.co/anvuew/deverb_bs_roformer/resolve/main/deverb_bs_roformer_8_384dim_10depth.yaml", + "model_url": "https://huggingface.co/anvuew/deverb_bs_roformer/resolve/main/deverb_bs_roformer_8_384dim_10depth.ckpt", + }, + { + "name": "UVR-Deecho-Dereverb", + "full_name": "UVR-DeEcho-DeReverb.pth", + "arch": "vr", + }, + { + "name": "MDX Reverb HQ by FoxJoy", + "full_name": "Reverb_HQ_By_FoxJoy.onnx", + "arch": "mdx", + }, +] + +deecho_models = [ + { + "name": "UVR-Deecho-Normal", + "full_name": "UVR-De-Echo-Normal.pth", + "arch": "vr", + }, + { + "name": "UVR-Deecho-Agggressive", + "full_name": "UVR-De-Echo-Aggressive.pth", + "arch": "vr", + }, +] + + +@lru_cache(maxsize=None) +def import_voice_converter(): + from programs.applio_code.rvc.infer.infer import VoiceConverter + + return VoiceConverter() + + +@lru_cache(maxsize=1) +def get_config(): + from programs.applio_code.rvc.configs.config import Config + + return Config() + + +def download_file(url, path, filename): + os.makedirs(path, exist_ok=True) + file_path = os.path.join(path, filename) + + if os.path.exists(file_path): + print(f"File '{filename}' already exists at '{path}'.") + return + + try: + response = torch.hub.download_url_to_file(url, file_path) + print(f"File '{filename}' downloaded successfully") + except Exception as e: + print(f"Error downloading file '{filename}' from '{url}': {e}") + + +def get_model_info_by_name(model_name): + all_models = ( + models_vocals + + karaoke_models + + dereverb_models + + deecho_models + + denoise_models + ) + for model in all_models: + if model["name"] == model_name: + return model + return None + + +def get_last_modified_file(pasta): + if not os.path.isdir(pasta): + raise NotADirectoryError(f"{pasta} is not a valid directory.") + arquivos = [f for f in os.listdir(pasta) if os.path.isfile(os.path.join(pasta, f))] + if not arquivos: + return None + return max(arquivos, key=lambda x: os.path.getmtime(os.path.join(pasta, x))) + + +def search_with_word(folder, word): + if not os.path.isdir(folder): + raise NotADirectoryError(f"{folder} is not a valid directory.") + file_with_word = [file for file in os.listdir(folder) if word in file] + if not file_with_word: + return None + most_recent_file = max( + file_with_word, key=lambda file: os.path.getmtime(os.path.join(folder, file)) + ) + return most_recent_file + + +def search_with_two_words(folder, word1, word2): + if not os.path.isdir(folder): + raise NotADirectoryError(f"{folder} is not a valid directory.") + file_with_words = [ + file for file in os.listdir(folder) if word1 in file and word2 in file + ] + if not file_with_words: + return None + most_recent_file = max( + file_with_words, key=lambda file: 
os.path.getmtime(os.path.join(folder, file)) + ) + return most_recent_file + + +def get_last_modified_folder(path): + directories = [ + os.path.join(path, d) + for d in os.listdir(path) + if os.path.isdir(os.path.join(path, d)) + ] + if not directories: + return None + last_modified_folder = max(directories, key=os.path.getmtime) + return last_modified_folder + + +def add_audio_effects( + audio_path, + reverb_size, + reverb_wet, + reverb_dry, + reverb_damping, + reverb_width, + output_path, +): + board = Pedalboard([]) + board.append( + Reverb( + room_size=reverb_size, + dry_level=reverb_dry, + wet_level=reverb_wet, + damping=reverb_damping, + width=reverb_width, + ) + ) + with AudioFile(audio_path) as f: + with AudioFile(output_path, "w", f.samplerate, f.num_channels) as o: + while f.tell() < f.frames: + chunk = f.read(int(f.samplerate)) + effected = board(chunk, f.samplerate, reset=False) + o.write(effected) + return output_path + + +def merge_audios( + vocals_path, + inst_path, + backing_path, + output_path, + main_gain, + inst_gain, + backing_Vol, + output_format, +): + main_vocal_audio = AudioSegment.from_file(vocals_path, format="flac") + main_gain + instrumental_audio = AudioSegment.from_file(inst_path, format="flac") + inst_gain + backing_vocal_audio = ( + AudioSegment.from_file(backing_path, format="flac") + backing_Vol + ) + combined_audio = main_vocal_audio.overlay( + instrumental_audio.overlay(backing_vocal_audio) + ) + combined_audio.export(output_path, format=output_format) + return output_path + + +def check_fp16_support(device): + i_device = int(str(device).split(":")[-1]) + gpu_name = torch.cuda.get_device_name(i_device) + low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"] + if any(gpu in gpu_name for gpu in low_end_gpus) and "V100" not in gpu_name.upper(): + print(f"Your GPU {gpu_name} not support FP16 inference. 
Using FP32 instead.") + return False + return True + + +def full_inference_program( + model_path, + index_path, + input_audio_path, + output_path, + export_format_rvc, + split_audio, + autotune, + vocal_model, + karaoke_model, + dereverb_model, + deecho, + deecho_model, + denoise, + denoise_model, + reverb, + vocals_volume, + instrumentals_volume, + backing_vocals_volume, + export_format_final, + devices, + pitch, + filter_radius, + index_rate, + rms_mix_rate, + protect, + pitch_extract, + hop_lenght, + reverb_room_size, + reverb_damping, + reverb_wet_gain, + reverb_dry_gain, + reverb_width, + embedder_model, + delete_audios, + use_tta, + batch_size, + infer_backing_vocals, + infer_backing_vocals_model, + infer_backing_vocals_index, + change_inst_pitch, + pitch_back, + filter_radius_back, + index_rate_back, + rms_mix_rate_back, + protect_back, + pitch_extract_back, + hop_length_back, + export_format_rvc_back, + split_audio_back, + autotune_back, + embedder_model_back, +): + if torch.cuda.is_available(): + n_gpu = torch.cuda.device_count() + devices = devices.replace("-", " ") + print(f"Number of GPUs available: {n_gpu}") + first_device = devices.split()[0] + fp16 = check_fp16_support(first_device) + else: + devices = "cpu" + print("Using CPU") + fp16 = False + + music_folder = os.path.splitext(os.path.basename(input_audio_path))[0] + + # Vocals Separation + model_info = get_model_info_by_name(vocal_model) + model_ckpt_path = os.path.join(model_info["path"], "model.ckpt") + if not os.path.exists(model_ckpt_path): + download_file( + model_info["model_url"], + model_info["path"], + "model.ckpt", + ) + config_json_path = os.path.join(model_info["path"], "config.yaml") + if not os.path.exists(config_json_path): + download_file( + model_info["config_url"], + model_info["path"], + "config.yaml", + ) + if not fp16: + with open(model_info["config"], "r") as file: + config = yaml.safe_load(file) + + config["training"]["use_amp"] = False + + with open(model_info["config"], "w") as file: + yaml.safe_dump(config, file) + store_dir = os.path.join(now_dir, "audio_files", music_folder, "vocals") + inst_dir = os.path.join(now_dir, "audio_files", music_folder, "instrumentals") + os.makedirs(store_dir, exist_ok=True) + os.makedirs(inst_dir, exist_ok=True) + input_audio_basename = os.path.splitext(os.path.basename(input_audio_path))[0] + search_result = search_with_word(store_dir, "vocals") + if search_result: + print("Vocals already separated"), + else: + print("Separating vocals") + command = [ + "python", + os.path.join(now_dir, "programs", "music_separation_code", "inference.py"), + "--model_type", + model_info["type"], + "--config_path", + model_info["config"], + "--start_check_point", + model_info["model"], + "--input_file", + input_audio_path, + "--store_dir", + store_dir, + "--flac_file", + "--pcm_type", + "PCM_16", + "--extract_instrumental", + ] + + if devices == "cpu": + command.append("--force_cpu") + else: + device_ids = [str(int(device)) for device in devices.split()] + command.extend(["--device_ids"] + device_ids) + + subprocess.run(command) + os.rename( + os.path.join( + store_dir, + search_with_two_words( + store_dir, + os.path.basename(input_audio_path).split(".")[0], + "instrumental", + ), + ), + os.path.join( + inst_dir, + f"{os.path.basename(input_audio_path).split('.')[0]}_instrumentals.flac", + ), + ) + inst_file = os.path.join( + inst_dir, + search_with_two_words( + inst_dir, os.path.basename(input_audio_path).split(".")[0], "instrumentals" + ), + ) + + # karaoke separation + 
model_info = get_model_info_by_name(karaoke_model) + store_dir = os.path.join(now_dir, "audio_files", music_folder, "karaoke") + os.makedirs(store_dir, exist_ok=True) + vocals_path = os.path.join(now_dir, "audio_files", music_folder, "vocals") + input_file = search_with_word(vocals_path, "vocals") + karaoke_exists = search_with_word(store_dir, "karaoke") is not None + + if karaoke_exists: + print("Backing vocals already separated") + else: + if input_file: + input_file = os.path.join(vocals_path, input_file) + print("Separating Backing vocals") + if model_info["name"] == "Mel-Roformer Karaoke by aufr33 and viperx": + model_ckpt_path = os.path.join(model_info["path"], "model.ckpt") + if not os.path.exists(model_ckpt_path): + download_file( + model_info["model_url"], + model_info["path"], + "model.ckpt", + ) + config_json_path = os.path.join(model_info["path"], "config.yaml") + if not os.path.exists(config_json_path): + download_file( + model_info["config_url"], + model_info["path"], + "config.yaml", + ) + if not fp16: + with open(model_info["config"], "r") as file: + config = yaml.safe_load(file) + + config["training"]["use_amp"] = False + + with open(model_info["config"], "w") as file: + yaml.safe_dump(config, file) + + command = [ + "python", + os.path.join( + now_dir, "programs", "music_separation_code", "inference.py" + ), + "--model_type", + model_info["type"], + "--config_path", + model_info["config"], + "--start_check_point", + model_info["model"], + "--input_file", + input_file, + "--store_dir", + store_dir, + "--flac_file", + "--pcm_type", + "PCM_16", + "--extract_instrumental", + ] + + if devices == "cpu": + command.append("--force_cpu") + else: + device_ids = [str(int(device)) for device in devices.split()] + command.extend(["--device_ids"] + device_ids) + + subprocess.run(command) + else: + separator = Separator( + model_file_dir=os.path.join(now_dir, "models", "karaoke"), + log_level=logging.WARNING, + normalization_threshold=1.0, + output_format="flac", + output_dir=store_dir, + vr_params={ + "batch_size": batch_size, + "enable_tta": use_tta, + }, + ) + separator.load_model(model_filename=model_info["full_name"]) + separator.separate(input_file) + karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke") + vocals_result = search_with_two_words( + karaoke_path, + os.path.basename(input_audio_path).split(".")[0], + "Vocals", + ) + instrumental_result = search_with_two_words( + karaoke_path, + os.path.basename(input_audio_path).split(".")[0], + "Instrumental", + ) + if "UVR-BVE-4B_SN-44100-1" in os.path.basename(vocals_result): + os.rename( + os.path.join(karaoke_path, vocals_result), + os.path.join( + karaoke_path, + f"{os.path.basename(input_audio_path).split('.')[0]}_karaoke.flac", + ), + ) + if "UVR-BVE-4B_SN-44100-1" in os.path.basename(instrumental_result): + os.rename( + os.path.join(karaoke_path, instrumental_result), + os.path.join( + karaoke_path, + f"{os.path.basename(input_audio_path).split('.')[0]}_instrumental.flac", + ), + ) + + # dereverb + model_info = get_model_info_by_name(dereverb_model) + store_dir = os.path.join(now_dir, "audio_files", music_folder, "dereverb") + os.makedirs(store_dir, exist_ok=True) + karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke") + input_file = search_with_word(karaoke_path, "karaoke") + noreverb_exists = search_with_word(store_dir, "noreverb") is not None + if noreverb_exists: + print("Reverb already removed") + else: + if input_file: + input_file = os.path.join(karaoke_path, input_file) + 
print("Removing reverb") + if ( + model_info["name"] == "BS-Roformer Dereverb by anvuew" + or model_info["name"] == "MDX23C DeReverb by aufr33 and jarredou" + ): + model_ckpt_path = os.path.join(model_info["path"], "model.ckpt") + if not os.path.exists(model_ckpt_path): + download_file( + model_info["model_url"], + model_info["path"], + "model.ckpt", + ) + config_json_path = os.path.join(model_info["path"], "config.yaml") + if not os.path.exists(config_json_path): + download_file( + model_info["config_url"], + model_info["path"], + "config.yaml", + ) + if not fp16: + with open(model_info["config"], "r") as file: + config = yaml.safe_load(file) + + config["training"]["use_amp"] = False + + with open(model_info["config"], "w") as file: + yaml.safe_dump(config, file) + command = [ + "python", + os.path.join( + now_dir, "programs", "music_separation_code", "inference.py" + ), + "--model_type", + model_info["type"], + "--config_path", + model_info["config"], + "--start_check_point", + model_info["model"], + "--input_file", + input_file, + "--store_dir", + store_dir, + "--flac_file", + "--pcm_type", + "PCM_16", + ] + + if devices == "cpu": + command.append("--force_cpu") + else: + device_ids = [str(int(device)) for device in devices.split()] + command.extend(["--device_ids"] + device_ids) + + subprocess.run(command) + else: + if model_info["arch"] == "vr": + separator = Separator( + model_file_dir=os.path.join(now_dir, "models", "dereverb"), + log_level=logging.WARNING, + normalization_threshold=1.0, + output_format="flac", + output_dir=store_dir, + output_single_stem="No Reverb", + vr_params={ + "batch_size": batch_size, + "enable_tta": use_tta, + }, + ) + else: + separator = Separator( + model_file_dir=os.path.join(now_dir, "models", "dereverb"), + log_level=logging.WARNING, + normalization_threshold=1.0, + output_format="flac", + output_dir=store_dir, + output_single_stem="No Reverb", + ) + separator.load_model(model_filename=model_info["full_name"]) + separator.separate(input_file) + dereverb_path = os.path.join( + now_dir, "audio_files", music_folder, "dereverb" + ) + search_result = search_with_two_words( + dereverb_path, + os.path.basename(input_audio_path).split(".")[0], + "No Reverb", + ) + if "UVR-DeEcho-DeReverb" in os.path.basename( + search_result + ) or "MDX Reverb HQ by FoxJoy" in os.path.basename(search_result): + os.rename( + os.path.join(dereverb_path, search_result), + os.path.join( + dereverb_path, + f"{os.path.basename(input_audio_path).split('.')[0]}_noreverb.flac", + ), + ) + + # deecho + store_dir = os.path.join(now_dir, "audio_files", music_folder, "deecho") + os.makedirs(store_dir, exist_ok=True) + if deecho: + no_echo_exists = search_with_word(store_dir, "noecho") is not None + if no_echo_exists: + print("Echo already removed") + else: + print("Removing echo") + model_info = get_model_info_by_name(deecho_model) + + dereverb_path = os.path.join( + now_dir, "audio_files", music_folder, "dereverb" + ) + noreverb_file = search_with_word(dereverb_path, "noreverb") + + input_file = os.path.join(dereverb_path, noreverb_file) + + separator = Separator( + model_file_dir=os.path.join(now_dir, "models", "deecho"), + log_level=logging.WARNING, + normalization_threshold=1.0, + output_format="flac", + output_dir=store_dir, + output_single_stem="No Echo", + vr_params={ + "batch_size": batch_size, + "enable_tta": use_tta, + }, + ) + separator.load_model(model_filename=model_info["full_name"]) + separator.separate(input_file) + deecho_path = os.path.join(now_dir, "audio_files", 
music_folder, "deecho") + search_result = search_with_two_words( + deecho_path, + os.path.basename(input_audio_path).split(".")[0], + "No Echo", + ) + if "UVR-De-Echo-Normal" in os.path.basename( + search_result + ) or "UVR-Deecho-Agggressive" in os.path.basename(search_result): + os.rename( + os.path.join(deecho_path, search_result), + os.path.join( + deecho_path, + f"{os.path.basename(input_audio_path).split('.')[0]}_noecho.flac", + ), + ) + + # denoise + store_dir = os.path.join(now_dir, "audio_files", music_folder, "denoise") + os.makedirs(store_dir, exist_ok=True) + if denoise: + no_noise_exists = search_with_word(store_dir, "dry") is not None + if no_noise_exists: + print("Noise already removed") + else: + model_info = get_model_info_by_name(denoise_model) + print("Removing noise") + input_file = ( + os.path.join( + now_dir, + "audio_files", + music_folder, + "deecho", + search_with_word( + os.path.join(now_dir, "audio_files", music_folder, "deecho"), + "noecho", + ), + ) + if deecho + else os.path.join( + now_dir, + "audio_files", + music_folder, + "dereverb", + search_with_word( + os.path.join(now_dir, "audio_files", music_folder, "dereverb"), + "noreverb", + ), + ) + ) + + if ( + model_info["name"] == "Mel-Roformer Denoise Normal by aufr33" + or model_info["name"] == "Mel-Roformer Denoise Aggressive by aufr33" + ): + model_ckpt_path = os.path.join(model_info["path"], "model.ckpt") + if not os.path.exists(model_ckpt_path): + download_file( + model_info["model_url"], + model_info["path"], + "model.ckpt", + ) + config_json_path = os.path.join(model_info["path"], "config.yaml") + if not os.path.exists(config_json_path): + download_file( + model_info["config_url"], model_info["path"], "config.yaml" + ) + if not fp16: + with open(model_info["config"], "r") as file: + config = yaml.safe_load(file) + + config["training"]["use_amp"] = False + + with open(model_info["config"], "w") as file: + yaml.safe_dump(config, file) + command = [ + "python", + os.path.join( + now_dir, "programs", "music_separation_code", "inference.py" + ), + "--model_type", + model_info["type"], + "--config_path", + model_info["config"], + "--start_check_point", + model_info["model"], + "--input_file", + input_file, + "--store_dir", + store_dir, + "--flac_file", + "--pcm_type", + "PCM_16", + ] + + if devices == "cpu": + command.append("--force_cpu") + else: + device_ids = [str(int(device)) for device in devices.split()] + command.extend(["--device_ids"] + device_ids) + + subprocess.run(command) + else: + separator = Separator( + model_file_dir=os.path.join(now_dir, "models", "denoise"), + log_level=logging.WARNING, + normalization_threshold=1.0, + output_format="flac", + output_dir=store_dir, + output_single_stem="No Noise", + vr_params={ + "batch_size": batch_size, + "enable_tta": use_tta, + }, + ) + separator.load_model(model_filename=model_info["full_name"]) + separator.separate(input_file) + search_result = search_with_two_words( + deecho_path, + os.path.basename(input_audio_path).split(".")[0], + "No Noise", + ) + if "UVR Denoise" in os.path.basename(search_result): + os.rename( + os.path.join(deecho_path, search_result), + os.path.join( + deecho_path, + f"{os.path.basename(input_audio_path).split('.')[0]}_dry.flac", + ), + ) + + # RVC + denoise_path = os.path.join(now_dir, "audio_files", music_folder, "denoise") + deecho_path = os.path.join(now_dir, "audio_files", music_folder, "deecho") + dereverb_path = os.path.join(now_dir, "audio_files", music_folder, "dereverb") + + denoise_audio = search_with_two_words( + 
denoise_path, os.path.basename(input_audio_path).split(".")[0], "dry" + ) + deecho_audio = search_with_two_words( + deecho_path, os.path.basename(input_audio_path).split(".")[0], "noecho" + ) + dereverb = search_with_two_words( + dereverb_path, os.path.basename(input_audio_path).split(".")[0], "noreverb" + ) + + if denoise_audio: + final_path = os.path.join( + now_dir, "audio_files", music_folder, "denoise", denoise_audio + ) + elif deecho_audio: + final_path = os.path.join( + now_dir, "audio_files", music_folder, "deecho", deecho_audio + ) + elif dereverb: + final_path = os.path.join( + now_dir, "audio_files", music_folder, "dereverb", dereverb + ) + else: + final_path = None + + store_dir = os.path.join(now_dir, "audio_files", music_folder, "rvc") + os.makedirs(store_dir, exist_ok=True) + print("Making RVC inference") + output_rvc = os.path.join( + now_dir, + "audio_files", + music_folder, + "rvc", + f"{os.path.basename(input_audio_path).split('.')[0]}_rvc.wav", + ) + inference_vc = import_voice_converter() + inference_vc.convert_audio( + audio_input_path=final_path, + audio_output_path=output_rvc, + model_path=model_path, + index_path=index_path, + embedder_model=embedder_model, + pitch=pitch, + f0_file=None, + f0_method=pitch_extract, + filter_radius=filter_radius, + index_rate=index_rate, + volume_envelope=rms_mix_rate, + protect=protect, + split_audio=split_audio, + f0_autotune=autotune, + hop_length=hop_lenght, + export_format=export_format_rvc, + embedder_model_custom=None, + ) + backing_vocals = os.path.join( + karaoke_path, search_with_word(karaoke_path, "instrumental") + ) + + if infer_backing_vocals: + print("Infering backing vocals") + karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke") + instrumental_file = search_with_word(karaoke_path, "instrumental") + backing_vocals = os.path.join(karaoke_path, instrumental_file) + output_backing_vocals = os.path.join( + karaoke_path, f"{input_audio_basename}_instrumental_output.wav" + ) + inference_vc.convert_audio( + audio_input_path=backing_vocals, + audio_output_path=output_backing_vocals, + model_path=infer_backing_vocals_model, + index_path=infer_backing_vocals_index, + embedder_model=embedder_model_back, + pitch=pitch_back, + f0_file=None, + f0_method=pitch_extract_back, + filter_radius=filter_radius_back, + index_rate=index_rate_back, + volume_envelope=rms_mix_rate_back, + protect=protect_back, + split_audio=split_audio_back, + f0_autotune=autotune_back, + hop_length=hop_length_back, + export_format=export_format_rvc_back, + embedder_model_custom=None, + ) + backing_vocals = output_backing_vocals + + # post process + if reverb: + add_audio_effects( + os.path.join( + now_dir, + "audio_files", + music_folder, + "rvc", + get_last_modified_file( + os.path.join(now_dir, "audio_files", music_folder, "rvc") + ), + ), + reverb_room_size, + reverb_wet_gain, + reverb_dry_gain, + reverb_damping, + reverb_width, + os.path.join( + now_dir, + "audio_files", + music_folder, + "rvc", + os.path.basename(input_audio_path), + ), + ) + if change_inst_pitch != 0: + print("Changing instrumental pitch") + inst_path = os.path.join( + now_dir, + "audio_files", + music_folder, + "instrumentals", + search_with_word( + os.path.join(now_dir, "audio_files", music_folder, "instrumentals"), + "instrumentals", + ), + ) + audio = AudioSegment.from_file(inst_path) + + factor = 2 ** (change_inst_pitch / 12) + + new_frame_rate = int(audio.frame_rate * factor) + audio = audio._spawn(audio.raw_data, overrides={"frame_rate": new_frame_rate}) + + 
audio = audio.set_frame_rate(audio.frame_rate) + output_dir_pitch = os.path.join( + now_dir, "audio_files", music_folder, "instrumentals" + ) + output_path_pitch = os.path.join( + output_dir_pitch, "inst_with_changed_pitch.flac" + ) + audio.export(output_path_pitch, format="flac") + + # merge audios + store_dir = os.path.join(now_dir, "audio_files", music_folder, "final") + os.makedirs(store_dir, exist_ok=True) + + vocals_path = os.path.join(now_dir, "audio_files", music_folder, "rvc") + vocals_file = get_last_modified_file( + os.path.join(now_dir, "audio_files", music_folder, "rvc") + ) + vocals_file = os.path.join(vocals_path, vocals_file) + + karaoke_path = os.path.join(now_dir, "audio_files", music_folder, "karaoke") + karaoke_file = search_with_word(karaoke_path, "Instrumental") or search_with_word( + karaoke_path, "instrumental" + ) + karaoke_file = os.path.join(karaoke_path, karaoke_file) + final_output_path = os.path.join( + now_dir, + "audio_files", + music_folder, + "final", + f"{os.path.basename(input_audio_path).split('.')[0]}_final.{export_format_final.lower()}", + ) + print("Merging audios") + result = merge_audios( + vocals_file, + inst_file, + backing_vocals, + final_output_path, + vocals_volume, + instrumentals_volume, + backing_vocals_volume, + export_format_final, + ) + print("Audios merged!") + if delete_audios: + main_directory = os.path.join(now_dir, "audio_files", music_folder) + folder_to_keep = "final" + for folder_name in os.listdir(main_directory): + folder_path = os.path.join(main_directory, folder_name) + if os.path.isdir(folder_path) and folder_name != folder_to_keep: + shutil.rmtree(folder_path) + return ( + f"Audio file {os.path.basename(input_audio_path).split('.')[0]} converted with success", + result, + ) + + +def download_model(link): + model_download_pipeline(link) + return "Model downloaded with success" + + +def download_music(link): + os.makedirs(os.path.join(now_dir, "audio_files", "original_files"), exist_ok=True) + command = [ + "yt-dlp", + "-x", + "--output", + os.path.join(now_dir, "audio_files", "original_files", "%(title)s.%(ext)s"), + link, + ] + subprocess.run(command) + return "Music downloaded with success" diff --git a/logs/.gitkeep b/logs/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..a45f7e7dd4063596187a49bb73bca5c32a6fba22 --- /dev/null +++ b/main.py @@ -0,0 +1,53 @@ +import gradio as gr +import sys, os +from tabs.full_inference import full_inference_tab +from tabs.download_model import download_model_tab + +now_dir = os.getcwd() +sys.path.append(now_dir) +DEFAULT_PORT = 7755 +MAX_PORT_ATTEMPTS = 10 + +from assets.i18n.i18n import I18nAuto + +i18n = I18nAuto() + + +with gr.Blocks(title="hexGen-RVC", css="footer{display:none !important}") as app: + gr.Markdown("# hexGen RVC") + with gr.Tab(i18n("Full Inference")): + full_inference_tab() + with gr.Tab(i18n("Download Model")): + download_model_tab() + + +def launch(port): + app.launch( + share="--share" in sys.argv, + inbrowser="--open" in sys.argv, + server_port=port, + ) + + +def get_port_from_args(): + if "--port" in sys.argv: + port_index = sys.argv.index("--port") + 1 + if port_index < len(sys.argv): + return int(sys.argv[port_index]) + return DEFAULT_PORT + + +if __name__ == "__main__": + port = get_port_from_args() + for _ in range(MAX_PORT_ATTEMPTS): + try: + launch(port) + break + except OSError: + print( + 
f"Failed to launch on port {port}, trying again on port {port - 1}..." + ) + port -= 1 + except Exception as error: + print(f"An error occurred launching Gradio: {error}") + break diff --git a/programs/applio_code/rvc/configs/config.py b/programs/applio_code/rvc/configs/config.py new file mode 100644 index 0000000000000000000000000000000000000000..28a3519804d2ef88cf42b779ae3c69b1c00f157d --- /dev/null +++ b/programs/applio_code/rvc/configs/config.py @@ -0,0 +1,192 @@ +import torch +import json +import os + + +version_config_paths = [ + os.path.join("v1", "32000.json"), + os.path.join("v1", "40000.json"), + os.path.join("v1", "48000.json"), + os.path.join("v2", "48000.json"), + os.path.join("v2", "40000.json"), + os.path.join("v2", "32000.json"), +] + + +def singleton(cls): + instances = {} + + def get_instance(*args, **kwargs): + if cls not in instances: + instances[cls] = cls(*args, **kwargs) + return instances[cls] + + return get_instance + + +@singleton +class Config: + def __init__(self): + self.device = "cuda:0" if torch.cuda.is_available() else "cpu" + self.is_half = self.device != "cpu" + self.gpu_name = ( + torch.cuda.get_device_name(int(self.device.split(":")[-1])) + if self.device.startswith("cuda") + else None + ) + self.json_config = self.load_config_json() + self.gpu_mem = None + self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() + + def load_config_json(self) -> dict: + configs = {} + for config_file in version_config_paths: + config_path = os.path.join( + "programs", "applio_code", "rvc", "configs", config_file + ) + with open(config_path, "r") as f: + configs[config_file] = json.load(f) + return configs + + def has_mps(self) -> bool: + # Check if Metal Performance Shaders are available - for macOS 12.3+. + return torch.backends.mps.is_available() + + def has_xpu(self) -> bool: + # Check if XPU is available. + return hasattr(torch, "xpu") and torch.xpu.is_available() + + def set_precision(self, precision): + if precision not in ["fp32", "fp16"]: + raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.") + + fp16_run_value = precision == "fp16" + preprocess_target_version = "3.7" if precision == "fp16" else "3.0" + preprocess_path = os.path.join( + os.path.dirname(__file__), + os.pardir, + "rvc", + "train", + "preprocess", + "preprocess.py", + ) + + for config_path in version_config_paths: + full_config_path = os.path.join( + "programs", "applio_code", "rvc", "configs", config_path + ) + try: + with open(full_config_path, "r") as f: + config = json.load(f) + config["train"]["fp16_run"] = fp16_run_value + with open(full_config_path, "w") as f: + json.dump(config, f, indent=4) + except FileNotFoundError: + print(f"File not found: {full_config_path}") + + if os.path.exists(preprocess_path): + with open(preprocess_path, "r") as f: + preprocess_content = f.read() + preprocess_content = preprocess_content.replace( + "3.0" if precision == "fp16" else "3.7", preprocess_target_version + ) + with open(preprocess_path, "w") as f: + f.write(preprocess_content) + + return f"Overwritten preprocess and config.json to use {precision}." 
+ + def get_precision(self): + if not version_config_paths: + raise FileNotFoundError("No configuration paths provided.") + + full_config_path = os.path.join( + "programs", "applio_code", "rvc", "configs", version_config_paths[0] + ) + try: + with open(full_config_path, "r") as f: + config = json.load(f) + fp16_run_value = config["train"].get("fp16_run", False) + precision = "fp16" if fp16_run_value else "fp32" + return precision + except FileNotFoundError: + print(f"File not found: {full_config_path}") + return None + + def device_config(self) -> tuple: + if self.device.startswith("cuda"): + self.set_cuda_config() + elif self.has_mps(): + self.device = "mps" + self.is_half = False + self.set_precision("fp32") + else: + self.device = "cpu" + self.is_half = False + self.set_precision("fp32") + + # Configuration for 6GB GPU memory + x_pad, x_query, x_center, x_max = ( + (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41) + ) + if self.gpu_mem is not None and self.gpu_mem <= 4: + # Configuration for 5GB GPU memory + x_pad, x_query, x_center, x_max = (1, 5, 30, 32) + + return x_pad, x_query, x_center, x_max + + def set_cuda_config(self): + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + # Zluda + if self.gpu_name.endswith("[ZLUDA]"): + print("Zluda compatibility enabled, experimental feature.") + torch.backends.cudnn.enabled = False + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + torch.backends.cuda.enable_mem_efficient_sdp(False) + low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"] + if ( + any(gpu in self.gpu_name for gpu in low_end_gpus) + and "V100" not in self.gpu_name.upper() + ): + self.is_half = False + self.set_precision("fp32") + + self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // ( + 1024**3 + ) + + +def max_vram_gpu(gpu): + if torch.cuda.is_available(): + gpu_properties = torch.cuda.get_device_properties(gpu) + total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024) + return total_memory_gb + else: + return "0" + + +def get_gpu_info(): + ngpu = torch.cuda.device_count() + gpu_infos = [] + if torch.cuda.is_available() or ngpu != 0: + for i in range(ngpu): + gpu_name = torch.cuda.get_device_name(i) + mem = int( + torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024 + + 0.4 + ) + gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)") + if len(gpu_infos) > 0: + gpu_info = "\n".join(gpu_infos) + else: + gpu_info = "Unfortunately, there is no compatible GPU available to support your training." 
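+    # The returned string is either a newline-separated listing such as
+    # "0: <GPU name> (8 GB)" (illustrative) or the fallback message above.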
+ return gpu_info + + +def get_number_of_gpus(): + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + return "-".join(map(str, range(num_gpus))) + else: + return "-" diff --git a/programs/applio_code/rvc/configs/v1/32000.json b/programs/applio_code/rvc/configs/v1/32000.json new file mode 100644 index 0000000000000000000000000000000000000000..2f28f4f68083acbca0d2d7864aaed24d67df7f53 --- /dev/null +++ b/programs/applio_code/rvc/configs/v1/32000.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,4,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/programs/applio_code/rvc/configs/v1/40000.json b/programs/applio_code/rvc/configs/v1/40000.json new file mode 100644 index 0000000000000000000000000000000000000000..3961ddb6412c3a8c4310ec965f1fd20e3622d2f4 --- /dev/null +++ b/programs/applio_code/rvc/configs/v1/40000.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/programs/applio_code/rvc/configs/v1/48000.json b/programs/applio_code/rvc/configs/v1/48000.json new file mode 100644 index 0000000000000000000000000000000000000000..41ea3b62f5c575d370ca1b8a66755959402950cb --- /dev/null +++ b/programs/applio_code/rvc/configs/v1/48000.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 11520, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 
128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 256, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/programs/applio_code/rvc/configs/v2/32000.json b/programs/applio_code/rvc/configs/v2/32000.json new file mode 100644 index 0000000000000000000000000000000000000000..eabab7b5317c3b47963bc1f7ad4c1c002dbf1939 --- /dev/null +++ b/programs/applio_code/rvc/configs/v2/32000.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,8,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [20,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/programs/applio_code/rvc/configs/v2/40000.json b/programs/applio_code/rvc/configs/v2/40000.json new file mode 100644 index 0000000000000000000000000000000000000000..e1ba44a9c0cfadb57d0fab15a62a4cf40872ffe8 --- /dev/null +++ b/programs/applio_code/rvc/configs/v2/40000.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sample_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/programs/applio_code/rvc/configs/v2/48000.json b/programs/applio_code/rvc/configs/v2/48000.json new file mode 100644 index 0000000000000000000000000000000000000000..1a4da9f5c669d3a39644b7a8ae827ca454c2cb3f --- /dev/null +++ b/programs/applio_code/rvc/configs/v2/48000.json @@ -0,0 +1,43 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 17280, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + 
"sample_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "text_enc_hidden_dim": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [12,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [24,20,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/programs/applio_code/rvc/infer/infer.py b/programs/applio_code/rvc/infer/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..3d1ceced25f9aaae174f2fdf8a6d5e195db68620 --- /dev/null +++ b/programs/applio_code/rvc/infer/infer.py @@ -0,0 +1,470 @@ +import os +import sys +import time +import torch +import librosa +import logging +import traceback +import numpy as np +import soundfile as sf + +from scipy.io import wavfile + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from programs.applio_code.rvc.infer.pipeline import Pipeline as VC +from programs.applio_code.rvc.lib.utils import load_audio_infer, load_embedding +from programs.applio_code.rvc.lib.tools.split_audio import process_audio, merge_audio +from programs.applio_code.rvc.lib.algorithm.synthesizers import Synthesizer +from programs.applio_code.rvc.configs.config import Config + +logging.getLogger("httpx").setLevel(logging.WARNING) +logging.getLogger("httpcore").setLevel(logging.WARNING) +logging.getLogger("faiss").setLevel(logging.WARNING) +logging.getLogger("faiss.loader").setLevel(logging.WARNING) + + +class VoiceConverter: + """ + A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method. + """ + + def __init__(self): + """ + Initializes the VoiceConverter with default configuration, and sets up models and parameters. + """ + self.config = Config() # Load RVC configuration + self.hubert_model = ( + None # Initialize the Hubert model (for embedding extraction) + ) + self.last_embedder_model = None # Last used embedder model + self.tgt_sr = None # Target sampling rate for the output audio + self.net_g = None # Generator network for voice conversion + self.vc = None # Voice conversion pipeline instance + self.cpt = None # Checkpoint for loading model weights + self.version = None # Model version + self.n_spk = None # Number of speakers in the model + self.use_f0 = None # Whether the model uses F0 + + def load_hubert(self, embedder_model: str, embedder_model_custom: str = None): + """ + Loads the HuBERT model for speaker embedding extraction. + """ + self.hubert_model = load_embedding(embedder_model, embedder_model_custom) + self.hubert_model.to(self.config.device) + self.hubert_model = ( + self.hubert_model.half() + if self.config.is_half + else self.hubert_model.float() + ) + self.hubert_model.eval() + + @staticmethod + def convert_audio_format(input_path, output_path, output_format): + """ + Converts an audio file to a specified output format. 
+ """ + try: + if output_format != "WAV": + print(f"Converting audio to {output_format} format...") + audio, sample_rate = librosa.load(input_path, sr=None) + common_sample_rates = [ + 8000, + 11025, + 12000, + 16000, + 22050, + 24000, + 32000, + 44100, + 48000, + ] + target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate)) + audio = librosa.resample( + audio, orig_sr=sample_rate, target_sr=target_sr + ) + sf.write(output_path, audio, target_sr, format=output_format.lower()) + return output_path + except Exception as error: + print(f"An error occurred converting the audio format: {error}") + + def convert_audio( + self, + audio_input_path: str, + audio_output_path: str, + model_path: str, + index_path: str, + embedder_model: str, + pitch: int, + f0_file: str, + f0_method: str, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + split_audio: bool, + f0_autotune: bool, + filter_radius: int, + embedder_model_custom: str, + export_format: str, + resample_sr: int = 0, + sid: int = 0, + ): + """ + Performs voice conversion on the input audio. + """ + self.get_vc(model_path, sid) + + try: + start_time = time.time() + print(f"Converting audio '{audio_input_path}'...") + audio = load_audio_infer( + audio_input_path, + 16000, + ) + audio_max = np.abs(audio).max() / 0.95 + + if audio_max > 1: + audio /= audio_max + + if not self.hubert_model or embedder_model != self.last_embedder_model: + self.load_hubert(embedder_model, embedder_model_custom) + self.last_embedder_model = embedder_model + + file_index = ( + index_path.strip() + .strip('"') + .strip("\n") + .strip('"') + .strip() + .replace("trained", "added") + ) + + if self.tgt_sr != resample_sr >= 16000: + self.tgt_sr = resample_sr + + if split_audio: + result, new_dir_path = process_audio(audio_input_path) + if result == "Error": + return "Error with Split Audio", None + + dir_path = ( + new_dir_path.strip().strip('"').strip("\n").strip('"').strip() + ) + if dir_path: + paths = [ + os.path.join(root, name) + for root, _, files in os.walk(dir_path, topdown=False) + for name in files + if name.endswith(".wav") and root == dir_path + ] + try: + for path in paths: + self.convert_audio( + audio_input_path=path, + audio_output_path=path, + model_path=model_path, + index_path=index_path, + sid=sid, + pitch=pitch, + f0_file=None, + f0_method=f0_method, + index_rate=index_rate, + resample_sr=resample_sr, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + split_audio=False, + f0_autotune=f0_autotune, + filter_radius=filter_radius, + export_format=export_format, + embedder_model=embedder_model, + embedder_model_custom=embedder_model_custom, + ) + except Exception as error: + print(f"An error occurred processing the segmented audio: {error}") + print(traceback.format_exc()) + return f"Error {error}" + print("Finished processing segmented audio, now merging audio...") + merge_timestamps_file = os.path.join( + os.path.dirname(new_dir_path), + f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt", + ) + self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file) + os.remove(merge_timestamps_file) + sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV") + else: + audio_opt = self.vc.pipeline( + model=self.hubert_model, + net_g=self.net_g, + sid=sid, + audio=audio, + input_audio_path=audio_input_path, + pitch=pitch, + f0_method=f0_method, + file_index=file_index, + index_rate=index_rate, + pitch_guidance=self.use_f0, + filter_radius=filter_radius, + 
tgt_sr=self.tgt_sr, + resample_sr=resample_sr, + volume_envelope=volume_envelope, + version=self.version, + protect=protect, + hop_length=hop_length, + f0_autotune=f0_autotune, + f0_file=f0_file, + ) + + if audio_output_path: + sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV") + output_path_format = audio_output_path.replace( + ".wav", f".{export_format.lower()}" + ) + audio_output_path = self.convert_audio_format( + audio_output_path, output_path_format, export_format + ) + + elapsed_time = time.time() - start_time + print( + f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds." + ) + + except Exception as error: + print(f"An error occurred during audio conversion: {error}") + print(traceback.format_exc()) + + def convert_audio_batch( + self, + audio_input_paths: str, + audio_output_path: str, + model_path: str, + index_path: str, + embedder_model: str, + pitch: int, + f0_file: str, + f0_method: str, + index_rate: float, + volume_envelope: int, + protect: float, + hop_length: int, + split_audio: bool, + f0_autotune: bool, + filter_radius: int, + embedder_model_custom: str, + export_format: str, + resample_sr: int = 0, + sid: int = 0, + pid_file_path: str = None, + ): + """ + Performs voice conversion on a batch of input audio files. + """ + pid = os.getpid() + with open(pid_file_path, "w") as pid_file: + pid_file.write(str(pid)) + try: + if not self.hubert_model or embedder_model != self.last_embedder_model: + self.load_hubert(embedder_model, embedder_model_custom) + self.last_embedder_model = embedder_model + self.get_vc(model_path, sid) + file_index = ( + index_path.strip() + .strip('"') + .strip("\n") + .strip('"') + .strip() + .replace("trained", "added") + ) + start_time = time.time() + print(f"Converting audio batch '{audio_input_paths}'...") + audio_files = [ + f + for f in os.listdir(audio_input_paths) + if f.endswith((".mp3", ".wav", ".flac", ".m4a", ".ogg", ".opus")) + ] + print(f"Detected {len(audio_files)} audio files for inference.") + for i, audio_input_path in enumerate(audio_files): + audio_output_paths = os.path.join( + audio_output_path, + f"{os.path.splitext(os.path.basename(audio_input_path))[0]}_output.{export_format.lower()}", + ) + if os.path.exists(audio_output_paths): + continue + print(f"Converting audio '{audio_input_path}'...") + audio_input_path = os.path.join(audio_input_paths, audio_input_path) + + audio = load_audio_infer( + audio_input_path, + 16000, + ) + audio_max = np.abs(audio).max() / 0.95 + + if audio_max > 1: + audio /= audio_max + + if self.tgt_sr != resample_sr >= 16000: + self.tgt_sr = resample_sr + + if split_audio: + result, new_dir_path = process_audio(audio_input_path) + if result == "Error": + return "Error with Split Audio", None + + dir_path = ( + new_dir_path.strip().strip('"').strip("\n").strip('"').strip() + ) + if dir_path: + paths = [ + os.path.join(root, name) + for root, _, files in os.walk(dir_path, topdown=False) + for name in files + if name.endswith(".wav") and root == dir_path + ] + try: + for path in paths: + self.convert_audio( + audio_input_path=path, + audio_output_path=path, + model_path=model_path, + index_path=index_path, + sid=sid, + pitch=pitch, + f0_file=None, + f0_method=f0_method, + index_rate=index_rate, + resample_sr=resample_sr, + volume_envelope=volume_envelope, + protect=protect, + hop_length=hop_length, + split_audio=False, + f0_autotune=f0_autotune, + filter_radius=filter_radius, + export_format=export_format, + embedder_model=embedder_model, + 
embedder_model_custom=embedder_model_custom, + ) + except Exception as error: + print( + f"An error occurred processing the segmented audio: {error}" + ) + print(traceback.format_exc()) + return f"Error {error}" + print("Finished processing segmented audio, now merging audio...") + merge_timestamps_file = os.path.join( + os.path.dirname(new_dir_path), + f"{os.path.basename(audio_input_path).split('.')[0]}_timestamps.txt", + ) + self.tgt_sr, audio_opt = merge_audio(merge_timestamps_file) + os.remove(merge_timestamps_file) + else: + audio_opt = self.vc.pipeline( + model=self.hubert_model, + net_g=self.net_g, + sid=sid, + audio=audio, + input_audio_path=audio_input_path, + pitch=pitch, + f0_method=f0_method, + file_index=file_index, + index_rate=index_rate, + pitch_guidance=self.use_f0, + filter_radius=filter_radius, + tgt_sr=self.tgt_sr, + resample_sr=resample_sr, + volume_envelope=volume_envelope, + version=self.version, + protect=protect, + hop_length=hop_length, + f0_autotune=f0_autotune, + f0_file=f0_file, + ) + + if audio_output_paths: + sf.write(audio_output_paths, audio_opt, self.tgt_sr, format="WAV") + output_path_format = audio_output_paths.replace( + ".wav", f".{export_format.lower()}" + ) + audio_output_paths = self.convert_audio_format( + audio_output_paths, output_path_format, export_format + ) + print(f"Conversion completed at '{audio_output_paths}'.") + elapsed_time = time.time() - start_time + print(f"Batch conversion completed in {elapsed_time:.2f} seconds.") + os.remove(pid_file_path) + except Exception as error: + print(f"An error occurred during audio conversion: {error}") + print(traceback.format_exc()) + + def get_vc(self, weight_root, sid): + """ + Loads the voice conversion model and sets up the pipeline. + """ + if sid == "" or sid == []: + self.cleanup_model() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + self.load_model(weight_root) + + if self.cpt is not None: + self.setup_network() + self.setup_vc_instance() + + def cleanup_model(self): + """ + Cleans up the model and releases resources. + """ + if self.hubert_model is not None: + del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr + self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + del self.net_g, self.cpt + if torch.cuda.is_available(): + torch.cuda.empty_cache() + self.cpt = None + + def load_model(self, weight_root): + """ + Loads the model weights from the specified path. + """ + self.cpt = ( + torch.load(weight_root, map_location="cpu") + if os.path.isfile(weight_root) + else None + ) + + def setup_network(self): + """ + Sets up the network configuration based on the loaded checkpoint. + """ + if self.cpt is not None: + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] + self.use_f0 = self.cpt.get("f0", 1) + + self.version = self.cpt.get("version", "v1") + self.text_enc_hidden_dim = 768 if self.version == "v2" else 256 + self.net_g = Synthesizer( + *self.cpt["config"], + use_f0=self.use_f0, + text_enc_hidden_dim=self.text_enc_hidden_dim, + is_half=self.config.is_half, + ) + del self.net_g.enc_q + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + self.net_g = ( + self.net_g.half() if self.config.is_half else self.net_g.float() + ) + + def setup_vc_instance(self): + """ + Sets up the voice conversion pipeline instance based on the target sampling rate and configuration. 
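+
+        Expects load_model/setup_network to have populated self.cpt and
+        self.tgt_sr; also records the speaker count from the checkpoint config.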
+ """ + if self.cpt is not None: + self.vc = VC(self.tgt_sr, self.config) + self.n_spk = self.cpt["config"][-3] diff --git a/programs/applio_code/rvc/infer/pipeline.py b/programs/applio_code/rvc/infer/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..e29cbfd2dcc816f5ccddfc2be115bbb65433bc23 --- /dev/null +++ b/programs/applio_code/rvc/infer/pipeline.py @@ -0,0 +1,701 @@ +import os +import gc +import re +import sys +import torch +import torch.nn.functional as F +import torchcrepe +import faiss +import librosa +import numpy as np +from scipy import signal +from torch import Tensor + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor +from programs.applio_code.rvc.lib.predictors.FCPE import FCPEF0Predictor + +import logging + +logging.getLogger("faiss").setLevel(logging.WARNING) + +# Constants for high-pass filter +FILTER_ORDER = 5 +CUTOFF_FREQUENCY = 48 # Hz +SAMPLE_RATE = 16000 # Hz +bh, ah = signal.butter( + N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE +) + +input_audio_path2wav = {} + + +class AudioProcessor: + """ + A class for processing audio signals, specifically for adjusting RMS levels. + """ + + def change_rms( + source_audio: np.ndarray, + source_rate: int, + target_audio: np.ndarray, + target_rate: int, + rate: float, + ) -> np.ndarray: + """ + Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate. + + Args: + source_audio: The source audio signal as a NumPy array. + source_rate: The sampling rate of the source audio. + target_audio: The target audio signal to adjust. + target_rate: The sampling rate of the target audio. + rate: The blending rate between the source and target RMS levels. + """ + # Calculate RMS of both audio data + rms1 = librosa.feature.rms( + y=source_audio, + frame_length=source_rate // 2 * 2, + hop_length=source_rate // 2, + ) + rms2 = librosa.feature.rms( + y=target_audio, + frame_length=target_rate // 2 * 2, + hop_length=target_rate // 2, + ) + + # Interpolate RMS to match target audio length + rms1 = F.interpolate( + torch.from_numpy(rms1).float().unsqueeze(0), + size=target_audio.shape[0], + mode="linear", + ).squeeze() + rms2 = F.interpolate( + torch.from_numpy(rms2).float().unsqueeze(0), + size=target_audio.shape[0], + mode="linear", + ).squeeze() + rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6) + + # Adjust target audio RMS based on the source audio RMS + adjusted_audio = ( + target_audio + * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy() + ) + return adjusted_audio + + +class Autotune: + """ + A class for applying autotune to a given fundamental frequency (F0) contour. + """ + + def __init__(self, ref_freqs): + """ + Initializes the Autotune class with a set of reference frequencies. + + Args: + ref_freqs: A list of reference frequencies representing musical notes. + """ + self.ref_freqs = ref_freqs + self.note_dict = self.generate_interpolated_frequencies() + + def generate_interpolated_frequencies(self): + """ + Generates a dictionary of interpolated frequencies between reference frequencies. 
+ """ + note_dict = [] + for i in range(len(self.ref_freqs) - 1): + freq_low = self.ref_freqs[i] + freq_high = self.ref_freqs[i + 1] + interpolated_freqs = np.linspace( + freq_low, freq_high, num=10, endpoint=False + ) + note_dict.extend(interpolated_freqs) + note_dict.append(self.ref_freqs[-1]) + return note_dict + + def autotune_f0(self, f0): + """ + Autotunes a given F0 contour by snapping each frequency to the closest reference frequency. + + Args: + f0: The input F0 contour as a NumPy array. + """ + autotuned_f0 = np.zeros_like(f0) + for i, freq in enumerate(f0): + closest_note = min(self.note_dict, key=lambda x: abs(x - freq)) + autotuned_f0[i] = closest_note + return autotuned_f0 + + +class Pipeline: + """ + The main pipeline class for performing voice conversion, including preprocessing, F0 estimation, + voice conversion using a model, and post-processing. + """ + + def __init__(self, tgt_sr, config): + """ + Initializes the Pipeline class with target sampling rate and configuration parameters. + + Args: + tgt_sr: The target sampling rate for the output audio. + config: A configuration object containing various parameters for the pipeline. + """ + self.x_pad = config.x_pad + self.x_query = config.x_query + self.x_center = config.x_center + self.x_max = config.x_max + self.is_half = config.is_half + self.sample_rate = 16000 + self.window = 160 + self.t_pad = self.sample_rate * self.x_pad + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sample_rate * self.x_query + self.t_center = self.sample_rate * self.x_center + self.t_max = self.sample_rate * self.x_max + self.time_step = self.window / self.sample_rate * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.device = config.device + self.ref_freqs = [ + 65.41, + 82.41, + 110.00, + 146.83, + 196.00, + 246.94, + 329.63, + 440.00, + 587.33, + 783.99, + 1046.50, + ] + self.autotune = Autotune(self.ref_freqs) + self.note_dict = self.autotune.note_dict + + def get_f0_crepe( + self, + x, + f0_min, + f0_max, + p_len, + hop_length, + model="full", + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model. + + Args: + x: The input audio signal as a NumPy array. + f0_min: Minimum F0 value to consider. + f0_max: Maximum F0 value to consider. + p_len: Desired length of the F0 output. + hop_length: Hop length for the Crepe model. + model: Crepe model size to use ("full" or "tiny"). + """ + x = x.astype(np.float32) + x /= np.quantile(np.abs(x), 0.999) + audio = torch.from_numpy(x).to(self.device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + audio = audio.detach() + pitch: Tensor = torchcrepe.predict( + audio, + self.sample_rate, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=self.device, + pad=True, + ) + p_len = p_len or x.shape[0] // hop_length + source = np.array(pitch.squeeze(0).cpu().float().numpy()) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * p_len, len(source)) / p_len, + np.arange(0, len(source)), + source, + ) + f0 = np.nan_to_num(target) + return f0 + + def get_f0_hybrid( + self, + methods_str, + x, + f0_min, + f0_max, + p_len, + hop_length, + ): + """ + Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods. 
+ + Args: + methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]"). + x: The input audio signal as a NumPy array. + f0_min: Minimum F0 value to consider. + f0_max: Maximum F0 value to consider. + p_len: Desired length of the F0 output. + hop_length: Hop length for F0 estimation methods. + """ + methods_str = re.search("hybrid\[(.+)\]", methods_str) + if methods_str: + methods = [method.strip() for method in methods_str.group(1).split("+")] + f0_computation_stack = [] + print(f"Calculating f0 pitch estimations for methods {str(methods)}") + x = x.astype(np.float32) + x /= np.quantile(np.abs(x), 0.999) + for method in methods: + f0 = None + if method == "crepe": + f0 = self.get_f0_crepe_computation( + x, f0_min, f0_max, p_len, int(hop_length) + ) + elif method == "rmvpe": + self.model_rmvpe = RMVPE0Predictor( + os.path.join( + "programs", + "applio_code", + "rvc", + "models", + "predictors", + "rmvpe.pt", + ), + is_half=self.is_half, + device=self.device, + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 = f0[1:] + elif method == "fcpe": + self.model_fcpe = FCPEF0Predictor( + os.path.join( + "programs", + "applio_code", + "rvc", + "models", + "predictors", + "fcpe.pt", + ), + f0_min=int(f0_min), + f0_max=int(f0_max), + dtype=torch.float32, + device=self.device, + sample_rate=self.sample_rate, + threshold=0.03, + ) + f0 = self.model_fcpe.compute_f0(x, p_len=p_len) + del self.model_fcpe + gc.collect() + f0_computation_stack.append(f0) + + f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None] + f0_median_hybrid = None + if len(f0_computation_stack) == 1: + f0_median_hybrid = f0_computation_stack[0] + else: + f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0) + return f0_median_hybrid + + def get_f0( + self, + input_audio_path, + x, + p_len, + pitch, + f0_method, + filter_radius, + hop_length, + f0_autotune, + inp_f0=None, + ): + """ + Estimates the fundamental frequency (F0) of a given audio signal using various methods. + + Args: + input_audio_path: Path to the input audio file. + x: The input audio signal as a NumPy array. + p_len: Desired length of the F0 output. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation (e.g., "crepe"). + filter_radius: Radius for median filtering the F0 contour. + hop_length: Hop length for F0 estimation methods. + f0_autotune: Whether to apply autotune to the F0 contour. + inp_f0: Optional input F0 contour to use instead of estimating. 
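+
+        Supported f0_method values include "crepe", "crepe-tiny", "rmvpe",
+        "fcpe" and hybrid combinations such as "hybrid[rmvpe+fcpe]".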
+ """ + global input_audio_path2wav + if f0_method == "crepe": + f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length)) + elif f0_method == "crepe-tiny": + f0 = self.get_f0_crepe( + x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny" + ) + elif f0_method == "rmvpe": + self.model_rmvpe = RMVPE0Predictor( + os.path.join( + "programs", "applio_code", "rvc", "models", "predictors", "rmvpe.pt" + ), + is_half=self.is_half, + device=self.device, + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + elif f0_method == "fcpe": + self.model_fcpe = FCPEF0Predictor( + os.path.join( + "programs", "applio_code", "rvc", "models", "predictors", "fcpe.pt" + ), + f0_min=int(self.f0_min), + f0_max=int(self.f0_max), + dtype=torch.float32, + device=self.device, + sample_rate=self.sample_rate, + threshold=0.03, + ) + f0 = self.model_fcpe.compute_f0(x, p_len=p_len) + del self.model_fcpe + gc.collect() + elif "hybrid" in f0_method: + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = self.get_f0_hybrid( + f0_method, + x, + self.f0_min, + self.f0_max, + p_len, + hop_length, + ) + + if f0_autotune == "True": + f0 = Autotune.autotune_f0(self, f0) + + f0 *= pow(2, pitch / 12) + tf0 = self.sample_rate // self.window + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / ( + self.f0_mel_max - self.f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + + return f0_coarse, f0bak + + def voice_conversion( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + index, + big_npy, + index_rate, + version, + protect, + ): + """ + Performs voice conversion on a given audio segment. + + Args: + model: The feature extractor model. + net_g: The generative model for synthesizing speech. + sid: Speaker ID for the target voice. + audio0: The input audio segment. + pitch: Quantized F0 contour for pitch guidance. + pitchf: Original F0 contour for pitch guidance. + index: FAISS index for speaker embedding retrieval. + big_npy: Speaker embeddings stored in a NumPy array. + index_rate: Blending rate for speaker embedding retrieval. + version: Model version ("v1" or "v2"). + protect: Protection level for preserving the original pitch. 
+ """ + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + with torch.no_grad(): + feats = model(feats.to(self.device))["last_hidden_state"] + feats = ( + model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats + ) + if protect < 0.5 and pitch != None and pitchf != None: + feats0 = feats.clone() + if ( + isinstance(index, type(None)) == False + and isinstance(big_npy, type(None)) == False + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch != None and pitchf != None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch != None and pitchf != None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch != None and pitchf != None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch != None and pitchf != None: + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + else: + audio1 = ( + (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() + ) + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + pitch, + f0_method, + file_index, + index_rate, + pitch_guidance, + filter_radius, + tgt_sr, + resample_sr, + volume_envelope, + version, + protect, + hop_length, + f0_autotune, + f0_file, + ): + """ + The main pipeline function for performing voice conversion. + + Args: + model: The feature extractor model. + net_g: The generative model for synthesizing speech. + sid: Speaker ID for the target voice. + audio: The input audio signal. + input_audio_path: Path to the input audio file. + pitch: Key to adjust the pitch of the F0 contour. + f0_method: Method to use for F0 estimation. + file_index: Path to the FAISS index file for speaker embedding retrieval. + index_rate: Blending rate for speaker embedding retrieval. + pitch_guidance: Whether to use pitch guidance during voice conversion. + filter_radius: Radius for median filtering the F0 contour. + tgt_sr: Target sampling rate for the output audio. + resample_sr: Resampling rate for the output audio. + volume_envelope: Blending rate for adjusting the RMS level of the output audio. + version: Model version. + protect: Protection level for preserving the original pitch. + hop_length: Hop length for F0 estimation methods. 
+ f0_autotune: Whether to apply autotune to the F0 contour. + f0_file: Path to a file containing an F0 contour to use. + """ + if file_index != "" and os.path.exists(file_index) == True and index_rate != 0: + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + print(f"An error occurred reading the FAISS index: {error}") + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name") == True: + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except Exception as error: + print(f"An error occurred reading the F0 file: {error}") + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + if pitch_guidance == True: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + pitch, + f0_method, + filter_radius, + hop_length, + f0_autotune, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + for t in opt_ts: + t = t // self.window * self.window + if pitch_guidance == True: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if pitch_guidance == True: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.voice_conversion( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if volume_envelope != 1: + audio_opt = AudioProcessor.change_rms( + audio, self.sample_rate, audio_opt, tgt_sr, volume_envelope + ) + if resample_sr >= self.sample_rate and tgt_sr != resample_sr: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, 
target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/programs/applio_code/rvc/lib/algorithm/__init__.py b/programs/applio_code/rvc/lib/algorithm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/programs/applio_code/rvc/lib/algorithm/attentions.py b/programs/applio_code/rvc/lib/algorithm/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..1a1c994720c1369d81d69aa54758ace8e8ddccc5 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/attentions.py @@ -0,0 +1,292 @@ +import math +import torch + +from programs.applio_code.rvc.lib.algorithm.commons import convert_pad_shape + + +class MultiHeadAttention(torch.nn.Module): + """ + Multi-head attention module with optional relative positional encoding and proximal bias. + + Args: + channels (int): Number of input channels. + out_channels (int): Number of output channels. + n_heads (int): Number of attention heads. + p_dropout (float, optional): Dropout probability. Defaults to 0.0. + window_size (int, optional): Window size for relative positional encoding. Defaults to None. + heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True. + block_length (int, optional): Block length for local attention. Defaults to None. + proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False. + proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False. 
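+
+    Inputs are expected as [batch, channels, time]; the attention weights from
+    the most recent forward pass are kept on self.attn.
+
+    Example (illustrative sizes):
+        attn = MultiHeadAttention(192, 192, n_heads=2, window_size=10)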
+ """ + + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = torch.nn.Conv1d(channels, channels, 1) + self.conv_k = torch.nn.Conv1d(channels, channels, 1) + self.conv_v = torch.nn.Conv1d(channels, channels, 1) + self.conv_o = torch.nn.Conv1d(channels, out_channels, 1) + self.drop = torch.nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = torch.nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = torch.nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + torch.nn.init.xavier_uniform_(self.conv_q.weight) + torch.nn.init.xavier_uniform_(self.conv_k.weight) + torch.nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." 
+ block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = torch.nn.functional.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = torch.nn.functional.pad( + relative_embeddings, + convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + + # Concat columns of pad to shift from relative to absolute indexing. + x = torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]) + ) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = torch.nn.functional.pad( + x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = torch.nn.functional.pad( + x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = torch.nn.functional.pad( + x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]) + ) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(torch.nn.Module): + """ + Feed-forward network module. + + Args: + in_channels (int): Number of input channels. 
+ out_channels (int): Number of output channels. + filter_channels (int): Number of filter channels in the convolution layers. + kernel_size (int): Kernel size of the convolution layers. + p_dropout (float, optional): Dropout probability. Defaults to 0.0. + activation (str, optional): Activation function to use. Defaults to None. + causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False. + """ + + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = torch.nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = torch.nn.functional.pad(x, convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = torch.nn.functional.pad(x, convert_pad_shape(padding)) + return x diff --git a/programs/applio_code/rvc/lib/algorithm/commons.py b/programs/applio_code/rvc/lib/algorithm/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..c76328c9188d48b296a297c0599a4d825dc9150f --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/commons.py @@ -0,0 +1,225 @@ +import math +import torch +from typing import List, Optional + + +def init_weights(m, mean=0.0, std=0.01): + """ + Initialize the weights of a module. + + Args: + m: The module to initialize. + mean: The mean of the normal distribution. + std: The standard deviation of the normal distribution. + """ + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + """ + Calculate the padding needed for a convolution. + + Args: + kernel_size: The size of the kernel. + dilation: The dilation of the convolution. + """ + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + """ + Convert the pad shape to a list of integers. + + Args: + pad_shape: The pad shape.. + """ + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """ + Calculate the KL divergence between two distributions. + + Args: + m_p: The mean of the first distribution. + logs_p: The log of the standard deviation of the first distribution. + m_q: The mean of the second distribution. + logs_q: The log of the standard deviation of the second distribution. 
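+
+    Uses the closed form for diagonal Gaussians:
+        KL(P || Q) = log(sigma_q / sigma_p) - 1/2
+                     + (sigma_p^2 + (m_p - m_q)^2) / (2 * sigma_q^2)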
+ """ + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def slice_segments( + x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2 +): + """ + Slice segments from a tensor, handling tensors with different numbers of dimensions. + + Args: + x (torch.Tensor): The tensor to slice. + ids_str (torch.Tensor): The starting indices of the segments. + segment_size (int, optional): The size of each segment. Defaults to 4. + dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2. + """ + if dim == 2: + ret = torch.zeros_like(x[:, :segment_size]) + elif dim == 3: + ret = torch.zeros_like(x[:, :, :segment_size]) + + for i in range(x.size(0)): + idx_str = ids_str[i].item() + idx_end = idx_str + segment_size + if dim == 2: + ret[i] = x[i, idx_str:idx_end] + else: + ret[i] = x[i, :, idx_str:idx_end] + + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + """ + Randomly slice segments from a tensor. + + Args: + x: The tensor to slice. + x_lengths: The lengths of the sequences. + segment_size: The size of each segment. + """ + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size, dim=3) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + """ + Generate a 1D timing signal. + + Args: + length: The length of the signal. + channels: The number of channels of the signal. + min_timescale: The minimum timescale. + max_timescale: The maximum timescale. + """ + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def subsequent_mask(length): + """ + Generate a subsequent mask. + + Args: + length: The length of the sequence. + """ + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + """ + Fused add tanh sigmoid multiply operation. + + Args: + input_a: The first input tensor. + input_b: The second input tensor. + n_channels: The number of channels. + """ + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +# Zluda, same as previous, but without jit.script +def fused_add_tanh_sigmoid_multiply_no_jit(input_a, input_b, n_channels): + """ + Fused add tanh sigmoid multiply operation. + + Args: + input_a: The first input tensor. + input_b: The second input tensor. + n_channels: The number of channels. 
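Editorial note (not part of the patch): the gated activation used by the WaveNet blocks splits the summed pre-activations into a tanh half and a sigmoid half. A minimal standalone sketch of the same math, re-implemented inline rather than imported from the package, with toy shapes assumed:

```python
import torch

def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Same arithmetic as the helper above: first n channels go through tanh
    # (the "filter"), the remaining n channels through sigmoid (the "gate").
    n = n_channels[0]
    in_act = input_a + input_b
    return torch.tanh(in_act[:, :n, :]) * torch.sigmoid(in_act[:, n:, :])

hidden = 4
x = torch.randn(2, 2 * hidden, 10)   # conv output: filter + gate channels stacked
g = torch.randn(2, 2 * hidden, 10)   # conditioning projected to the same shape
acts = fused_add_tanh_sigmoid_multiply(x, g, torch.IntTensor([hidden]))
print(acts.shape)  # torch.Size([2, 4, 10]) -- half of the input channels remain
```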
+ """ + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]: + """ + Convert the pad shape to a list of integers. + + Args: + pad_shape: The pad shape. + """ + return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist() + + +def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None): + """ + Generate a sequence mask. + + Args: + length: The lengths of the sequences. + max_length: The maximum length of the sequences. + """ + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def clip_grad_value(parameters, clip_value, norm_type=2): + """ + Clip the gradients of a list of parameters. + + Args: + parameters: The list of parameters to clip. + clip_value: The maximum value of the gradients. + norm_type: The type of norm to use for clipping. + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/programs/applio_code/rvc/lib/algorithm/discriminators.py b/programs/applio_code/rvc/lib/algorithm/discriminators.py new file mode 100644 index 0000000000000000000000000000000000000000..e9cee7bf80f192bb7e09b04f5c465037c26850b0 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/discriminators.py @@ -0,0 +1,199 @@ +import torch +from torch.nn.utils.parametrizations import spectral_norm, weight_norm + +from programs.applio_code.rvc.lib.algorithm.commons import get_padding +from programs.applio_code.rvc.lib.algorithm.residuals import LRELU_SLOPE + + +class MultiPeriodDiscriminator(torch.nn.Module): + """ + Multi-period discriminator. + + This class implements a multi-period discriminator, which is used to + discriminate between real and fake audio signals. The discriminator + is composed of a series of convolutional layers that are applied to + the input signal at different periods. + + Args: + use_spectral_norm (bool): Whether to use spectral normalization. + Defaults to False. + """ + + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + self.discriminators = torch.nn.ModuleList( + [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods] + ) + + def forward(self, y, y_hat): + """ + Forward pass of the multi-period discriminator. + + Args: + y (torch.Tensor): Real audio signal. + y_hat (torch.Tensor): Fake audio signal. + """ + y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] + for d in self.discriminators: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + """ + Multi-period discriminator V2. 
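Editorial note (not part of the patch): `sequence_mask` produces a boolean padding mask from per-example lengths. A small self-contained illustration, re-implemented inline so it runs on its own:

```python
import torch
from typing import Optional

def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
    # True for positions < length[i], False for padded positions.
    if max_length is None:
        max_length = int(length.max())
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)

lengths = torch.tensor([2, 5, 3])
print(sequence_mask(lengths).int())
# tensor([[1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1],
#         [1, 1, 1, 0, 0]], dtype=torch.int32)
```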
+ + This class implements a multi-period discriminator V2, which is used + to discriminate between real and fake audio signals. The discriminator + is composed of a series of convolutional layers that are applied to + the input signal at different periods. + + Args: + use_spectral_norm (bool): Whether to use spectral normalization. + Defaults to False. + """ + + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + periods = [2, 3, 5, 7, 11, 17, 23, 37] + self.discriminators = torch.nn.ModuleList( + [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods] + ) + + def forward(self, y, y_hat): + """ + Forward pass of the multi-period discriminator V2. + + Args: + y (torch.Tensor): Real audio signal. + y_hat (torch.Tensor): Fake audio signal. + """ + y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], [] + for d in self.discriminators: + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + """ + Discriminator for the short-term component. + + This class implements a discriminator for the short-term component + of the audio signal. The discriminator is composed of a series of + convolutional layers that are applied to the input signal. + """ + + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = spectral_norm if use_spectral_norm else weight_norm + self.convs = torch.nn.ModuleList( + [ + norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)), + norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1)) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + + def forward(self, x): + """ + Forward pass of the discriminator. + + Args: + x (torch.Tensor): Input audio signal. + """ + fmap = [] + for conv in self.convs: + x = self.lrelu(conv(x)) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + """ + Discriminator for the long-term component. + + This class implements a discriminator for the long-term component + of the audio signal. The discriminator is composed of a series of + convolutional layers that are applied to the input signal at a given + period. + + Args: + period (int): Period of the discriminator. + kernel_size (int): Kernel size of the convolutional layers. + Defaults to 5. + stride (int): Stride of the convolutional layers. Defaults to 3. + use_spectral_norm (bool): Whether to use spectral normalization. + Defaults to False. 
+ """ + + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + norm_f = spectral_norm if use_spectral_norm else weight_norm + + in_channels = [1, 32, 128, 512, 1024] + out_channels = [32, 128, 512, 1024, 1024] + + self.convs = torch.nn.ModuleList( + [ + norm_f( + torch.nn.Conv2d( + in_ch, + out_ch, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ) + for in_ch, out_ch in zip(in_channels, out_channels) + ] + ) + + self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE) + + def forward(self, x): + """ + Forward pass of the discriminator. + + Args: + x (torch.Tensor): Input audio signal. + """ + fmap = [] + b, c, t = x.shape + if t % self.period != 0: + n_pad = self.period - (t % self.period) + x = torch.nn.functional.pad(x, (0, n_pad), "reflect") + x = x.view(b, c, -1, self.period) + + for conv in self.convs: + x = self.lrelu(conv(x)) + fmap.append(x) + + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + return x, fmap diff --git a/programs/applio_code/rvc/lib/algorithm/encoders.py b/programs/applio_code/rvc/lib/algorithm/encoders.py new file mode 100644 index 0000000000000000000000000000000000000000..ecf6b6b480e343abdc83b1cd27fd3c0d6c4d7993 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/encoders.py @@ -0,0 +1,219 @@ +import math +import torch +from typing import Optional + +from programs.applio_code.rvc.lib.algorithm.commons import sequence_mask +from programs.applio_code.rvc.lib.algorithm.modules import WaveNet +from programs.applio_code.rvc.lib.algorithm.normalization import LayerNorm +from programs.applio_code.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention + + +class Encoder(torch.nn.Module): + """ + Encoder module for the Transformer model. + + Args: + hidden_channels (int): Number of hidden channels in the encoder. + filter_channels (int): Number of filter channels in the feed-forward network. + n_heads (int): Number of attention heads. + n_layers (int): Number of encoder layers. + kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1. + p_dropout (float, optional): Dropout probability. Defaults to 0.0. + window_size (int, optional): Window size for relative positional encoding. Defaults to 10. 
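Editorial note (not part of the patch): `DiscriminatorP.forward` reflect-pads the waveform to a multiple of the period and folds time into a 2-D grid so each column corresponds to one phase of the period. A tiny sketch of that reshape on a dummy signal (values invented):

```python
import torch
import torch.nn.functional as F

period = 3
x = torch.arange(10, dtype=torch.float32).view(1, 1, 10)  # (batch, channels, time)

# Reflect-pad so the length becomes a multiple of the period, as in the forward pass.
t = x.shape[-1]
if t % period != 0:
    n_pad = period - (t % period)
    x = F.pad(x, (0, n_pad), "reflect")

# Fold time into (time // period, period); each row is one consecutive period-sized window.
x2d = x.view(1, 1, -1, period)
print(x2d.shape)   # torch.Size([1, 1, 4, 3])
print(x2d[0, 0])
```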
+ """ + + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = torch.nn.Dropout(p_dropout) + self.attn_layers = torch.nn.ModuleList() + self.norm_layers_1 = torch.nn.ModuleList() + self.ffn_layers = torch.nn.ModuleList() + self.norm_layers_2 = torch.nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class TextEncoder(torch.nn.Module): + """Text Encoder with configurable embedding dimension. + + Args: + out_channels (int): Output channels of the encoder. + hidden_channels (int): Hidden channels of the encoder. + filter_channels (int): Filter channels of the encoder. + n_heads (int): Number of attention heads. + n_layers (int): Number of encoder layers. + kernel_size (int): Kernel size of the convolutional layers. + p_dropout (float): Dropout probability. + embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768). + f0 (bool, optional): Whether to use F0 embedding. Defaults to True. + """ + + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + embedding_dim, + f0=True, + ): + super(TextEncoder, self).__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels) + self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True) + if f0: + self.emb_pitch = torch.nn.Embedding(256, hidden_channels) + self.encoder = Encoder( + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + ) + self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor + ): + if pitch is None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class PosteriorEncoder(torch.nn.Module): + """Posterior Encoder for inferring latent representation. 
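Editorial note (not part of the patch): `Encoder.forward` builds its square self-attention mask from the 1-D padding mask with an outer product. Here is that single line on a toy mask (sizes invented):

```python
import torch

x_mask = torch.tensor([[[1., 1., 1., 0.]]])          # (batch, 1, time), last frame is padding
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
print(attn_mask.shape)   # torch.Size([1, 1, 4, 4])
print(attn_mask[0, 0])   # 1 only where both query and key positions are real frames
```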
+ + Args: + in_channels (int): Number of channels in the input. + out_channels (int): Number of channels in the output. + hidden_channels (int): Number of hidden channels in the encoder. + kernel_size (int): Kernel size of the convolutional layers. + dilation_rate (int): Dilation rate of the convolutional layers. + n_layers (int): Number of layers in the encoder. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. + """ + + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super(PosteriorEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = WaveNet( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward( + self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None + ): + x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + """Removes weight normalization from the encoder.""" + self.enc.remove_weight_norm() + + def __prepare_scriptable__(self): + """Prepares the module for scripting.""" + for hook in self.enc._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.enc) + return self diff --git a/programs/applio_code/rvc/lib/algorithm/generators.py b/programs/applio_code/rvc/lib/algorithm/generators.py new file mode 100644 index 0000000000000000000000000000000000000000..4c8d6d3024ba23186fd7826054e3429d9a4b2637 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/generators.py @@ -0,0 +1,199 @@ +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from typing import Optional + +from programs.applio_code.rvc.lib.algorithm.residuals import ( + LRELU_SLOPE, + ResBlock1, + ResBlock2, +) +from programs.applio_code.rvc.lib.algorithm.commons import init_weights + + +class Generator(torch.nn.Module): + """Generator for synthesizing audio. Optimized for performance and quality. + + Args: + initial_channel (int): Number of channels in the initial convolutional layer. + resblock (str): Type of residual block to use (1 or 2). + resblock_kernel_sizes (list): Kernel sizes of the residual blocks. + resblock_dilation_sizes (list): Dilation rates of the residual blocks. + upsample_rates (list): Upsampling rates. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. 
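Editorial note (not part of the patch): the sampling line in `PosteriorEncoder.forward` is the standard reparameterization trick, z ~ N(m, exp(logs)^2). A minimal standalone sketch of that step with arbitrary toy shapes:

```python
import torch

torch.manual_seed(0)

m = torch.zeros(1, 4, 8)            # predicted means, shape (batch, channels, time)
logs = torch.full((1, 4, 8), -1.0)  # predicted log standard deviations
x_mask = torch.ones(1, 1, 8)        # all frames valid in this toy example

# Draw z via the reparameterization trick, then mask padded frames.
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
print(z.shape)         # torch.Size([1, 4, 8])
print(z.std().item())  # roughly exp(-1) ~= 0.37 for this toy input
```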
+ """ + + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = torch.nn.Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = ResBlock1 if resblock == "1" else ResBlock2 + + self.ups_and_resblocks = torch.nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups_and_resblocks.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.ups_and_resblocks.append(resblock(ch, k, d)) + + self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups_and_resblocks.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + resblock_idx = 0 + for _ in range(self.num_upsamples): + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + x = self.ups_and_resblocks[resblock_idx](x) + resblock_idx += 1 + xs = 0 + for _ in range(self.num_kernels): + xs += self.ups_and_resblocks[resblock_idx](x) + resblock_idx += 1 + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def __prepare_scriptable__(self): + """Prepares the module for scripting.""" + for l in self.ups_and_resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(l) + return self + + def remove_weight_norm(self): + """Removes weight normalization from the upsampling and residual blocks.""" + for l in self.ups_and_resblocks: + remove_weight_norm(l) + + +class SineGen(torch.nn.Module): + """Sine wave generator. + + Args: + samp_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonic overtones. Defaults to 0. + sine_amp (float, optional): Amplitude of sine waveform. Defaults to 0.1. + noise_std (float, optional): Standard deviation of Gaussian noise. Defaults to 0.003. + voiced_threshold (float, optional): F0 threshold for voiced/unvoiced classification. Defaults to 0. + flag_for_pulse (bool, optional): Whether this SineGen is used inside PulseGen. Defaults to False. + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sample_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + """Converts F0 to voiced/unvoiced signal. + + Args: + f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1).. + """ + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0: torch.Tensor, upp: int): + """Generates sine waves. 
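Editorial note (not part of the patch): `SineGen._f02uv` derives a voiced/unvoiced mask by thresholding F0. A quick illustration with made-up F0 values (0 Hz marks unvoiced frames):

```python
import torch

voiced_threshold = 0.0
f0 = torch.tensor([[[0.0], [110.0], [220.0], [0.0]]])  # (batch, length, 1)

uv = torch.ones_like(f0) * (f0 > voiced_threshold)
print(uv.squeeze(-1))  # tensor([[0., 1., 1., 0.]])
```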
+ + Args: + f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1). + upp (int): Upsampling factor. + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + f0_buf[:, :, 0] = f0[:, :, 0] + f0_buf[:, :, 1:] = ( + f0_buf[:, :, 0:1] + * torch.arange(2, self.harmonic_num + 2, device=f0.device)[ + None, None, : + ] + ) + rad_values = (f0_buf / float(self.sample_rate)) % 1 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) + tmp_over_one *= upp + tmp_over_one = torch.nn.functional.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=float(upp), + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = torch.nn.functional.interpolate( + rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest" + ).transpose(2, 1) + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = torch.nn.functional.interpolate( + uv.transpose(2, 1), scale_factor=float(upp), mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise diff --git a/programs/applio_code/rvc/lib/algorithm/modules.py b/programs/applio_code/rvc/lib/algorithm/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..5be0b0855d81a666ba573498207e03c5d17808b7 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/modules.py @@ -0,0 +1,130 @@ +import torch +from programs.applio_code.rvc.lib.algorithm.commons import ( + fused_add_tanh_sigmoid_multiply_no_jit, + fused_add_tanh_sigmoid_multiply, +) + + +class WaveNet(torch.nn.Module): + """WaveNet residual blocks as used in WaveGlow + + Args: + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + gin_channels (int, optional): Number of conditioning channels. Defaults to 0. + p_dropout (float, optional): Dropout probability. Defaults to 0. 
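Editorial note (not part of the patch): the core idea in `SineGen.forward` is to convert F0 into per-sample phase increments and integrate them with a cumulative sum before taking the sine. A much-simplified sketch of that idea (single constant F0, no harmonics, no upsampling, frame rate assumed equal to the sample rate):

```python
import torch

sample_rate = 16000
f0 = torch.full((1, 1600, 1), 220.0)            # constant 220 Hz

rad_values = (f0 / sample_rate) % 1             # phase increment per sample, in revolutions
phase = torch.cumsum(rad_values, dim=1)         # integrated phase
sine = 0.1 * torch.sin(2 * torch.pi * phase)    # sine_amp = 0.1, as in SineGen

print(sine.shape)        # torch.Size([1, 1600, 1])
print(sine.abs().max())  # ~0.1
```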
+ """ + + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WaveNet, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = torch.nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.parametrizations.weight_norm( + cond_layer, name="weight" + ) + + dilations = [dilation_rate**i for i in range(n_layers)] + paddings = [(kernel_size * d - d) // 2 for d in dilations] + + for i in range(n_layers): + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilations[i], + padding=paddings[i], + ) + in_layer = torch.nn.utils.parametrizations.weight_norm( + in_layer, name="weight" + ) + self.in_layers.append(in_layer) + + res_skip_channels = ( + hidden_channels if i == n_layers - 1 else 2 * hidden_channels + ) + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.parametrizations.weight_norm( + res_skip_layer, name="weight" + ) + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, hidden_channels, time_steps). + x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps). + g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps). + Defaults to None. 
+ """ + output = torch.zeros_like(x) + n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + # Zluda + is_zluda = x.device.type == "cuda" and torch.cuda.get_device_name().endswith( + "[ZLUDA]" + ) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + # Preventing HIP crash by not using jit-decorated function + if is_zluda: + acts = fused_add_tanh_sigmoid_multiply_no_jit( + x_in, g_l, n_channels_tensor + ) + else: + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + """Remove weight normalization from the module.""" + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) diff --git a/programs/applio_code/rvc/lib/algorithm/normalization.py b/programs/applio_code/rvc/lib/algorithm/normalization.py new file mode 100644 index 0000000000000000000000000000000000000000..878ec09de09b021bc9a2b92def21e07d42f34c75 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/normalization.py @@ -0,0 +1,31 @@ +import torch + + +class LayerNorm(torch.nn.Module): + """Layer normalization module. + + Args: + channels (int): Number of channels. + eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5. + """ + + def __init__(self, channels, eps=1e-5): + super().__init__() + self.eps = eps + self.gamma = torch.nn.Parameter(torch.ones(channels)) + self.beta = torch.nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). + + """ + # Transpose to (batch_size, time_steps, channels) for layer_norm + x = x.transpose(1, -1) + x = torch.nn.functional.layer_norm( + x, (x.size(-1),), self.gamma, self.beta, self.eps + ) + # Transpose back to (batch_size, channels, time_steps) + return x.transpose(1, -1) diff --git a/programs/applio_code/rvc/lib/algorithm/nsf.py b/programs/applio_code/rvc/lib/algorithm/nsf.py new file mode 100644 index 0000000000000000000000000000000000000000..8dfd6c17795b211faeaa4f2567adae2310566ccb --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/nsf.py @@ -0,0 +1,200 @@ +import math +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm +from typing import Optional + +from programs.applio_code.rvc.lib.algorithm.generators import SineGen +from programs.applio_code.rvc.lib.algorithm.residuals import ( + LRELU_SLOPE, + ResBlock1, + ResBlock2, +) +from programs.applio_code.rvc.lib.algorithm.commons import init_weights + + +class SourceModuleHnNSF(torch.nn.Module): + """ + Source Module for harmonic-plus-noise excitation. + + Args: + sample_rate (int): Sampling rate in Hz. + harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0. + sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1. 
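Editorial note (not part of the patch): the `LayerNorm` module above normalizes the channel dimension of a (batch, channels, time) tensor by transposing around `torch.nn.functional.layer_norm`. The sketch below checks that this matches the built-in `torch.nn.LayerNorm` applied on the channel-last view (toy sizes assumed, default unit gamma and zero beta):

```python
import torch

channels, eps = 8, 1e-5
x = torch.randn(2, channels, 16)   # (batch, channels, time)

gamma = torch.ones(channels)
beta = torch.zeros(channels)

# Channel-first layer norm via transpose, mirroring the module above.
y = torch.nn.functional.layer_norm(
    x.transpose(1, -1), (channels,), gamma, beta, eps
).transpose(1, -1)

# Reference: built-in LayerNorm on the channel-last view.
ref = torch.nn.LayerNorm(channels, eps=eps)(x.transpose(1, -1)).transpose(1, -1)
print(torch.allclose(y, ref, atol=1e-6))  # True
```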
+ add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003. + voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0. + is_half (bool, optional): Whether to use half precision. Defaults to True. + """ + + def __init__( + self, + sample_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + + self.l_sin_gen = SineGen( + sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x: torch.Tensor, upsample_factor: int = 1): + sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor) + sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype) + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None + + +class GeneratorNSF(torch.nn.Module): + """ + Generator for synthesizing audio using the NSF (Neural Source Filter) approach. + + Args: + initial_channel (int): Number of channels in the initial convolutional layer. + resblock (str): Type of residual block to use (1 or 2). + resblock_kernel_sizes (list): Kernel sizes of the residual blocks. + resblock_dilation_sizes (list): Dilation rates of the residual blocks. + upsample_rates (list): Upsampling rates. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes of the upsampling layers. + gin_channels (int): Number of channels for the global conditioning input. + sr (int): Sampling rate. + is_half (bool, optional): Whether to use half precision. Defaults to False. 
+ """ + + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sample_rate=sr, harmonic_num=0, is_half=is_half + ) + + self.conv_pre = torch.nn.Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock_cls = ResBlock1 if resblock == "1" else ResBlock2 + + self.ups = torch.nn.ModuleList() + self.noise_convs = torch.nn.ModuleList() + + channels = [ + upsample_initial_channel // (2 ** (i + 1)) + for i in range(len(upsample_rates)) + ] + stride_f0s = [ + math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1 + for i in range(len(upsample_rates)) + ] + + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + torch.nn.ConvTranspose1d( + upsample_initial_channel // (2**i), + channels[i], + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.noise_convs.append( + torch.nn.Conv1d( + 1, + channels[i], + kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1), + stride=stride_f0s[i], + padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0), + ) + ) + + self.resblocks = torch.nn.ModuleList( + [ + resblock_cls(channels[i], k, d) + for i in range(len(self.ups)) + for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes) + ] + ) + + self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = math.prod(upsample_rates) + self.lrelu_slope = LRELU_SLOPE + + def forward(self, x, f0, g: Optional[torch.Tensor] = None): + har_source, _, _ = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + + if g is not None: + x = x + self.cond(g) + + for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)): + x = torch.nn.functional.leaky_relu(x, self.lrelu_slope) + x = ups(x) + x = x + noise_convs(har_source) + + xs = sum( + [ + resblock(x) + for j, resblock in enumerate(self.resblocks) + if j in range(i * self.num_kernels, (i + 1) * self.num_kernels) + ] + ) + x = xs / self.num_kernels + + x = torch.nn.functional.leaky_relu(x) + x = torch.tanh(self.conv_post(x)) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + def __prepare_scriptable__(self): + for l in self.ups: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + for l in self.resblocks: + for hook in l._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + remove_weight_norm(l) + return self diff --git a/programs/applio_code/rvc/lib/algorithm/residuals.py b/programs/applio_code/rvc/lib/algorithm/residuals.py new file mode 100644 index 0000000000000000000000000000000000000000..23445a8f3de3faa51f29d288553bab053247a4e5 --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/residuals.py @@ -0,0 +1,309 
@@ +from typing import Optional +import torch +from torch.nn.utils import remove_weight_norm +from torch.nn.utils.parametrizations import weight_norm + +from programs.applio_code.rvc.lib.algorithm.modules import WaveNet +from programs.applio_code.rvc.lib.algorithm.commons import get_padding, init_weights + +LRELU_SLOPE = 0.1 + + +# Helper functions +def create_conv1d_layer(channels, kernel_size, dilation): + return weight_norm( + torch.nn.Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation, + padding=get_padding(kernel_size, dilation), + ) + ) + + +def apply_mask(tensor, mask): + return tensor * mask if mask is not None else tensor + + +class ResBlockBase(torch.nn.Module): + def __init__(self, channels, kernel_size, dilations): + super(ResBlockBase, self).__init__() + self.convs1 = torch.nn.ModuleList( + [create_conv1d_layer(channels, kernel_size, d) for d in dilations] + ) + self.convs1.apply(init_weights) + + self.convs2 = torch.nn.ModuleList( + [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) + xt = apply_mask(xt, x_mask) + xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE) + xt = apply_mask(xt, x_mask) + xt = c2(xt) + x = xt + x + return apply_mask(x, x_mask) + + def remove_weight_norm(self): + for conv in self.convs1 + self.convs2: + remove_weight_norm(conv) + + +class ResBlock1(ResBlockBase): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__(channels, kernel_size, dilation) + + +class ResBlock2(ResBlockBase): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__(channels, kernel_size, dilation) + + +class Log(torch.nn.Module): + """Logarithm module for flow-based models. + + This module computes the logarithm of the input and its log determinant. + During reverse, it computes the exponential of the input. + """ + + def forward(self, x, x_mask, reverse=False, **kwargs): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor. + x_mask (torch.Tensor): Mask tensor. + reverse (bool, optional): Whether to reverse the operation. Defaults to False. + """ + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(torch.nn.Module): + """Flip module for flow-based models. + + This module flips the input along the time dimension. + """ + + def forward(self, x, *args, reverse=False, **kwargs): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor. + reverse (bool, optional): Whether to reverse the operation. Defaults to False. + """ + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(torch.nn.Module): + """Elementwise affine transformation module for flow-based models. + + This module performs an elementwise affine transformation on the input. + + Args: + channels (int): Number of channels. + + """ + + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = torch.nn.Parameter(torch.zeros(channels, 1)) + self.logs = torch.nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor. 
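Editorial note (not part of the patch): the flow modules above (`Log`, `Flip`, `ElementwiseAffine`) come in forward/reverse pairs. As a sanity sketch, the `Log` transform below recovers its input when run forward and then in reverse, mirroring the class above with a dummy mask and strictly positive inputs:

```python
import torch

x = torch.rand(1, 2, 5) + 0.1     # strictly positive, well away from the 1e-5 clamp
x_mask = torch.ones(1, 1, 5)

# Forward: y = log(x), plus the log-determinant the flow accumulates.
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2])

# Reverse: exp undoes the log.
x_rec = torch.exp(y) * x_mask
print(torch.allclose(x, x_rec, atol=1e-6))  # True
print(logdet.shape)                          # torch.Size([1])
```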
+ x_mask (torch.Tensor): Mask tensor. + reverse (bool, optional): Whether to reverse the operation. Defaults to False. + """ + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingBlock(torch.nn.Module): + """Residual Coupling Block for normalizing flow. + + Args: + channels (int): Number of channels in the input. + hidden_channels (int): Number of hidden channels in the coupling layer. + kernel_size (int): Kernel size of the convolutional layers. + dilation_rate (int): Dilation rate of the convolutional layers. + n_layers (int): Number of layers in the coupling layer. + n_flows (int, optional): Number of coupling layers in the block. Defaults to 4. + gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0. + """ + + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super(ResidualCouplingBlock, self).__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = torch.nn.ModuleList() + for i in range(n_flows): + self.flows.append( + ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(Flip()) + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, + g: Optional[torch.Tensor] = None, + reverse: bool = False, + ): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow.forward(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + """Removes weight normalization from the coupling layers.""" + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + def __prepare_scriptable__(self): + """Prepares the module for scripting.""" + for i in range(self.n_flows): + for hook in self.flows[i * 2]._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flows[i * 2]) + + return self + + +class ResidualCouplingLayer(torch.nn.Module): + """Residual coupling layer for flow-based models. + + Args: + channels (int): Number of channels. + hidden_channels (int): Number of hidden channels. + kernel_size (int): Size of the convolutional kernel. + dilation_rate (int): Dilation rate of the convolution. + n_layers (int): Number of convolutional layers. + p_dropout (float, optional): Dropout probability. Defaults to 0. + gin_channels (int, optional): Number of conditioning channels. Defaults to 0. + mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False. 
+ """ + + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WaveNet( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = torch.nn.Conv1d( + hidden_channels, self.half_channels * (2 - mean_only), 1 + ) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + """Forward pass. + + Args: + x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps). + x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps). + g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps). + Defaults to None. + reverse (bool, optional): Whether to reverse the operation. Defaults to False. + """ + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + """Remove weight normalization from the module.""" + self.enc.remove_weight_norm() diff --git a/programs/applio_code/rvc/lib/algorithm/synthesizers.py b/programs/applio_code/rvc/lib/algorithm/synthesizers.py new file mode 100644 index 0000000000000000000000000000000000000000..161da9bac30c345e62155487027aa63ff946f5db --- /dev/null +++ b/programs/applio_code/rvc/lib/algorithm/synthesizers.py @@ -0,0 +1,243 @@ +import torch +from typing import Optional + +from programs.applio_code.rvc.lib.algorithm.nsf import GeneratorNSF +from programs.applio_code.rvc.lib.algorithm.generators import Generator +from programs.applio_code.rvc.lib.algorithm.commons import ( + slice_segments, + rand_slice_segments, +) +from programs.applio_code.rvc.lib.algorithm.residuals import ResidualCouplingBlock +from programs.applio_code.rvc.lib.algorithm.encoders import ( + TextEncoder, + PosteriorEncoder, +) + + +class Synthesizer(torch.nn.Module): + """ + Base Synthesizer model. + + Args: + spec_channels (int): Number of channels in the spectrogram. + segment_size (int): Size of the audio segment. + inter_channels (int): Number of channels in the intermediate layers. + hidden_channels (int): Number of channels in the hidden layers. + filter_channels (int): Number of channels in the filter layers. + n_heads (int): Number of attention heads. + n_layers (int): Number of layers in the encoder. + kernel_size (int): Size of the convolution kernel. + p_dropout (float): Dropout probability. + resblock (str): Type of residual block. + resblock_kernel_sizes (list): Kernel sizes for the residual blocks. + resblock_dilation_sizes (list): Dilation sizes for the residual blocks. 
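Editorial note (not part of the patch): the affine coupling step in `ResidualCouplingLayer` is invertible by construction. The sketch below applies the forward update and then the reverse update with the same (arbitrary, stand-in) `m` and `logs` to show that the transformed half is recovered exactly:

```python
import torch

half_channels, t = 3, 7
x1 = torch.randn(1, half_channels, t)      # half that gets transformed
x_mask = torch.ones(1, 1, t)

# Stand-ins for the WaveNet-predicted statistics (mean_only=False case).
m = torch.randn(1, half_channels, t)
logs = torch.randn(1, half_channels, t) * 0.1

# Forward direction of the coupling, as in forward() with reverse=False.
y1 = m + x1 * torch.exp(logs) * x_mask
logdet = torch.sum(logs, [1, 2])

# Reverse direction undoes it, as with reverse=True.
x1_rec = (y1 - m) * torch.exp(-logs) * x_mask
print(torch.allclose(x1, x1_rec, atol=1e-5))  # True
```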
+ upsample_rates (list): Upsampling rates for the decoder. + upsample_initial_channel (int): Number of channels in the initial upsampling layer. + upsample_kernel_sizes (list): Kernel sizes for the upsampling layers. + spk_embed_dim (int): Dimension of the speaker embedding. + gin_channels (int): Number of channels in the global conditioning vector. + sr (int): Sampling rate of the audio. + use_f0 (bool): Whether to use F0 information. + text_enc_hidden_dim (int): Hidden dimension for the text encoder. + kwargs: Additional keyword arguments. + """ + + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + use_f0, + text_enc_hidden_dim=768, + **kwargs + ): + super(Synthesizer, self).__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = float(p_dropout) + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.spk_embed_dim = spk_embed_dim + self.use_f0 = use_f0 + + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + float(p_dropout), + text_enc_hidden_dim, + f0=use_f0, + ) + + if use_f0: + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + else: + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels) + + def remove_weight_norm(self): + """Removes weight normalization from the model.""" + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def __prepare_scriptable__(self): + for hook in self.dec._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.dec) + for hook in self.flow._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + torch.nn.utils.remove_weight_norm(self.flow) + if hasattr(self, "enc_q"): + for hook in self.enc_q._forward_pre_hooks.values(): + if ( + hook.__module__ == "torch.nn.utils.parametrizations.weight_norm" + and hook.__class__.__name__ == "WeightNorm" + ): + 
torch.nn.utils.remove_weight_norm(self.enc_q) + return self + + @torch.jit.ignore + def forward( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: Optional[torch.Tensor] = None, + pitchf: Optional[torch.Tensor] = None, + y: torch.Tensor = None, + y_lengths: torch.Tensor = None, + ds: Optional[torch.Tensor] = None, + ): + """ + Forward pass of the model. + + Args: + phone (torch.Tensor): Phoneme sequence. + phone_lengths (torch.Tensor): Lengths of the phoneme sequences. + pitch (torch.Tensor, optional): Pitch sequence. + pitchf (torch.Tensor, optional): Fine-grained pitch sequence. + y (torch.Tensor, optional): Target spectrogram. + y_lengths (torch.Tensor, optional): Lengths of the target spectrograms. + ds (torch.Tensor, optional): Speaker embedding. Defaults to None. + """ + g = self.emb_g(ds).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + if y is not None: + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size) + if self.use_f0: + pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2) + o = self.dec(z_slice, pitchf, g=g) + else: + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + else: + return None, None, x_mask, None, (None, None, m_p, logs_p, None, None) + + @torch.jit.export + def infer( + self, + phone: torch.Tensor, + phone_lengths: torch.Tensor, + pitch: Optional[torch.Tensor] = None, + nsff0: Optional[torch.Tensor] = None, + sid: torch.Tensor = None, + rate: Optional[torch.Tensor] = None, + ): + """ + Inference of the model. + + Args: + phone (torch.Tensor): Phoneme sequence. + phone_lengths (torch.Tensor): Lengths of the phoneme sequences. + pitch (torch.Tensor, optional): Pitch sequence. + nsff0 (torch.Tensor, optional): Fine-grained pitch sequence. + sid (torch.Tensor): Speaker embedding. + rate (torch.Tensor, optional): Rate for time-stretching. Defaults to None. 
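Editorial note (not part of the patch): during training, `Synthesizer.forward` decodes only a random latent window chosen per example. A small sketch of that windowing logic, re-implemented inline with invented sizes, mirroring what `rand_slice_segments`/`slice_segments` do for a 3-D tensor:

```python
import torch

torch.manual_seed(0)
b, d, t, segment_size = 2, 4, 50, 8

z = torch.randn(b, d, t)
z_lengths = torch.tensor([50, 40])

# Pick a random valid start frame per example, as rand_slice_segments does.
ids_str_max = z_lengths - segment_size + 1
ids_str = (torch.rand(b) * ids_str_max).long()

# Slice the same window out of each example's latent (dim=3 case of slice_segments).
z_slice = torch.stack(
    [z[i, :, s : s + segment_size] for i, s in enumerate(ids_str.tolist())]
)
print(ids_str, z_slice.shape)  # e.g. tensor([..., ...]) torch.Size([2, 4, 8])
```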
+ """ + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate is not None: + assert isinstance(rate, torch.Tensor) + head = int(z_p.shape[2] * (1.0 - rate.item())) + z_p = z_p[:, :, head:] + x_mask = x_mask[:, :, head:] + if self.use_f0: + nsff0 = nsff0[:, head:] + if self.use_f0: + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + else: + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/programs/applio_code/rvc/lib/predictors/F0Extractor.py b/programs/applio_code/rvc/lib/predictors/F0Extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..d5cfb6a549bf7fd50d2375b1b9200211ca1b5654 --- /dev/null +++ b/programs/applio_code/rvc/lib/predictors/F0Extractor.py @@ -0,0 +1,107 @@ +import dataclasses +import pathlib +import libf0 +import librosa +import numpy as np +import resampy +import torch +import torchcrepe +import torchfcpe +import os + +# from tools.anyf0.rmvpe import RMVPE +from programs.applio_code.rvc.lib.predictors.RMVPE import RMVPE0Predictor +from programs.applio_code.rvc.configs.config import Config + +config = Config() + + +@dataclasses.dataclass +class F0Extractor: + wav_path: pathlib.Path + sample_rate: int = 44100 + hop_length: int = 512 + f0_min: int = 50 + f0_max: int = 1600 + method: str = "rmvpe" + x: np.ndarray = dataclasses.field(init=False) + + def __post_init__(self): + self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate) + + @property + def hop_size(self) -> float: + return self.hop_length / self.sample_rate + + @property + def wav16k(self) -> np.ndarray: + return resampy.resample(self.x, self.sample_rate, 16000) + + def extract_f0(self) -> np.ndarray: + f0 = None + method = self.method + # Fall back to CPU for ZLUDA as these methods use CUcFFT + device = ( + "cpu" + if "cuda" in config.device + and torch.cuda.get_device_name().endswith("[ZLUDA]") + else config.device + ) + + if method == "crepe": + wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(device) + f0 = torchcrepe.predict( + wav16k_torch, + sample_rate=16000, + hop_length=160, + batch_size=512, + fmin=self.f0_min, + fmax=self.f0_max, + device=device, + ) + f0 = f0[0].cpu().numpy() + elif method == "fcpe": + audio = librosa.to_mono(self.x) + audio_length = len(audio) + f0_target_length = (audio_length // self.hop_length) + 1 + audio = ( + torch.from_numpy(audio).float().unsqueeze(0).unsqueeze(-1).to(device) + ) + model = torchfcpe.spawn_bundled_infer_model(device=device) + + f0 = model.infer( + audio, + sr=self.sample_rate, + decoder_mode="local_argmax", + threshold=0.006, + f0_min=self.f0_min, + f0_max=self.f0_max, + interp_uv=False, + output_interp_target_length=f0_target_length, + ) + f0 = f0.squeeze().cpu().numpy() + elif method == "rmvpe": + is_half = False if device == "cpu" else config.is_half + model_rmvpe = RMVPE0Predictor( + os.path.join( + "programs", "applio_code", "rvc", "models", "predictors", "rmvpe.pt" + ), + is_half=is_half, + device=device, + # hop_length=80 + ) + f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03) + + else: + raise ValueError(f"Unknown method: {self.method}") + return libf0.hz_to_cents(f0, librosa.midi_to_hz(0)) + + def plot_f0(self, f0): + from matplotlib import pyplot as plt + + plt.figure(figsize=(10, 4)) + plt.plot(f0) + plt.title(self.method) + 
plt.xlabel("Time (frames)") + plt.ylabel("F0 (cents)") + plt.show() diff --git a/programs/applio_code/rvc/lib/predictors/FCPE.py b/programs/applio_code/rvc/lib/predictors/FCPE.py new file mode 100644 index 0000000000000000000000000000000000000000..12f6c346aa5d448a2133400a09e103043b5863c8 --- /dev/null +++ b/programs/applio_code/rvc/lib/predictors/FCPE.py @@ -0,0 +1,920 @@ +from typing import Union + +import torch.nn.functional as F +import numpy as np +import torch +import torch.nn as nn +from torch.nn.utils.parametrizations import weight_norm +from torchaudio.transforms import Resample +import os +import librosa +import soundfile as sf +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn +import math +from functools import partial + +from einops import rearrange, repeat +from local_attention import LocalAttention +from torch import nn + +os.environ["LRU_CACHE_CAPACITY"] = "3" + + +def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False): + """Loads wav file to torch tensor.""" + try: + data, sample_rate = sf.read(full_path, always_2d=True) + except Exception as error: + print(f"An error occurred loading {full_path}: {error}") + if return_empty_on_exception: + return [], sample_rate or target_sr or 48000 + else: + raise + + data = data[:, 0] if len(data.shape) > 1 else data + assert len(data) > 2 + + # Normalize data + max_mag = ( + -np.iinfo(data.dtype).min + if np.issubdtype(data.dtype, np.integer) + else max(np.amax(data), -np.amin(data)) + ) + max_mag = ( + (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0) + ) + data = torch.FloatTensor(data.astype(np.float32)) / max_mag + + # Handle exceptions and resample + if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception: + return [], sample_rate or target_sr or 48000 + if target_sr is not None and sample_rate != target_sr: + data = torch.from_numpy( + librosa.core.resample( + data.numpy(), orig_sr=sample_rate, target_sr=target_sr + ) + ) + sample_rate = target_sr + + return data, sample_rate + + +def dynamic_range_compression(x, C=1, clip_val=1e-5): + return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) + + +def dynamic_range_decompression(x, C=1): + return np.exp(x) / C + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + return torch.exp(x) / C + + +class STFT: + def __init__( + self, + sr=22050, + n_mels=80, + n_fft=1024, + win_size=1024, + hop_length=256, + fmin=20, + fmax=11025, + clip_val=1e-5, + ): + self.target_sr = sr + self.n_mels = n_mels + self.n_fft = n_fft + self.win_size = win_size + self.hop_length = hop_length + self.fmin = fmin + self.fmax = fmax + self.clip_val = clip_val + self.mel_basis = {} + self.hann_window = {} + + def get_mel(self, y, keyshift=0, speed=1, center=False, train=False): + sample_rate = self.target_sr + n_mels = self.n_mels + n_fft = self.n_fft + win_size = self.win_size + hop_length = self.hop_length + fmin = self.fmin + fmax = self.fmax + clip_val = self.clip_val + + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(n_fft * factor)) + win_size_new = int(np.round(win_size * factor)) + hop_length_new = int(np.round(hop_length * speed)) + + # Optimize mel_basis and hann_window caching + mel_basis = self.mel_basis if not train else {} + hann_window = self.hann_window if not train else {} + + mel_basis_key = str(fmax) + "_" + str(y.device) + if mel_basis_key not in mel_basis: + 
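Editorial note (not part of the patch): in `STFT.get_mel` above, a key shift rescales the FFT and window sizes by a semitone factor before the STFT. A short numeric sketch of that bookkeeping (parameter values assumed):

```python
import numpy as np

n_fft, win_size, hop_length = 1024, 1024, 256
keyshift, speed = 2, 1.0           # shift up two semitones, no time stretch

factor = 2 ** (keyshift / 12)      # one semitone = 2**(1/12)
n_fft_new = int(np.round(n_fft * factor))
win_size_new = int(np.round(win_size * factor))
hop_length_new = int(np.round(hop_length * speed))

print(round(factor, 3))                         # 1.122
print(n_fft_new, win_size_new, hop_length_new)  # 1149 1149 256
```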
mel = librosa_mel_fn( + sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax + ) + mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device) + + keyshift_key = str(keyshift) + "_" + str(y.device) + if keyshift_key not in hann_window: + hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device) + + # Padding and STFT + pad_left = (win_size_new - hop_length_new) // 2 + pad_right = max( + (win_size_new - hop_length_new + 1) // 2, + win_size_new - y.size(-1) - pad_left, + ) + mode = "reflect" if pad_right < y.size(-1) else "constant" + y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode) + y = y.squeeze(1) + + spec = torch.stft( + y, + n_fft_new, + hop_length=hop_length_new, + win_length=win_size_new, + window=hann_window[keyshift_key], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9)) + + # Handle keyshift and mel conversion + if keyshift != 0: + size = n_fft // 2 + 1 + resize = spec.size(1) + spec = ( + F.pad(spec, (0, 0, 0, size - resize)) + if resize < size + else spec[:, :size, :] + ) + spec = spec * win_size / win_size_new + spec = torch.matmul(mel_basis[mel_basis_key], spec) + spec = dynamic_range_compression_torch(spec, clip_val=clip_val) + return spec + + def __call__(self, audiopath): + audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr) + spect = self.get_mel(audio.unsqueeze(0)).squeeze(0) + return spect + + +stft = STFT() + + +def softmax_kernel( + data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None +): + b, h, *_ = data.shape + + # Normalize data + data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0 + + # Project data + ratio = projection_matrix.shape[0] ** -0.5 + projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h) + projection = projection.type_as(data) + data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection) + + # Calculate diagonal data + diag_data = data**2 + diag_data = torch.sum(diag_data, dim=-1) + diag_data = (diag_data / 2.0) * (data_normalizer**2) + diag_data = diag_data.unsqueeze(dim=-1) + + # Apply softmax + if is_query: + data_dash = ratio * ( + torch.exp( + data_dash + - diag_data + - torch.max(data_dash, dim=-1, keepdim=True).values + ) + + eps + ) + else: + data_dash = ratio * (torch.exp(data_dash - diag_data + eps)) + + return data_dash.type_as(data) + + +def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None): + unstructured_block = torch.randn((cols, cols), device=device) + q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced") + q, r = map(lambda t: t.to(device), (q, r)) + + if qr_uniform_q: + d = torch.diag(r, 0) + q *= d.sign() + return q.t() + + +def exists(val): + return val is not None + + +def empty(tensor): + return tensor.numel() == 0 + + +def default(val, d): + return val if exists(val) else d + + +def cast_tuple(val): + return (val,) if not isinstance(val, tuple) else val + + +class PCmer(nn.Module): + def __init__( + self, + num_layers, + num_heads, + dim_model, + dim_keys, + dim_values, + residual_dropout, + attention_dropout, + ): + super().__init__() + self.num_layers = num_layers + self.num_heads = num_heads + self.dim_model = dim_model + self.dim_values = dim_values + self.dim_keys = dim_keys + self.residual_dropout = residual_dropout + self.attention_dropout = attention_dropout + + self._layers = nn.ModuleList([_EncoderLayer(self) for _ in 
range(num_layers)]) + + def forward(self, phone, mask=None): + for layer in self._layers: + phone = layer(phone, mask) + return phone + + +class _EncoderLayer(nn.Module): + def __init__(self, parent: PCmer): + super().__init__() + self.conformer = ConformerConvModule(parent.dim_model) + self.norm = nn.LayerNorm(parent.dim_model) + self.dropout = nn.Dropout(parent.residual_dropout) + self.attn = SelfAttention( + dim=parent.dim_model, heads=parent.num_heads, causal=False + ) + + def forward(self, phone, mask=None): + phone = phone + (self.attn(self.norm(phone), mask=mask)) + phone = phone + (self.conformer(phone)) + return phone + + +def calc_same_padding(kernel_size): + pad = kernel_size // 2 + return (pad, pad - (kernel_size + 1) % 2) + + +class Swish(nn.Module): + def forward(self, x): + return x * x.sigmoid() + + +class Transpose(nn.Module): + def __init__(self, dims): + super().__init__() + assert len(dims) == 2, "dims must be a tuple of two dimensions" + self.dims = dims + + def forward(self, x): + return x.transpose(*self.dims) + + +class GLU(nn.Module): + def __init__(self, dim): + super().__init__() + self.dim = dim + + def forward(self, x): + out, gate = x.chunk(2, dim=self.dim) + return out * gate.sigmoid() + + +class DepthWiseConv1d(nn.Module): + def __init__(self, chan_in, chan_out, kernel_size, padding): + super().__init__() + self.padding = padding + self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in) + + def forward(self, x): + x = F.pad(x, self.padding) + return self.conv(x) + + +class ConformerConvModule(nn.Module): + def __init__( + self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0 + ): + super().__init__() + + inner_dim = dim * expansion_factor + padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0) + + self.net = nn.Sequential( + nn.LayerNorm(dim), + Transpose((1, 2)), + nn.Conv1d(dim, inner_dim * 2, 1), + GLU(dim=1), + DepthWiseConv1d( + inner_dim, inner_dim, kernel_size=kernel_size, padding=padding + ), + Swish(), + nn.Conv1d(inner_dim, dim, 1), + Transpose((1, 2)), + nn.Dropout(dropout), + ) + + def forward(self, x): + return self.net(x) + + +def linear_attention(q, k, v): + if v is None: + out = torch.einsum("...ed,...nd->...ne", k, q) + return out + else: + k_cumsum = k.sum(dim=-2) + D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8) + context = torch.einsum("...nd,...ne->...de", k, v) + out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv) + return out + + +def gaussian_orthogonal_random_matrix( + nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None +): + nb_full_blocks = int(nb_rows / nb_columns) + block_list = [] + + for _ in range(nb_full_blocks): + q = orthogonal_matrix_chunk( + nb_columns, qr_uniform_q=qr_uniform_q, device=device + ) + block_list.append(q) + + remaining_rows = nb_rows - nb_full_blocks * nb_columns + if remaining_rows > 0: + q = orthogonal_matrix_chunk( + nb_columns, qr_uniform_q=qr_uniform_q, device=device + ) + block_list.append(q[:remaining_rows]) + + final_matrix = torch.cat(block_list) + + if scaling == 0: + multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1) + elif scaling == 1: + multiplier = math.sqrt((float(nb_columns))) * torch.ones( + (nb_rows,), device=device + ) + else: + raise ValueError(f"Invalid scaling {scaling}") + + return torch.diag(multiplier) @ final_matrix + + +class FastAttention(nn.Module): + def __init__( + self, + dim_heads, + nb_features=None, + ortho_scaling=0, + causal=False, 
+ generalized_attention=False, + kernel_fn=nn.ReLU(), + qr_uniform_q=False, + no_projection=False, + ): + super().__init__() + nb_features = default(nb_features, int(dim_heads * math.log(dim_heads))) + + self.dim_heads = dim_heads + self.nb_features = nb_features + self.ortho_scaling = ortho_scaling + + self.create_projection = partial( + gaussian_orthogonal_random_matrix, + nb_rows=self.nb_features, + nb_columns=dim_heads, + scaling=ortho_scaling, + qr_uniform_q=qr_uniform_q, + ) + projection_matrix = self.create_projection() + self.register_buffer("projection_matrix", projection_matrix) + + self.generalized_attention = generalized_attention + self.kernel_fn = kernel_fn + self.no_projection = no_projection + self.causal = causal + + @torch.no_grad() + def redraw_projection_matrix(self): + projections = self.create_projection() + self.projection_matrix.copy_(projections) + del projections + + def forward(self, q, k, v): + device = q.device + + if self.no_projection: + q = q.softmax(dim=-1) + k = torch.exp(k) if self.causal else k.softmax(dim=-2) + else: + create_kernel = partial( + softmax_kernel, projection_matrix=self.projection_matrix, device=device + ) + q = create_kernel(q, is_query=True) + k = create_kernel(k, is_query=False) + + attn_fn = linear_attention if not self.causal else self.causal_linear_fn + + if v is None: + out = attn_fn(q, k, None) + return out + else: + out = attn_fn(q, k, v) + return out + + +class SelfAttention(nn.Module): + def __init__( + self, + dim, + causal=False, + heads=8, + dim_head=64, + local_heads=0, + local_window_size=256, + nb_features=None, + feature_redraw_interval=1000, + generalized_attention=False, + kernel_fn=nn.ReLU(), + qr_uniform_q=False, + dropout=0.0, + no_projection=False, + ): + super().__init__() + assert dim % heads == 0, "dimension must be divisible by number of heads" + dim_head = default(dim_head, dim // heads) + inner_dim = dim_head * heads + self.fast_attention = FastAttention( + dim_head, + nb_features, + causal=causal, + generalized_attention=generalized_attention, + kernel_fn=kernel_fn, + qr_uniform_q=qr_uniform_q, + no_projection=no_projection, + ) + + self.heads = heads + self.global_heads = heads - local_heads + self.local_attn = ( + LocalAttention( + window_size=local_window_size, + causal=causal, + autopad=True, + dropout=dropout, + look_forward=int(not causal), + rel_pos_emb_config=(dim_head, local_heads), + ) + if local_heads > 0 + else None + ) + + self.to_q = nn.Linear(dim, inner_dim) + self.to_k = nn.Linear(dim, inner_dim) + self.to_v = nn.Linear(dim, inner_dim) + self.to_out = nn.Linear(inner_dim, dim) + self.dropout = nn.Dropout(dropout) + + @torch.no_grad() + def redraw_projection_matrix(self): + self.fast_attention.redraw_projection_matrix() + + def forward( + self, + x, + context=None, + mask=None, + context_mask=None, + name=None, + inference=False, + **kwargs, + ): + _, _, _, h, gh = *x.shape, self.heads, self.global_heads + + cross_attend = exists(context) + context = default(context, x) + context_mask = default(context_mask, mask) if not cross_attend else context_mask + q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) + (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v)) + + attn_outs = [] + if not empty(q): + if exists(context_mask): + global_mask = context_mask[:, None, :, None] + v.masked_fill_(~global_mask, 0.0) + if cross_attend: + pass # TODO: Implement cross-attention + else: + out = 
self.fast_attention(q, k, v) + attn_outs.append(out) + + if not empty(lq): + assert ( + not cross_attend + ), "local attention is not compatible with cross attention" + out = self.local_attn(lq, lk, lv, input_mask=mask) + attn_outs.append(out) + + out = torch.cat(attn_outs, dim=1) + out = rearrange(out, "b h n d -> b n (h d)") + out = self.to_out(out) + return self.dropout(out) + + +def l2_regularization(model, l2_alpha): + l2_loss = [] + for module in model.modules(): + if type(module) is nn.Conv2d: + l2_loss.append((module.weight**2).sum() / 2.0) + return l2_alpha * sum(l2_loss) + + +class FCPE(nn.Module): + def __init__( + self, + input_channel=128, + out_dims=360, + n_layers=12, + n_chans=512, + use_siren=False, + use_full=False, + loss_mse_scale=10, + loss_l2_regularization=False, + loss_l2_regularization_scale=1, + loss_grad1_mse=False, + loss_grad1_mse_scale=1, + f0_max=1975.5, + f0_min=32.70, + confidence=False, + threshold=0.05, + use_input_conv=True, + ): + super().__init__() + if use_siren is True: + raise ValueError("Siren is not supported yet.") + if use_full is True: + raise ValueError("Full model is not supported yet.") + + self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10 + self.loss_l2_regularization = ( + loss_l2_regularization if (loss_l2_regularization is not None) else False + ) + self.loss_l2_regularization_scale = ( + loss_l2_regularization_scale + if (loss_l2_regularization_scale is not None) + else 1 + ) + self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False + self.loss_grad1_mse_scale = ( + loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1 + ) + self.f0_max = f0_max if (f0_max is not None) else 1975.5 + self.f0_min = f0_min if (f0_min is not None) else 32.70 + self.confidence = confidence if (confidence is not None) else False + self.threshold = threshold if (threshold is not None) else 0.05 + self.use_input_conv = use_input_conv if (use_input_conv is not None) else True + + self.cent_table_b = torch.Tensor( + np.linspace( + self.f0_to_cent(torch.Tensor([f0_min]))[0], + self.f0_to_cent(torch.Tensor([f0_max]))[0], + out_dims, + ) + ) + self.register_buffer("cent_table", self.cent_table_b) + + # conv in stack + _leaky = nn.LeakyReLU() + self.stack = nn.Sequential( + nn.Conv1d(input_channel, n_chans, 3, 1, 1), + nn.GroupNorm(4, n_chans), + _leaky, + nn.Conv1d(n_chans, n_chans, 3, 1, 1), + ) + + # transformer + self.decoder = PCmer( + num_layers=n_layers, + num_heads=8, + dim_model=n_chans, + dim_keys=n_chans, + dim_values=n_chans, + residual_dropout=0.1, + attention_dropout=0.1, + ) + self.norm = nn.LayerNorm(n_chans) + + # out + self.n_out = out_dims + self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out)) + + def forward( + self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax" + ): + if cdecoder == "argmax": + self.cdecoder = self.cents_decoder + elif cdecoder == "local_argmax": + self.cdecoder = self.cents_local_decoder + + x = ( + self.stack(mel.transpose(1, 2)).transpose(1, 2) + if self.use_input_conv + else mel + ) + x = self.decoder(x) + x = self.norm(x) + x = self.dense_out(x) + x = torch.sigmoid(x) + + if not infer: + gt_cent_f0 = self.f0_to_cent(gt_f0) + gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0) + loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0) + if self.loss_l2_regularization: + loss_all = loss_all + l2_regularization( + model=self, l2_alpha=self.loss_l2_regularization_scale + ) + x = loss_all + if infer: + x = 
self.cdecoder(x) + x = self.cent_to_f0(x) + x = (1 + x / 700).log() if not return_hz_f0 else x + + return x + + def cents_decoder(self, y, mask=True): + B, N, _ = y.size() + ci = self.cent_table[None, None, :].expand(B, N, -1) + rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum( + y, dim=-1, keepdim=True + ) + if mask: + confident = torch.max(y, dim=-1, keepdim=True)[0] + confident_mask = torch.ones_like(confident) + confident_mask[confident <= self.threshold] = float("-INF") + rtn = rtn * confident_mask + return (rtn, confident) if self.confidence else rtn + + def cents_local_decoder(self, y, mask=True): + B, N, _ = y.size() + ci = self.cent_table[None, None, :].expand(B, N, -1) + confident, max_index = torch.max(y, dim=-1, keepdim=True) + local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4) + local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1) + ci_l = torch.gather(ci, -1, local_argmax_index) + y_l = torch.gather(y, -1, local_argmax_index) + rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum( + y_l, dim=-1, keepdim=True + ) + if mask: + confident_mask = torch.ones_like(confident) + confident_mask[confident <= self.threshold] = float("-INF") + rtn = rtn * confident_mask + return (rtn, confident) if self.confidence else rtn + + def cent_to_f0(self, cent): + return 10.0 * 2 ** (cent / 1200.0) + + def f0_to_cent(self, f0): + return 1200.0 * torch.log2(f0 / 10.0) + + def gaussian_blurred_cent(self, cents): + mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0))) + B, N, _ = cents.size() + ci = self.cent_table[None, None, :].expand(B, N, -1) + return torch.exp(-torch.square(ci - cents) / 1250) * mask.float() + + +class FCPEInfer: + def __init__(self, model_path, device=None, dtype=torch.float32): + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + ckpt = torch.load(model_path, map_location=torch.device(self.device)) + self.args = DotDict(ckpt["config"]) + self.dtype = dtype + model = FCPE( + input_channel=self.args.model.input_channel, + out_dims=self.args.model.out_dims, + n_layers=self.args.model.n_layers, + n_chans=self.args.model.n_chans, + use_siren=self.args.model.use_siren, + use_full=self.args.model.use_full, + loss_mse_scale=self.args.loss.loss_mse_scale, + loss_l2_regularization=self.args.loss.loss_l2_regularization, + loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale, + loss_grad1_mse=self.args.loss.loss_grad1_mse, + loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale, + f0_max=self.args.model.f0_max, + f0_min=self.args.model.f0_min, + confidence=self.args.model.confidence, + ) + model.to(self.device).to(self.dtype) + model.load_state_dict(ckpt["model"]) + model.eval() + self.model = model + self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device) + + @torch.no_grad() + def __call__(self, audio, sr, threshold=0.05): + self.model.threshold = threshold + audio = audio[None, :] + mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype) + f0 = self.model(mel=mel, infer=True, return_hz_f0=True) + return f0 + + +class Wav2Mel: + def __init__(self, args, device=None, dtype=torch.float32): + self.sample_rate = args.mel.sampling_rate + self.hop_size = args.mel.hop_size + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.dtype = dtype + self.stft = STFT( + args.mel.sampling_rate, + args.mel.num_mels, + args.mel.n_fft, + args.mel.win_size, + 
args.mel.hop_size, + args.mel.fmin, + args.mel.fmax, + ) + self.resample_kernel = {} + + def extract_nvstft(self, audio, keyshift=0, train=False): + mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2) + return mel + + def extract_mel(self, audio, sample_rate, keyshift=0, train=False): + audio = audio.to(self.dtype).to(self.device) + if sample_rate == self.sample_rate: + audio_res = audio + else: + key_str = str(sample_rate) + if key_str not in self.resample_kernel: + self.resample_kernel[key_str] = Resample( + sample_rate, self.sample_rate, lowpass_filter_width=128 + ) + self.resample_kernel[key_str] = ( + self.resample_kernel[key_str].to(self.dtype).to(self.device) + ) + audio_res = self.resample_kernel[key_str](audio) + + mel = self.extract_nvstft( + audio_res, keyshift=keyshift, train=train + ) # B, n_frames, bins + n_frames = int(audio.shape[1] // self.hop_size) + 1 + mel = ( + torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel + ) + mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel + return mel + + def __call__(self, audio, sample_rate, keyshift=0, train=False): + return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train) + + +class DotDict(dict): + def __getattr__(*args): + val = dict.get(*args) + return DotDict(val) if type(val) is dict else val + + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + +class F0Predictor(object): + def compute_f0(self, wav, p_len): + pass + + def compute_f0_uv(self, wav, p_len): + pass + + +class FCPEF0Predictor(F0Predictor): + def __init__( + self, + model_path, + hop_length=512, + f0_min=50, + f0_max=1100, + dtype=torch.float32, + device=None, + sample_rate=44100, + threshold=0.05, + ): + self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype) + self.hop_length = hop_length + self.f0_min = f0_min + self.f0_max = f0_max + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.threshold = threshold + self.sample_rate = sample_rate + self.dtype = dtype + self.name = "fcpe" + + def repeat_expand( + self, + content: Union[torch.Tensor, np.ndarray], + target_len: int, + mode: str = "nearest", + ): + ndim = content.ndim + content = ( + content[None, None] + if ndim == 1 + else content[None] if ndim == 2 else content + ) + assert content.ndim == 3 + is_np = isinstance(content, np.ndarray) + content = torch.from_numpy(content) if is_np else content + results = torch.nn.functional.interpolate(content, size=target_len, mode=mode) + results = results.numpy() if is_np else results + return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results + + def post_process(self, x, sample_rate, f0, pad_to): + f0 = ( + torch.from_numpy(f0).float().to(x.device) + if isinstance(f0, np.ndarray) + else f0 + ) + f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0 + + vuv_vector = torch.zeros_like(f0) + vuv_vector[f0 > 0.0] = 1.0 + vuv_vector[f0 <= 0.0] = 0.0 + + nzindex = torch.nonzero(f0).squeeze() + f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy() + time_org = self.hop_length / sample_rate * nzindex.cpu().numpy() + time_frame = np.arange(pad_to) * self.hop_length / sample_rate + + vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0] + + if f0.shape[0] <= 0: + return np.zeros(pad_to), vuv_vector.cpu().numpy() + if f0.shape[0] == 1: + return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy() + + f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1]) + return f0, 
vuv_vector.cpu().numpy() + + def compute_f0(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len), ( + f0.cpu().numpy() if p_len is None else np.zeros(p_len) + ) + return self.post_process(x, self.sample_rate, f0, p_len)[0] + + def compute_f0_uv(self, wav, p_len=None): + x = torch.FloatTensor(wav).to(self.dtype).to(self.device) + p_len = x.shape[0] // self.hop_length if p_len is None else p_len + f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0] + if torch.all(f0 == 0): + return f0.cpu().numpy() if p_len is None else np.zeros(p_len), ( + f0.cpu().numpy() if p_len is None else np.zeros(p_len) + ) + return self.post_process(x, self.sample_rate, f0, p_len) diff --git a/programs/applio_code/rvc/lib/predictors/RMVPE.py b/programs/applio_code/rvc/lib/predictors/RMVPE.py new file mode 100644 index 0000000000000000000000000000000000000000..7e9f6ddd4bb061164c2910e8c4216ef51acd3503 --- /dev/null +++ b/programs/applio_code/rvc/lib/predictors/RMVPE.py @@ -0,0 +1,569 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + +from librosa.filters import mel +from typing import List + +# Constants for readability +N_MELS = 128 +N_CLASS = 360 + + +# Define a helper function for creating convolutional blocks +class ConvBlockRes(nn.Module): + """ + A convolutional block with residual connection. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + momentum (float): Momentum for batch normalization. + """ + + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +# Define a class for residual encoder blocks +class ResEncoderBlock(nn.Module): + """ + A residual encoder block. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in the block. + momentum (float): Momentum for batch normalization. 
+ """ + + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +# Define a class for the encoder +class Encoder(nn.Module): + """ + The encoder part of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + in_size (int): Size of the input tensor. + n_encoders (int): Number of encoder blocks. + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in each encoder block. + out_channels (int): Number of output channels for the first encoder block. + momentum (float): Momentum for batch normalization. + """ + + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x: torch.Tensor): + concat_tensors: List[torch.Tensor] = [] + x = self.bn(x) + for i in range(self.n_encoders): + t, x = self.layers[i](x) + concat_tensors.append(t) + return x, concat_tensors + + +# Define a class for the intermediate layer +class Intermediate(nn.Module): + """ + The intermediate layer of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + n_inters (int): Number of convolutional blocks in the intermediate layer. + n_blocks (int): Number of convolutional blocks in each intermediate block. + momentum (float): Momentum for batch normalization. + """ + + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for _ in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +# Define a class for residual decoder blocks +class ResDecoderBlock(nn.Module): + """ + A residual decoder block. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (tuple): Stride for transposed convolution. + n_blocks (int): Number of convolutional blocks in the block. + momentum (float): Momentum for batch normalization. 
+ """ + + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +# Define a class for the decoder +class Decoder(nn.Module): + """ + The decoder part of the DeepUnet. + + Args: + in_channels (int): Number of input channels. + n_decoders (int): Number of decoder blocks. + stride (tuple): Stride for transposed convolution. + n_blocks (int): Number of convolutional blocks in each decoder block. + momentum (float): Momentum for batch normalization. + """ + + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for _ in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +# Define a class for the DeepUnet architecture +class DeepUnet(nn.Module): + """ + The DeepUnet architecture. + + Args: + kernel_size (tuple): Size of the average pooling kernel. + n_blocks (int): Number of convolutional blocks in each encoder/decoder block. + en_de_layers (int): Number of encoder/decoder layers. + inter_layers (int): Number of convolutional blocks in the intermediate layer. + in_channels (int): Number of input channels. + en_out_channels (int): Number of output channels for the first encoder block. + """ + + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +# Define a class for the end-to-end model +class E2E(nn.Module): + """ + The end-to-end model. + + Args: + n_blocks (int): Number of convolutional blocks in each encoder/decoder block. + n_gru (int): Number of GRU layers. + kernel_size (tuple): Size of the average pooling kernel. + en_de_layers (int): Number of encoder/decoder layers. + inter_layers (int): Number of convolutional blocks in the intermediate layer. + in_channels (int): Number of input channels. + en_out_channels (int): Number of output channels for the first encoder block. 
+ """ + + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, N_CLASS), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +# Define a class for the MelSpectrogram extractor +class MelSpectrogram(torch.nn.Module): + """ + Extracts Mel-spectrogram features from audio. + + Args: + is_half (bool): Whether to use half-precision floating-point numbers. + n_mel_channels (int): Number of Mel-frequency bands. + sample_rate (int): Sampling rate of the audio. + win_length (int): Length of the window function in samples. + hop_length (int): Hop size between frames in samples. + n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length. + mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0. + mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None. + clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5. + """ + + def __init__( + self, + is_half, + n_mel_channels, + sample_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sample_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sample_rate = sample_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + + # Zluda, fall-back to CPU for FFTs since HIP SDK has no cuFFT alternative + source_device = audio.device + if audio.device.type == "cuda" and torch.cuda.get_device_name().endswith( + "[ZLUDA]" + ): + audio = audio.to("cpu") + self.hann_window[keyshift_key] = self.hann_window[keyshift_key].to("cpu") + + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ).to(source_device) + + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 
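
(Illustrative aside, not part of the patch: the classes in this file are normally driven through RMVPE0Predictor, defined further below; a minimal sketch, assuming the rmvpe.pt checkpoint is available locally and the input is 16 kHz mono audio.)

import numpy as np

predictor = RMVPE0Predictor("rmvpe.pt", is_half=False, device="cpu")  # placeholder checkpoint path
audio_16k = np.zeros(16000, dtype=np.float32)  # placeholder: one second of silence
f0_hz = predictor.infer_from_audio(audio_16k, thred=0.03)  # per-frame F0 in Hz, 0 where unvoiced
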
+ mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +# Define a class for the RMVPE0 predictor +class RMVPE0Predictor: + """ + A predictor for fundamental frequency (F0) based on the RMVPE0 model. + + Args: + model_path (str): Path to the RMVPE0 model file. + is_half (bool): Whether to use half-precision floating-point numbers. + device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available. + """ + + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half: + model = model.half() + self.model = model + self.resample_kernel = {} + self.is_half = is_half + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, N_MELS, 16000, 1024, 160, None, 30, 8000 + ).to(device) + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) + + def mel2hidden(self, mel): + """ + Converts Mel-spectrogram features to hidden representation. + + Args: + mel (torch.Tensor): Mel-spectrogram features. + """ + with torch.no_grad(): + n_frames = mel.shape[-1] + mel = F.pad( + mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" + ) + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + """ + Decodes hidden representation to F0. + + Args: + hidden (np.ndarray): Hidden representation. + thred (float, optional): Threshold for salience. Defaults to 0.03. + """ + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + return f0 + + def infer_from_audio(self, audio, thred=0.03): + """ + Infers F0 from audio. + + Args: + audio (np.ndarray): Audio signal. + thred (float, optional): Threshold for salience. Defaults to 0.03. + """ + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + mel = self.mel_extractor(audio, center=True) + hidden = self.mel2hidden(mel) + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + """ + Converts salience to local average cents. + + Args: + salience (np.ndarray): Salience values. + thred (float, optional): Threshold for salience. Defaults to 0.05. + """ + center = np.argmax(salience, axis=1) + salience = np.pad(salience, ((0, 0), (4, 4))) + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + todo_salience = np.array(todo_salience) + todo_cents_mapping = np.array(todo_cents_mapping) + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) + devided = product_sum / weight_sum + maxx = np.max(salience, axis=1) + devided[maxx <= thred] = 0 + return devided + + +# Define a class for BiGRU (bidirectional GRU) +class BiGRU(nn.Module): + """ + A bidirectional GRU layer. + + Args: + input_features (int): Number of input features. 
+ hidden_features (int): Number of hidden features. + num_layers (int): Number of GRU layers. + """ + + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] diff --git a/programs/applio_code/rvc/lib/tools/analyzer.py b/programs/applio_code/rvc/lib/tools/analyzer.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b794348082b168132dda0a23933c6d633f0097 --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/analyzer.py @@ -0,0 +1,76 @@ +import numpy as np +import matplotlib.pyplot as plt +import librosa.display +import librosa + + +def calculate_features(y, sr): + stft = np.abs(librosa.stft(y)) + duration = librosa.get_duration(y=y, sr=sr) + cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0] + bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0] + rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0] + return stft, duration, cent, bw, rolloff + + +def plot_title(title): + plt.suptitle(title, fontsize=16, fontweight="bold") + + +def plot_spectrogram(y, sr, stft, duration, cmap="inferno"): + plt.subplot(3, 1, 1) + plt.imshow( + librosa.amplitude_to_db(stft, ref=np.max), + origin="lower", + extent=[0, duration, 0, sr / 1000], + aspect="auto", + cmap=cmap, # Change the colormap here + ) + plt.colorbar(format="%+2.0f dB") + plt.xlabel("Time (s)") + plt.ylabel("Frequency (kHz)") + plt.title("Spectrogram") + + +def plot_waveform(y, sr, duration): + plt.subplot(3, 1, 2) + librosa.display.waveshow(y, sr=sr) + plt.xlabel("Time (s)") + plt.ylabel("Amplitude") + plt.title("Waveform") + + +def plot_features(times, cent, bw, rolloff, duration): + plt.subplot(3, 1, 3) + plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b") + plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g") + plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r") + plt.xlabel("Time (s)") + plt.title("Spectral Features") + plt.legend() + + +def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"): + y, sr = librosa.load(audio_file) + stft, duration, cent, bw, rolloff = calculate_features(y, sr) + + plt.figure(figsize=(12, 10)) + + plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1]) + plot_spectrogram(y, sr, stft, duration) + plot_waveform(y, sr, duration) + plot_features(librosa.times_like(cent), cent, bw, rolloff, duration) + + plt.tight_layout() + + if save_plot_path: + plt.savefig(save_plot_path, bbox_inches="tight", dpi=300) + plt.close() + + audio_info = f"""Sample Rate: {sr}\nDuration: {( + str(round(duration, 2)) + " seconds" + if duration < 60 + else str(round(duration / 60, 2)) + " minutes" + )}\nNumber of Samples: {len(y)}\nBits per Sample: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}""" + + return audio_info, save_plot_path diff --git a/programs/applio_code/rvc/lib/tools/gdown.py b/programs/applio_code/rvc/lib/tools/gdown.py new file mode 100644 index 0000000000000000000000000000000000000000..eb5ca071e52d3d48c58708ee2fbbeefb205827d3 --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/gdown.py @@ -0,0 +1,354 @@ +import os +import re +import six +import sys +import json +import tqdm +import time +import shutil +import warnings +import tempfile +import textwrap +import requests +from six.moves import urllib_parse + + +def indent(text, prefix): + """Indent 
each non-empty line of text with the given prefix.""" + return "".join( + (prefix + line if line.strip() else line) for line in text.splitlines(True) + ) + + +class FileURLRetrievalError(Exception): + pass + + +class FolderContentsMaximumLimitError(Exception): + pass + + +def parse_url(url, warning=True): + """Parse URLs especially for Google Drive links. + + Args: + url: URL to parse. + warning: Whether to warn if the URL is not a download link. + + Returns: + A tuple (file_id, is_download_link), where file_id is the ID of the + file on Google Drive, and is_download_link is a flag indicating + whether the URL is a download link. + """ + parsed = urllib_parse.urlparse(url) + query = urllib_parse.parse_qs(parsed.query) + is_gdrive = parsed.hostname in ("drive.google.com", "docs.google.com") + is_download_link = parsed.path.endswith("/uc") + + if not is_gdrive: + return None, is_download_link + + file_id = query.get("id", [None])[0] + if file_id is None: + for pattern in ( + r"^/file/d/(.*?)/(edit|view)$", + r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$", + r"^/document/d/(.*?)/(edit|htmlview|view)$", + r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", + r"^/presentation/d/(.*?)/(edit|htmlview|view)$", + r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", + r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$", + r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$", + ): + match = re.match(pattern, parsed.path) + if match: + file_id = match.group(1) + break + + if warning and not is_download_link: + warnings.warn( + "You specified a Google Drive link that is not the correct link " + "to download a file. You might want to try `--fuzzy` option " + f"or the following url: https://drive.google.com/uc?id={file_id}" + ) + + return file_id, is_download_link + + +CHUNK_SIZE = 512 * 1024 # 512KB +HOME = os.path.expanduser("~") + + +def get_url_from_gdrive_confirmation(contents): + """Extract the download URL from a Google Drive confirmation page.""" + for pattern in ( + r'href="(\/uc\?export=download[^"]+)', + r'href="/open\?id=([^"]+)"', + r'"downloadUrl":"([^"]+)', + ): + match = re.search(pattern, contents) + if match: + url = match.group(1) + if pattern == r'href="/open\?id=([^"]+)"': + uuid = re.search( + r'(.*)

', contents) + if match: + error = match.group(1) + raise FileURLRetrievalError(error) + + raise FileURLRetrievalError( + "Cannot retrieve the public link of the file. " + "You may need to change the permission to " + "'Anyone with the link', or have had many accesses." + ) + + +def _get_session(proxy, use_cookies, return_cookies_file=False): + """Create a requests session with optional proxy and cookie handling.""" + sess = requests.session() + sess.headers.update( + {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"} + ) + + if proxy is not None: + sess.proxies = {"http": proxy, "https": proxy} + print("Using proxy:", proxy, file=sys.stderr) + + cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json") + if os.path.exists(cookies_file) and use_cookies: + with open(cookies_file) as f: + cookies = json.load(f) + for k, v in cookies: + sess.cookies[k] = v + + return (sess, cookies_file) if return_cookies_file else sess + + +def download( + url=None, + output=None, + quiet=False, + proxy=None, + speed=None, + use_cookies=True, + verify=True, + id=None, + fuzzy=True, + resume=False, + format=None, +): + """Download file from URL. + + Parameters + ---------- + url: str + URL. Google Drive URL is also supported. + output: str + Output filename. Default is basename of URL. + quiet: bool + Suppress terminal output. Default is False. + proxy: str + Proxy. + speed: float + Download byte size per second (e.g., 256KB/s = 256 * 1024). + use_cookies: bool + Flag to use cookies. Default is True. + verify: bool or string + Either a bool, in which case it controls whether the server's TLS + certificate is verified, or a string, in which case it must be a path + to a CA bundle to use. Default is True. + id: str + Google Drive's file ID. + fuzzy: bool + Fuzzy extraction of Google Drive's file Id. Default is False. + resume: bool + Resume the download from existing tmp file if possible. + Default is False. + format: str, optional + Format of Google Docs, Spreadsheets and Slides. Default is: + - Google Docs: 'docx' + - Google Spreadsheet: 'xlsx' + - Google Slides: 'pptx' + + Returns + ------- + output: str + Output filename. + """ + if not (id is None) ^ (url is None): + raise ValueError("Either url or id has to be specified") + if id is not None: + url = f"https://drive.google.com/uc?id={id}" + + url_origin = url + + sess, cookies_file = _get_session( + proxy=proxy, use_cookies=use_cookies, return_cookies_file=True + ) + + gdrive_file_id, is_gdrive_download_link = parse_url(url, warning=not fuzzy) + + if fuzzy and gdrive_file_id: + # overwrite the url with fuzzy match of a file id + url = f"https://drive.google.com/uc?id={gdrive_file_id}" + url_origin = url + is_gdrive_download_link = True + + while True: + res = sess.get(url, stream=True, verify=verify) + + if url == url_origin and res.status_code == 500: + # The file could be Google Docs or Spreadsheets. 
+ url = f"https://drive.google.com/open?id={gdrive_file_id}" + continue + + if res.headers["Content-Type"].startswith("text/html"): + title = re.search("(.+)", res.text) + if title: + title = title.group(1) + if title.endswith(" - Google Docs"): + url = f"https://docs.google.com/document/d/{gdrive_file_id}/export?format={'docx' if format is None else format}" + continue + if title.endswith(" - Google Sheets"): + url = f"https://docs.google.com/spreadsheets/d/{gdrive_file_id}/export?format={'xlsx' if format is None else format}" + continue + if title.endswith(" - Google Slides"): + url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}" + continue + elif ( + "Content-Disposition" in res.headers + and res.headers["Content-Disposition"].endswith("pptx") + and format not in (None, "pptx") + ): + url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}" + continue + + if use_cookies: + os.makedirs(os.path.dirname(cookies_file), exist_ok=True) + with open(cookies_file, "w") as f: + cookies = [ + (k, v) + for k, v in sess.cookies.items() + if not k.startswith("download_warning_") + ] + json.dump(cookies, f, indent=2) + + if "Content-Disposition" in res.headers: + # This is the file + break + if not (gdrive_file_id and is_gdrive_download_link): + break + + # Need to redirect with confirmation + try: + url = get_url_from_gdrive_confirmation(res.text) + except FileURLRetrievalError as e: + message = ( + "Failed to retrieve file url:\n\n" + "{}\n\n" + "You may still be able to access the file from the browser:" + f"\n\n\t{url_origin}\n\n" + "but Gdown can't. Please check connections and permissions." + ).format(indent("\n".join(textwrap.wrap(str(e))), prefix="\t")) + raise FileURLRetrievalError(message) + + if gdrive_file_id and is_gdrive_download_link: + content_disposition = urllib_parse.unquote(res.headers["Content-Disposition"]) + filename_from_url = ( + re.search(r"filename\*=UTF-8''(.*)", content_disposition) + or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition) + ).group(1) + filename_from_url = filename_from_url.replace(os.path.sep, "_") + else: + filename_from_url = os.path.basename(url) + + output = output or filename_from_url + + output_is_path = isinstance(output, six.string_types) + if output_is_path and output.endswith(os.path.sep): + os.makedirs(output, exist_ok=True) + output = os.path.join(output, filename_from_url) + + if output_is_path: + temp_dir = os.path.dirname(output) or "." 
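
(Illustrative aside, not part of the patch: how this download helper is invoked from model_download.py further below; the Drive file id is a placeholder.)

from programs.applio_code.rvc.lib.tools import gdown

saved_path = gdown.download(
    "https://drive.google.com/uc?id=FILE_ID",  # placeholder file id
    quiet=True,
    fuzzy=True,
)
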
+ prefix = os.path.basename(output) + existing_tmp_files = [ + os.path.join(temp_dir, file) + for file in os.listdir(temp_dir) + if file.startswith(prefix) + ] + if resume and existing_tmp_files: + if len(existing_tmp_files) > 1: + print( + "There are multiple temporary files to resume:", + file=sys.stderr, + ) + for file in existing_tmp_files: + print(f"\t{file}", file=sys.stderr) + print( + "Please remove them except one to resume downloading.", + file=sys.stderr, + ) + return + tmp_file = existing_tmp_files[0] + else: + resume = False + tmp_file = tempfile.mktemp( + suffix=tempfile.template, prefix=prefix, dir=temp_dir + ) + f = open(tmp_file, "ab") + else: + tmp_file = None + f = output + + if tmp_file is not None and f.tell() != 0: + headers = {"Range": f"bytes={f.tell()}-"} + res = sess.get(url, headers=headers, stream=True, verify=verify) + + if not quiet: + if resume: + print("Resume:", tmp_file, file=sys.stderr) + print( + "To:", + os.path.abspath(output) if output_is_path else output, + file=sys.stderr, + ) + + try: + total = int(res.headers.get("Content-Length", 0)) + if not quiet: + pbar = tqdm.tqdm(total=total, unit="B", unit_scale=True) + t_start = time.time() + for chunk in res.iter_content(chunk_size=CHUNK_SIZE): + f.write(chunk) + if not quiet: + pbar.update(len(chunk)) + if speed is not None: + elapsed_time_expected = 1.0 * pbar.n / speed + elapsed_time = time.time() - t_start + if elapsed_time < elapsed_time_expected: + time.sleep(elapsed_time_expected - elapsed_time) + if not quiet: + pbar.close() + if tmp_file: + f.close() + shutil.move(tmp_file, output) + finally: + sess.close() + + return output diff --git a/programs/applio_code/rvc/lib/tools/launch_tensorboard.py b/programs/applio_code/rvc/lib/tools/launch_tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..7f74e316762b737037f7b8e4448a1042553d5651 --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/launch_tensorboard.py @@ -0,0 +1,21 @@ +import time +import logging +from tensorboard import program + +log_path = "logs" + + +def launch_tensorboard_pipeline(): + logging.getLogger("root").setLevel(logging.WARNING) + logging.getLogger("tensorboard").setLevel(logging.WARNING) + + tb = program.TensorBoard() + tb.configure(argv=[None, "--logdir", log_path]) + url = tb.launch() + + print( + f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D" + ) + + while True: + time.sleep(600) diff --git a/programs/applio_code/rvc/lib/tools/model_download.py b/programs/applio_code/rvc/lib/tools/model_download.py new file mode 100644 index 0000000000000000000000000000000000000000..6a72cbf14b87e786f04186c846f032eb0660343b --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/model_download.py @@ -0,0 +1,385 @@ +import os +import re +import six +import sys +import wget +import shutil +import zipfile +import requests +from bs4 import BeautifulSoup +from urllib.parse import unquote, urlencode, parse_qs, urlparse + +now_dir = os.getcwd() +sys.path.append(now_dir) + +from programs.applio_code.rvc.lib.utils import format_title +from programs.applio_code.rvc.lib.tools import gdown + + +def find_folder_parent(search_dir, folder_name): + for dirpath, dirnames, _ in os.walk(search_dir): + if folder_name in 
dirnames: + return os.path.abspath(dirpath) + return None + + +file_path = find_folder_parent(now_dir, "logs") +zips_path = os.path.join(file_path, "zips") + + +def search_pth_index(folder): + pth_paths = [ + os.path.join(folder, file) + for file in os.listdir(folder) + if os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth") + ] + index_paths = [ + os.path.join(folder, file) + for file in os.listdir(folder) + if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index") + ] + + return pth_paths, index_paths + + +def get_mediafire_download_link(url): + response = requests.get(url) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + download_button = soup.find( + "a", {"class": "input popsok", "aria-label": "Download file"} + ) + if download_button: + download_link = download_button.get("href") + return download_link + else: + return None + + +def download_from_url(url): + os.makedirs(zips_path, exist_ok=True) + if url != "": + if "drive.google.com" in url: + if "file/d/" in url: + file_id = url.split("file/d/")[1].split("/")[0] + elif "id=" in url: + file_id = url.split("id=")[1].split("&")[0] + else: + return None + + if file_id: + os.chdir(zips_path) + try: + gdown.download( + f"https://drive.google.com/uc?id={file_id}", + quiet=True, + fuzzy=True, + ) + except Exception as error: + error_message = str( + f"An error occurred downloading the file: {error}" + ) + if ( + "Too many users have viewed or downloaded this file recently" + in error_message + ): + os.chdir(now_dir) + return "too much use" + elif ( + "Cannot retrieve the public link of the file." in error_message + ): + os.chdir(now_dir) + return "private link" + else: + print(error_message) + os.chdir(now_dir) + return None + elif "disk.yandex.ru" in url: + base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?" 
+ public_key = url + final_url = base_url + urlencode(dict(public_key=public_key)) + response = requests.get(final_url) + download_url = response.json()["href"] + download_response = requests.get(download_url) + + if download_response.status_code == 200: + filename = parse_qs(urlparse(unquote(download_url)).query).get( + "filename", [""] + )[0] + if filename: + os.chdir(zips_path) + with open(filename, "wb") as f: + f.write(download_response.content) + else: + print("Failed to get filename from URL.") + return None + + elif "pixeldrain.com" in url: + try: + file_id = url.split("pixeldrain.com/u/")[1] + os.chdir(zips_path) + print(file_id) + response = requests.get(f"https://pixeldrain.com/api/file/{file_id}") + if response.status_code == 200: + file_name = ( + response.headers.get("Content-Disposition") + .split("filename=")[-1] + .strip('";') + ) + os.makedirs(zips_path, exist_ok=True) + with open(os.path.join(zips_path, file_name), "wb") as newfile: + newfile.write(response.content) + os.chdir(file_path) + return "downloaded" + else: + os.chdir(file_path) + return None + except Exception as error: + print(f"An error occurred downloading the file: {error}") + os.chdir(file_path) + return None + + elif "cdn.discordapp.com" in url: + file = requests.get(url) + os.chdir(zips_path) + if file.status_code == 200: + name = url.split("/") + with open(os.path.join(name[-1]), "wb") as newfile: + newfile.write(file.content) + else: + return None + elif "/blob/" in url or "/resolve/" in url: + os.chdir(zips_path) + if "/blob/" in url: + url = url.replace("/blob/", "/resolve/") + + response = requests.get(url, stream=True) + if response.status_code == 200: + content_disposition = six.moves.urllib_parse.unquote( + response.headers["Content-Disposition"] + ) + m = re.search(r'filename="([^"]+)"', content_disposition) + file_name = m.groups()[0] + file_name = file_name.replace(os.path.sep, "_") + total_size_in_bytes = int(response.headers.get("content-length", 0)) + block_size = 1024 + progress_bar_length = 50 + progress = 0 + + with open(os.path.join(zips_path, file_name), "wb") as file: + for data in response.iter_content(block_size): + file.write(data) + progress += len(data) + progress_percent = int((progress / total_size_in_bytes) * 100) + num_dots = int( + (progress / total_size_in_bytes) * progress_bar_length + ) + progress_bar = ( + "[" + + "." 
* num_dots + + " " * (progress_bar_length - num_dots) + + "]" + ) + print( + f"{progress_percent}% {progress_bar} {progress}/{total_size_in_bytes} ", + end="\r", + ) + if progress_percent == 100: + print("\n") + + else: + os.chdir(now_dir) + return None + elif "/tree/main" in url: + os.chdir(zips_path) + response = requests.get(url) + soup = BeautifulSoup(response.content, "html.parser") + temp_url = "" + for link in soup.find_all("a", href=True): + if link["href"].endswith(".zip"): + temp_url = link["href"] + break + if temp_url: + url = temp_url + url = url.replace("blob", "resolve") + if "huggingface.co" not in url: + url = "https://huggingface.co" + url + + wget.download(url) + else: + os.chdir(now_dir) + return None + elif "applio.org" in url: + parts = url.split("/") + id_with_query = parts[-1] + id_parts = id_with_query.split("?") + id_number = id_parts[0] + + url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models" + headers = { + "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10" + } + + params = {"id": f"eq.{id_number}"} + response = requests.get(url, headers=headers, params=params) + if response.status_code == 200: + json_response = response.json() + print(json_response) + if json_response: + link = json_response[0]["link"] + verify = download_from_url(link) + if verify == "downloaded": + return "downloaded" + else: + return None + else: + return None + else: + try: + os.chdir(zips_path) + wget.download(url) + except Exception as error: + os.chdir(now_dir) + print(f"An error occurred downloading the file: {error}") + return None + + for currentPath, _, zipFiles in os.walk(zips_path): + for Files in zipFiles: + filePart = Files.split(".") + extensionFile = filePart[len(filePart) - 1] + filePart.pop() + nameFile = "_".join(filePart) + realPath = os.path.join(currentPath, Files) + os.rename(realPath, nameFile + "." 
+ extensionFile) + + os.chdir(now_dir) + return "downloaded" + + os.chdir(now_dir) + return None + + +def extract_and_show_progress(zipfile_path, unzips_path): + try: + with zipfile.ZipFile(zipfile_path, "r") as zip_ref: + for file_info in zip_ref.infolist(): + zip_ref.extract(file_info, unzips_path) + os.remove(zipfile_path) + return True + except Exception as error: + print(f"An error occurred extracting the zip file: {error}") + return False + + +def unzip_file(zip_path, zip_file_name): + zip_file_path = os.path.join(zip_path, zip_file_name + ".zip") + extract_path = os.path.join(file_path, zip_file_name) + with zipfile.ZipFile(zip_file_path, "r") as zip_ref: + zip_ref.extractall(extract_path) + os.remove(zip_file_path) + + +def model_download_pipeline(url: str): + try: + verify = download_from_url(url) + if verify == "downloaded": + extract_folder_path = "" + for filename in os.listdir(zips_path): + if filename.endswith(".zip"): + zipfile_path = os.path.join(zips_path, filename) + print("Proceeding with the extraction...") + + model_zip = os.path.basename(zipfile_path) + model_name = format_title(model_zip.split(".zip")[0]) + extract_folder_path = os.path.join( + "logs", + os.path.normpath(model_name), + ) + success = extract_and_show_progress( + zipfile_path, extract_folder_path + ) + + macosx_path = os.path.join(extract_folder_path, "__MACOSX") + if os.path.exists(macosx_path): + shutil.rmtree(macosx_path) + + subfolders = [ + f + for f in os.listdir(extract_folder_path) + if os.path.isdir(os.path.join(extract_folder_path, f)) + ] + if len(subfolders) == 1: + subfolder_path = os.path.join( + extract_folder_path, subfolders[0] + ) + for item in os.listdir(subfolder_path): + s = os.path.join(subfolder_path, item) + d = os.path.join(extract_folder_path, item) + shutil.move(s, d) + os.rmdir(subfolder_path) + + for item in os.listdir(extract_folder_path): + if ".pth" in item: + file_name = item.split(".pth")[0] + if file_name != model_name: + os.rename( + os.path.join(extract_folder_path, item), + os.path.join( + extract_folder_path, model_name + ".pth" + ), + ) + else: + if "v2" not in item: + if "_nprobe_1_" in item and "_v1" in item: + file_name = item.split("_nprobe_1_")[1].split( + "_v1" + )[0] + if file_name != model_name: + new_file_name = ( + item.split("_nprobe_1_")[0] + + "_nprobe_1_" + + model_name + + "_v1" + ) + os.rename( + os.path.join(extract_folder_path, item), + os.path.join( + extract_folder_path, + new_file_name + ".index", + ), + ) + else: + if "_nprobe_1_" in item and "_v2" in item: + file_name = item.split("_nprobe_1_")[1].split( + "_v2" + )[0] + if file_name != model_name: + new_file_name = ( + item.split("_nprobe_1_")[0] + + "_nprobe_1_" + + model_name + + "_v2" + ) + os.rename( + os.path.join(extract_folder_path, item), + os.path.join( + extract_folder_path, + new_file_name + ".index", + ), + ) + + if success: + print(f"Model {model_name} downloaded!") + else: + print(f"Error downloading {model_name}") + return "Error" + if extract_folder_path == "": + print("Zip file was not found.") + return "Error" + result = search_pth_index(extract_folder_path) + return result + else: + return "Error" + except Exception as error: + print(f"An unexpected error occurred: {error}") + return "Error" diff --git a/programs/applio_code/rvc/lib/tools/prerequisites_download.py b/programs/applio_code/rvc/lib/tools/prerequisites_download.py new file mode 100644 index 0000000000000000000000000000000000000000..96587d1a5dc464e317a4b25c5115278c30640bfb --- /dev/null +++ 
b/programs/applio_code/rvc/lib/tools/prerequisites_download.py @@ -0,0 +1,164 @@ +import os +from concurrent.futures import ThreadPoolExecutor +from tqdm import tqdm +import requests + +url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources" + +pretraineds_v1_list = [ + ( + "pretrained_v1/", + [ + "D32k.pth", + "D40k.pth", + "D48k.pth", + "G32k.pth", + "G40k.pth", + "G48k.pth", + "f0D32k.pth", + "f0D40k.pth", + "f0D48k.pth", + "f0G32k.pth", + "f0G40k.pth", + "f0G48k.pth", + ], + ) +] +pretraineds_v2_list = [ + ( + "pretrained_v2/", + [ + "D32k.pth", + "D40k.pth", + "D48k.pth", + "G32k.pth", + "G40k.pth", + "G48k.pth", + "f0D32k.pth", + "f0D40k.pth", + "f0D48k.pth", + "f0G32k.pth", + "f0G40k.pth", + "f0G48k.pth", + ], + ) +] +models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])] +embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])] +linux_executables_list = [("formant/", ["stftpitchshift"])] +executables_list = [ + ("", ["ffmpeg.exe", "ffprobe.exe"]), + ("formant/", ["stftpitchshift.exe"]), +] + +folder_mapping_list = { + "pretrained_v1/": "programs/applio_code/rvc/models/pretraineds/pretrained_v1/", + "pretrained_v2/": "programs/applio_code/rvc/models/pretraineds/pretrained_v2/", + "embedders/contentvec/": "programs/applio_code/rvc/models/embedders/contentvec/", + "predictors/": "programs/applio_code/rvc/models/predictors/", + "formant/": "programs/applio_code/rvc/models/formant/", +} + + +def get_file_size_if_missing(file_list): + """ + Calculate the total size of files to be downloaded only if they do not exist locally. + """ + total_size = 0 + for remote_folder, files in file_list: + local_folder = folder_mapping_list.get(remote_folder, "") + for file in files: + destination_path = os.path.join(local_folder, file) + if not os.path.exists(destination_path): + url = f"{url_base}/{remote_folder}{file}" + response = requests.head(url) + total_size += int(response.headers.get("content-length", 0)) + return total_size + + +def download_file(url, destination_path, global_bar): + """ + Download a file from the given URL to the specified destination path, + updating the global progress bar as data is downloaded. + """ + + dir_name = os.path.dirname(destination_path) + if dir_name: + os.makedirs(dir_name, exist_ok=True) + response = requests.get(url, stream=True) + block_size = 1024 + with open(destination_path, "wb") as file: + for data in response.iter_content(block_size): + file.write(data) + global_bar.update(len(data)) + + +def download_mapping_files(file_mapping_list, global_bar): + """ + Download all files in the provided file mapping list using a thread pool executor, + and update the global progress bar as downloads progress. + """ + with ThreadPoolExecutor() as executor: + futures = [] + for remote_folder, file_list in file_mapping_list: + local_folder = folder_mapping_list.get(remote_folder, "") + for file in file_list: + destination_path = os.path.join(local_folder, file) + if not os.path.exists(destination_path): + url = f"{url_base}/{remote_folder}{file}" + futures.append( + executor.submit( + download_file, url, destination_path, global_bar + ) + ) + for future in futures: + future.result() + + +def calculate_total_size(pretraineds_v1, pretraineds_v2, models, exe): + """ + Calculate the total size of all files to be downloaded based on selected categories. 
+ """ + total_size = 0 + if models: + total_size += get_file_size_if_missing(models_list) + total_size += get_file_size_if_missing(embedders_list) + if exe: + total_size += get_file_size_if_missing( + executables_list if os.name == "nt" else linux_executables_list + ) + if pretraineds_v1: + total_size += get_file_size_if_missing(pretraineds_v1_list) + if pretraineds_v2: + total_size += get_file_size_if_missing(pretraineds_v2_list) + return total_size + + +def prequisites_download_pipeline(pretraineds_v1, pretraineds_v2, models, exe): + """ + Manage the download pipeline for different categories of files. + """ + total_size = calculate_total_size(pretraineds_v1, pretraineds_v2, models, exe) + + if total_size > 0: + with tqdm( + total=total_size, unit="iB", unit_scale=True, desc="Downloading all files" + ) as global_bar: + if models: + download_mapping_files(models_list, global_bar) + download_mapping_files(embedders_list, global_bar) + if exe: + download_mapping_files( + executables_list if os.name == "nt" else linux_executables_list, + global_bar, + ) + if pretraineds_v1: + download_mapping_files(pretraineds_v1_list, global_bar) + if pretraineds_v2: + download_mapping_files(pretraineds_v2_list, global_bar) + else: + pass + + +if __name__ == "__main__": + prequisites_download_pipeline(False, False, True, False) diff --git a/programs/applio_code/rvc/lib/tools/pretrained_selector.py b/programs/applio_code/rvc/lib/tools/pretrained_selector.py new file mode 100644 index 0000000000000000000000000000000000000000..e982fac5078df43b3477f3e3d478a76ff85488c5 --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/pretrained_selector.py @@ -0,0 +1,63 @@ +def pretrained_selector(pitch_guidance): + if pitch_guidance == True: + return { + "v1": { + 32000: ( + "rvc/models/pretraineds/pretrained_v1/f0G32k.pth", + "rvc/models/pretraineds/pretrained_v1/f0D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v1/f0G40k.pth", + "rvc/models/pretraineds/pretrained_v1/f0D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v1/f0G48k.pth", + "rvc/models/pretraineds/pretrained_v1/f0D48k.pth", + ), + }, + "v2": { + 32000: ( + "rvc/models/pretraineds/pretrained_v2/f0G32k.pth", + "rvc/models/pretraineds/pretrained_v2/f0D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v2/f0G40k.pth", + "rvc/models/pretraineds/pretrained_v2/f0D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v2/f0G48k.pth", + "rvc/models/pretraineds/pretrained_v2/f0D48k.pth", + ), + }, + } + elif pitch_guidance == False: + return { + "v1": { + 32000: ( + "rvc/models/pretraineds/pretrained_v1/G32k.pth", + "rvc/models/pretraineds/pretrained_v1/D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v1/G40k.pth", + "rvc/models/pretraineds/pretrained_v1/D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v1/G48k.pth", + "rvc/models/pretraineds/pretrained_v1/D48k.pth", + ), + }, + "v2": { + 32000: ( + "rvc/models/pretraineds/pretrained_v2/G32k.pth", + "rvc/models/pretraineds/pretrained_v2/D32k.pth", + ), + 40000: ( + "rvc/models/pretraineds/pretrained_v2/G40k.pth", + "rvc/models/pretraineds/pretrained_v2/D40k.pth", + ), + 48000: ( + "rvc/models/pretraineds/pretrained_v2/G48k.pth", + "rvc/models/pretraineds/pretrained_v2/D48k.pth", + ), + }, + } diff --git a/programs/applio_code/rvc/lib/tools/split_audio.py b/programs/applio_code/rvc/lib/tools/split_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b2a9f44f43bb7439a186cf20ea5f66f324de9a --- /dev/null +++ 
b/programs/applio_code/rvc/lib/tools/split_audio.py @@ -0,0 +1,107 @@ +from pydub.silence import detect_nonsilent +from pydub import AudioSegment +import numpy as np +import re +import os + +from programs.applio_code.rvc.lib.utils import format_title + + +def process_audio(file_path): + try: + # load audio file + song = AudioSegment.from_file(file_path) + + # set silence threshold and duration + silence_thresh = -70 # dB + min_silence_len = 750 # ms, adjust as needed + + # detect nonsilent parts + nonsilent_parts = detect_nonsilent( + song, min_silence_len=min_silence_len, silence_thresh=silence_thresh + ) + + # Create a new directory to store chunks + file_dir = os.path.dirname(file_path) + file_name = os.path.basename(file_path).split(".")[0] + file_name = format_title(file_name) + new_dir_path = os.path.join(file_dir, file_name) + os.makedirs(new_dir_path, exist_ok=True) + + # Check if timestamps file exists, if so delete it + timestamps_file = os.path.join(file_dir, f"{file_name}_timestamps.txt") + if os.path.isfile(timestamps_file): + os.remove(timestamps_file) + + # export chunks and save start times + segment_count = 0 + for i, (start_i, end_i) in enumerate(nonsilent_parts): + chunk = song[start_i:end_i] + chunk_file_path = os.path.join(new_dir_path, f"chunk{i}.wav") + chunk.export(chunk_file_path, format="wav") + + print(f"Segment {i} created!") + segment_count += 1 + + # write start times to file + with open(timestamps_file, "a", encoding="utf-8") as f: + f.write(f"{chunk_file_path} starts at {start_i} ms\n") + + print(f"Total segments created: {segment_count}") + print(f"Split all chunks for {file_path} successfully!") + + return "Finish", new_dir_path + + except Exception as error: + print(f"An error occurred splitting the audio: {error}") + return "Error", None + + +def merge_audio(timestamps_file): + try: + # Extract prefix from the timestamps filename + prefix = os.path.basename(timestamps_file).replace("_timestamps.txt", "") + timestamps_dir = os.path.dirname(timestamps_file) + + # Open the timestamps file + with open(timestamps_file, "r", encoding="utf-8") as f: + lines = f.readlines() + + # Initialize empty list to hold audio segments + audio_segments = [] + last_end_time = 0 + + print(f"Processing file: {timestamps_file}") + + for line in lines: + # Extract filename and start time from line + match = re.search(r"(chunk\d+\.wav) starts at (\d+) ms", line) + if match: + filename, start_time = match.groups() + start_time = int(start_time) + + # Construct the complete path to the chunk file + chunk_file = os.path.join(timestamps_dir, prefix, filename) + + # Add silence from last_end_time to start_time + silence_duration = max(start_time - last_end_time, 0) + silence = AudioSegment.silent(duration=silence_duration) + audio_segments.append(silence) + + # Load audio file and append to list + audio = AudioSegment.from_wav(chunk_file) + audio_segments.append(audio) + + # Update last_end_time + last_end_time = start_time + len(audio) + + print(f"Processed chunk: {chunk_file}") + + # Concatenate all audio_segments and export + merged_audio = sum(audio_segments) + merged_audio_np = np.array(merged_audio.get_array_of_samples()) + # print(f"Exported merged file: {merged_filename}\n") + return merged_audio.frame_rate, merged_audio_np + + except Exception as error: + print(f"An error occurred merging the audio: {error}") diff --git a/programs/applio_code/rvc/lib/tools/tts.py b/programs/applio_code/rvc/lib/tools/tts.py new file mode 100644 index 
0000000000000000000000000000000000000000..a9994dbd0db708581a6fc1fc53a8b24261711d69 --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/tts.py @@ -0,0 +1,20 @@ +import sys +import asyncio +import edge_tts + + +async def main(): + # Parse command line arguments + text = str(sys.argv[1]) + voice = str(sys.argv[2]) + rate = int(sys.argv[3]) + output_file = str(sys.argv[4]) + + rates = f"+{rate}%" if rate >= 0 else f"{rate}%" + + await edge_tts.Communicate(text, voice, rate=rates).save(output_file) + print(f"TTS with {voice} completed. Output TTS file: '{output_file}'") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/programs/applio_code/rvc/lib/tools/tts_voices.json b/programs/applio_code/rvc/lib/tools/tts_voices.json new file mode 100644 index 0000000000000000000000000000000000000000..b76cf447ccfacff86e844360caeac6c8e0b27e95 --- /dev/null +++ b/programs/applio_code/rvc/lib/tools/tts_voices.json @@ -0,0 +1,5748 @@ +[ + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, AdriNeural)", + "ShortName": "af-ZA-AdriNeural", + "Gender": "Female", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Adri Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (af-ZA, WillemNeural)", + "ShortName": "af-ZA-WillemNeural", + "Gender": "Male", + "Locale": "af-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Willem Online (Natural) - Afrikaans (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, AnilaNeural)", + "ShortName": "sq-AL-AnilaNeural", + "Gender": "Female", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anila Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sq-AL, IlirNeural)", + "ShortName": "sq-AL-IlirNeural", + "Gender": "Male", + "Locale": "sq-AL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ilir Online (Natural) - Albanian (Albania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, AmehaNeural)", + "ShortName": "am-ET-AmehaNeural", + "Gender": "Male", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ameha Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (am-ET, MekdesNeural)", + "ShortName": "am-ET-MekdesNeural", + "Gender": "Female", + "Locale": "am-ET", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mekdes Online (Natural) - Amharic (Ethiopia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } 
+ }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, AminaNeural)", + "ShortName": "ar-DZ-AminaNeural", + "Gender": "Female", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amina Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-DZ, IsmaelNeural)", + "ShortName": "ar-DZ-IsmaelNeural", + "Gender": "Male", + "Locale": "ar-DZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ismael Online (Natural) - Arabic (Algeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, AliNeural)", + "ShortName": "ar-BH-AliNeural", + "Gender": "Male", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ali Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-BH, LailaNeural)", + "ShortName": "ar-BH-LailaNeural", + "Gender": "Female", + "Locale": "ar-BH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laila Online (Natural) - Arabic (Bahrain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, SalmaNeural)", + "ShortName": "ar-EG-SalmaNeural", + "Gender": "Female", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salma Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-EG, ShakirNeural)", + "ShortName": "ar-EG-ShakirNeural", + "Gender": "Male", + "Locale": "ar-EG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shakir Online (Natural) - Arabic (Egypt)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, BasselNeural)", + "ShortName": "ar-IQ-BasselNeural", + "Gender": "Male", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bassel Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-IQ, RanaNeural)", + "ShortName": "ar-IQ-RanaNeural", + "Gender": "Female", + "Locale": "ar-IQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rana Online (Natural) - Arabic (Iraq)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, SanaNeural)", + "ShortName": 
"ar-JO-SanaNeural", + "Gender": "Female", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sana Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-JO, TaimNeural)", + "ShortName": "ar-JO-TaimNeural", + "Gender": "Male", + "Locale": "ar-JO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Taim Online (Natural) - Arabic (Jordan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, FahedNeural)", + "ShortName": "ar-KW-FahedNeural", + "Gender": "Male", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fahed Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-KW, NouraNeural)", + "ShortName": "ar-KW-NouraNeural", + "Gender": "Female", + "Locale": "ar-KW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noura Online (Natural) - Arabic (Kuwait)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, LaylaNeural)", + "ShortName": "ar-LB-LaylaNeural", + "Gender": "Female", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Layla Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LB, RamiNeural)", + "ShortName": "ar-LB-RamiNeural", + "Gender": "Male", + "Locale": "ar-LB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rami Online (Natural) - Arabic (Lebanon)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, ImanNeural)", + "ShortName": "ar-LY-ImanNeural", + "Gender": "Female", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Iman Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-LY, OmarNeural)", + "ShortName": "ar-LY-OmarNeural", + "Gender": "Male", + "Locale": "ar-LY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Omar Online (Natural) - Arabic (Libya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, JamalNeural)", + "ShortName": "ar-MA-JamalNeural", + "Gender": "Male", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Jamal Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-MA, MounaNeural)", + "ShortName": "ar-MA-MounaNeural", + "Gender": "Female", + "Locale": "ar-MA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mouna Online (Natural) - Arabic (Morocco)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AbdullahNeural)", + "ShortName": "ar-OM-AbdullahNeural", + "Gender": "Male", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abdullah Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-OM, AyshaNeural)", + "ShortName": "ar-OM-AyshaNeural", + "Gender": "Female", + "Locale": "ar-OM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aysha Online (Natural) - Arabic (Oman)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, AmalNeural)", + "ShortName": "ar-QA-AmalNeural", + "Gender": "Female", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amal Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-QA, MoazNeural)", + "ShortName": "ar-QA-MoazNeural", + "Gender": "Male", + "Locale": "ar-QA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Moaz Online (Natural) - Arabic (Qatar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, HamedNeural)", + "ShortName": "ar-SA-HamedNeural", + "Gender": "Male", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamed Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SA, ZariyahNeural)", + "ShortName": "ar-SA-ZariyahNeural", + "Gender": "Female", + "Locale": "ar-SA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zariyah Online (Natural) - Arabic (Saudi Arabia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, AmanyNeural)", + "ShortName": "ar-SY-AmanyNeural", + "Gender": "Female", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amany Online (Natural) - Arabic (Syria)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-SY, LaithNeural)", + "ShortName": "ar-SY-LaithNeural", + "Gender": "Male", + "Locale": "ar-SY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Laith Online (Natural) - Arabic (Syria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, HediNeural)", + "ShortName": "ar-TN-HediNeural", + "Gender": "Male", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hedi Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-TN, ReemNeural)", + "ShortName": "ar-TN-ReemNeural", + "Gender": "Female", + "Locale": "ar-TN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Reem Online (Natural) - Arabic (Tunisia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, FatimaNeural)", + "ShortName": "ar-AE-FatimaNeural", + "Gender": "Female", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fatima Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-AE, HamdanNeural)", + "ShortName": "ar-AE-HamdanNeural", + "Gender": "Male", + "Locale": "ar-AE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hamdan Online (Natural) - Arabic (United Arab Emirates)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, MaryamNeural)", + "ShortName": "ar-YE-MaryamNeural", + "Gender": "Female", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maryam Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ar-YE, SalehNeural)", + "ShortName": "ar-YE-SalehNeural", + "Gender": "Male", + "Locale": "ar-YE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saleh Online (Natural) - Arabic (Yemen)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BabekNeural)", + "ShortName": "az-AZ-BabekNeural", + "Gender": "Male", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Babek Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (az-AZ, BanuNeural)", + "ShortName": "az-AZ-BanuNeural", + "Gender": "Female", + "Locale": "az-AZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Banu Online (Natural) - Azerbaijani (Azerbaijan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, NabanitaNeural)", + "ShortName": "bn-BD-NabanitaNeural", + "Gender": "Female", + "Locale": "bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nabanita Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-BD, PradeepNeural)", + "ShortName": "bn-BD-PradeepNeural", + "Gender": "Male", + "Locale": "bn-BD", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pradeep Online (Natural) - Bangla (Bangladesh)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, BashkarNeural)", + "ShortName": "bn-IN-BashkarNeural", + "Gender": "Male", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bashkar Online (Natural) - Bangla (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bn-IN, TanishaaNeural)", + "ShortName": "bn-IN-TanishaaNeural", + "Gender": "Female", + "Locale": "bn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tanishaa Online (Natural) - Bengali (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, GoranNeural)", + "ShortName": "bs-BA-GoranNeural", + "Gender": "Male", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Goran Online (Natural) - Bosnian (Bosnia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bs-BA, VesnaNeural)", + "ShortName": "bs-BA-VesnaNeural", + "Gender": "Female", + "Locale": "bs-BA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vesna Online (Natural) - Bosnian (Bosnia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, BorislavNeural)", + "ShortName": "bg-BG-BorislavNeural", + "Gender": "Male", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Borislav Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] 
+ } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (bg-BG, KalinaNeural)", + "ShortName": "bg-BG-KalinaNeural", + "Gender": "Female", + "Locale": "bg-BG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kalina Online (Natural) - Bulgarian (Bulgaria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, NilarNeural)", + "ShortName": "my-MM-NilarNeural", + "Gender": "Female", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nilar Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (my-MM, ThihaNeural)", + "ShortName": "my-MM-ThihaNeural", + "Gender": "Male", + "Locale": "my-MM", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thiha Online (Natural) - Burmese (Myanmar)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, EnricNeural)", + "ShortName": "ca-ES-EnricNeural", + "Gender": "Male", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Enric Online (Natural) - Catalan (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ca-ES, JoanaNeural)", + "ShortName": "ca-ES-JoanaNeural", + "Gender": "Female", + "Locale": "ca-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joana Online (Natural) - Catalan (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuGaaiNeural)", + "ShortName": "zh-HK-HiuGaaiNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuGaai Online (Natural) - Chinese (Cantonese Traditional)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, HiuMaanNeural)", + "ShortName": "zh-HK-HiuMaanNeural", + "Gender": "Female", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HiuMaan Online (Natural) - Chinese (Hong Kong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-HK, WanLungNeural)", + "ShortName": "zh-HK-WanLungNeural", + "Gender": "Male", + "Locale": "zh-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft WanLung Online (Natural) - Chinese (Hong Kong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to 
Speech Voice (zh-CN, XiaoxiaoNeural)", + "ShortName": "zh-CN-XiaoxiaoNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoxiao Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Warm" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, XiaoyiNeural)", + "ShortName": "zh-CN-XiaoyiNeural", + "Gender": "Female", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoyi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunjianNeural)", + "ShortName": "zh-CN-YunjianNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunjian Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Sports", + " Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiNeural)", + "ShortName": "zh-CN-YunxiNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxi Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Novel" + ], + "VoicePersonalities": [ + "Lively", + "Sunshine" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunxiaNeural)", + "ShortName": "zh-CN-YunxiaNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunxia Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Novel" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN, YunyangNeural)", + "ShortName": "zh-CN-YunyangNeural", + "Gender": "Male", + "Locale": "zh-CN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yunyang Online (Natural) - Chinese (Mainland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News" + ], + "VoicePersonalities": [ + "Professional", + "Reliable" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-liaoning, XiaobeiNeural)", + "ShortName": "zh-CN-liaoning-XiaobeiNeural", + "Gender": "Female", + "Locale": "zh-CN-liaoning", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaobei Online (Natural) - Chinese (Northeastern Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Humorous" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoChenNeural)", + "ShortName": "zh-TW-HsiaoChenNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoChen Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, YunJheNeural)", + 
"ShortName": "zh-TW-YunJheNeural", + "Gender": "Male", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft YunJhe Online (Natural) - Chinese (Taiwan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-TW, HsiaoYuNeural)", + "ShortName": "zh-TW-HsiaoYuNeural", + "Gender": "Female", + "Locale": "zh-TW", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HsiaoYu Online (Natural) - Chinese (Taiwanese Mandarin)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zh-CN-shaanxi, XiaoniNeural)", + "ShortName": "zh-CN-shaanxi-XiaoniNeural", + "Gender": "Female", + "Locale": "zh-CN-shaanxi", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Xiaoni Online (Natural) - Chinese (Zhongyuan Mandarin Shaanxi)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Dialect" + ], + "VoicePersonalities": [ + "Bright" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, GabrijelaNeural)", + "ShortName": "hr-HR-GabrijelaNeural", + "Gender": "Female", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gabrijela Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hr-HR, SreckoNeural)", + "ShortName": "hr-HR-SreckoNeural", + "Gender": "Male", + "Locale": "hr-HR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Srecko Online (Natural) - Croatian (Croatia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, AntoninNeural)", + "ShortName": "cs-CZ-AntoninNeural", + "Gender": "Male", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonin Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cs-CZ, VlastaNeural)", + "ShortName": "cs-CZ-VlastaNeural", + "Gender": "Female", + "Locale": "cs-CZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Vlasta Online (Natural) - Czech (Czech)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, ChristelNeural)", + "ShortName": "da-DK-ChristelNeural", + "Gender": "Female", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christel Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (da-DK, JeppeNeural)", + "ShortName": 
"da-DK-JeppeNeural", + "Gender": "Male", + "Locale": "da-DK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jeppe Online (Natural) - Danish (Denmark)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, ArnaudNeural)", + "ShortName": "nl-BE-ArnaudNeural", + "Gender": "Male", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Arnaud Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-BE, DenaNeural)", + "ShortName": "nl-BE-DenaNeural", + "Gender": "Female", + "Locale": "nl-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dena Online (Natural) - Dutch (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, ColetteNeural)", + "ShortName": "nl-NL-ColetteNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colette Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, FennaNeural)", + "ShortName": "nl-NL-FennaNeural", + "Gender": "Female", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fenna Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nl-NL, MaartenNeural)", + "ShortName": "nl-NL-MaartenNeural", + "Gender": "Male", + "Locale": "nl-NL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maarten Online (Natural) - Dutch (Netherlands)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, NatashaNeural)", + "ShortName": "en-AU-NatashaNeural", + "Gender": "Female", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Natasha Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-AU, WilliamNeural)", + "ShortName": "en-AU-WilliamNeural", + "Gender": "Male", + "Locale": "en-AU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft William Online (Natural) - English (Australia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, ClaraNeural)", + "ShortName": "en-CA-ClaraNeural", + "Gender": "Female", + "Locale": "en-CA", + 
"SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Clara Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-CA, LiamNeural)", + "ShortName": "en-CA-LiamNeural", + "Gender": "Male", + "Locale": "en-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Liam Online (Natural) - English (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, SamNeural)", + "ShortName": "en-HK-SamNeural", + "Gender": "Male", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sam Online (Natural) - English (Hongkong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-HK, YanNeural)", + "ShortName": "en-HK-YanNeural", + "Gender": "Female", + "Locale": "en-HK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yan Online (Natural) - English (Hongkong)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaExpressiveNeural)", + "ShortName": "en-IN-NeerjaExpressiveNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India) (Preview)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, NeerjaNeural)", + "ShortName": "en-IN-NeerjaNeural", + "Gender": "Female", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Neerja Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IN, PrabhatNeural)", + "ShortName": "en-IN-PrabhatNeural", + "Gender": "Male", + "Locale": "en-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Prabhat Online (Natural) - English (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, ConnorNeural)", + "ShortName": "en-IE-ConnorNeural", + "Gender": "Male", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Connor Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-IE, EmilyNeural)", + "ShortName": "en-IE-EmilyNeural", + "Gender": "Female", + "Locale": "en-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": 
"Microsoft Emily Online (Natural) - English (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, AsiliaNeural)", + "ShortName": "en-KE-AsiliaNeural", + "Gender": "Female", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asilia Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-KE, ChilembaNeural)", + "ShortName": "en-KE-ChilembaNeural", + "Gender": "Male", + "Locale": "en-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chilemba Online (Natural) - English (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MitchellNeural)", + "ShortName": "en-NZ-MitchellNeural", + "Gender": "Male", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mitchell Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NZ, MollyNeural)", + "ShortName": "en-NZ-MollyNeural", + "Gender": "Female", + "Locale": "en-NZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Molly Online (Natural) - English (New Zealand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, AbeoNeural)", + "ShortName": "en-NG-AbeoNeural", + "Gender": "Male", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Abeo Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-NG, EzinneNeural)", + "ShortName": "en-NG-EzinneNeural", + "Gender": "Female", + "Locale": "en-NG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ezinne Online (Natural) - English (Nigeria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, JamesNeural)", + "ShortName": "en-PH-JamesNeural", + "Gender": "Male", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft James Online (Natural) - English (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-PH, RosaNeural)", + "ShortName": "en-PH-RosaNeural", + "Gender": "Female", + "Locale": "en-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rosa Online (Natural) - English (Philippines)", + "Status": 
"GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, LunaNeural)", + "ShortName": "en-SG-LunaNeural", + "Gender": "Female", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luna Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-SG, WayneNeural)", + "ShortName": "en-SG-WayneNeural", + "Gender": "Male", + "Locale": "en-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Wayne Online (Natural) - English (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LeahNeural)", + "ShortName": "en-ZA-LeahNeural", + "Gender": "Female", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leah Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-ZA, LukeNeural)", + "ShortName": "en-ZA-LukeNeural", + "Gender": "Male", + "Locale": "en-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luke Online (Natural) - English (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ElimuNeural)", + "ShortName": "en-TZ-ElimuNeural", + "Gender": "Male", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elimu Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-TZ, ImaniNeural)", + "ShortName": "en-TZ-ImaniNeural", + "Gender": "Female", + "Locale": "en-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Imani Online (Natural) - English (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, LibbyNeural)", + "ShortName": "en-GB-LibbyNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Libby Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, MaisieNeural)", + "ShortName": "en-GB-MaisieNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maisie Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, RyanNeural)", + "ShortName": "en-GB-RyanNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ryan Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, SoniaNeural)", + "ShortName": "en-GB-SoniaNeural", + "Gender": "Female", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sonia Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-GB, ThomasNeural)", + "ShortName": "en-GB-ThomasNeural", + "Gender": "Male", + "Locale": "en-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thomas Online (Natural) - English (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaMultilingualNeural)", + "ShortName": "en-US-AvaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AvaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewMultilingualNeural)", + "ShortName": "en-US-AndrewMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft AndrewMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", + "ShortName": "en-US-EmmaMultilingualNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft EmmaMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianMultilingualNeural)", + "ShortName": "en-US-BrianMultilingualNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft BrianMultilingual Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AvaNeural)", + "ShortName": "en-US-AvaNeural", + "Gender": 
"Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ava Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Expressive", + "Caring", + "Pleasant", + "Friendly" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AndrewNeural)", + "ShortName": "en-US-AndrewNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrew Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Warm", + "Confident", + "Authentic", + "Honest" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EmmaNeural)", + "ShortName": "en-US-EmmaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emma Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Cheerful", + "Clear", + "Conversational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, BrianNeural)", + "ShortName": "en-US-BrianNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Brian Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Conversation", + "Copilot" + ], + "VoicePersonalities": [ + "Approachable", + "Casual", + "Sincere" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AnaNeural)", + "ShortName": "en-US-AnaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ana Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "Cartoon", + "Conversation" + ], + "VoicePersonalities": [ + "Cute" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)", + "ShortName": "en-US-AriaNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aria Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Positive", + "Confident" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, ChristopherNeural)", + "ShortName": "en-US-ChristopherNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Christopher Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Reliable", + "Authority" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, EricNeural)", + "ShortName": "en-US-EricNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eric Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + 
"Name": "Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)", + "ShortName": "en-US-GuyNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Guy Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Passion" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, JennyNeural)", + "ShortName": "en-US-JennyNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jenny Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Considerate", + "Comfort" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, MichelleNeural)", + "ShortName": "en-US-MichelleNeural", + "Gender": "Female", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Michelle Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Friendly", + "Pleasant" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, RogerNeural)", + "ShortName": "en-US-RogerNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roger Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Lively" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (en-US, SteffanNeural)", + "ShortName": "en-US-SteffanNeural", + "Gender": "Male", + "Locale": "en-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Steffan Online (Natural) - English (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "News", + "Novel" + ], + "VoicePersonalities": [ + "Rational" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, AnuNeural)", + "ShortName": "et-EE-AnuNeural", + "Gender": "Female", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anu Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (et-EE, KertNeural)", + "ShortName": "et-EE-KertNeural", + "Gender": "Male", + "Locale": "et-EE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kert Online (Natural) - Estonian (Estonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fil-PH, AngeloNeural)", + "ShortName": "fil-PH-AngeloNeural", + "Gender": "Male", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Angelo Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice 
(fil-PH, BlessicaNeural)", + "ShortName": "fil-PH-BlessicaNeural", + "Gender": "Female", + "Locale": "fil-PH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Blessica Online (Natural) - Filipino (Philippines)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, HarriNeural)", + "ShortName": "fi-FI-HarriNeural", + "Gender": "Male", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Harri Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fi-FI, NooraNeural)", + "ShortName": "fi-FI-NooraNeural", + "Gender": "Female", + "Locale": "fi-FI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noora Online (Natural) - Finnish (Finland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, CharlineNeural)", + "ShortName": "fr-BE-CharlineNeural", + "Gender": "Female", + "Locale": "fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Charline Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-BE, GerardNeural)", + "ShortName": "fr-BE-GerardNeural", + "Gender": "Male", + "Locale": "fr-BE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gerard Online (Natural) - French (Belgium)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, ThierryNeural)", + "ShortName": "fr-CA-ThierryNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thierry Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, AntoineNeural)", + "ShortName": "fr-CA-AntoineNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antoine Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, JeanNeural)", + "ShortName": "fr-CA-JeanNeural", + "Gender": "Male", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jean Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CA, SylvieNeural)", + "ShortName": "fr-CA-SylvieNeural", + 
"Gender": "Female", + "Locale": "fr-CA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sylvie Online (Natural) - French (Canada)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, VivienneMultilingualNeural)", + "ShortName": "fr-FR-VivienneMultilingualNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft VivienneMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, RemyMultilingualNeural)", + "ShortName": "fr-FR-RemyMultilingualNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft RemyMultilingual Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, DeniseNeural)", + "ShortName": "fr-FR-DeniseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Denise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, EloiseNeural)", + "ShortName": "fr-FR-EloiseNeural", + "Gender": "Female", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eloise Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-FR, HenriNeural)", + "ShortName": "fr-FR-HenriNeural", + "Gender": "Male", + "Locale": "fr-FR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Henri Online (Natural) - French (France)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, ArianeNeural)", + "ShortName": "fr-CH-ArianeNeural", + "Gender": "Female", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ariane Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fr-CH, FabriceNeural)", + "ShortName": "fr-CH-FabriceNeural", + "Gender": "Male", + "Locale": "fr-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Fabrice Online (Natural) - French (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, RoiNeural)", + "ShortName": "gl-ES-RoiNeural", + "Gender": 
"Male", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roi Online (Natural) - Galician (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gl-ES, SabelaNeural)", + "ShortName": "gl-ES-SabelaNeural", + "Gender": "Female", + "Locale": "gl-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sabela Online (Natural) - Galician (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, EkaNeural)", + "ShortName": "ka-GE-EkaNeural", + "Gender": "Female", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Eka Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ka-GE, GiorgiNeural)", + "ShortName": "ka-GE-GiorgiNeural", + "Gender": "Male", + "Locale": "ka-GE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Giorgi Online (Natural) - Georgian (Georgia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, IngridNeural)", + "ShortName": "de-AT-IngridNeural", + "Gender": "Female", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ingrid Online (Natural) - German (Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-AT, JonasNeural)", + "ShortName": "de-AT-JonasNeural", + "Gender": "Male", + "Locale": "de-AT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jonas Online (Natural) - German (Austria)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, SeraphinaMultilingualNeural)", + "ShortName": "de-DE-SeraphinaMultilingualNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SeraphinaMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, FlorianMultilingualNeural)", + "ShortName": "de-DE-FlorianMultilingualNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft FlorianMultilingual Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, AmalaNeural)", + "ShortName": "de-DE-AmalaNeural", + "Gender": "Female", + 
"Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Amala Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, ConradNeural)", + "ShortName": "de-DE-ConradNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Conrad Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KatjaNeural)", + "ShortName": "de-DE-KatjaNeural", + "Gender": "Female", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Katja Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-DE, KillianNeural)", + "ShortName": "de-DE-KillianNeural", + "Gender": "Male", + "Locale": "de-DE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Killian Online (Natural) - German (Germany)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, JanNeural)", + "ShortName": "de-CH-JanNeural", + "Gender": "Male", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jan Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (de-CH, LeniNeural)", + "ShortName": "de-CH-LeniNeural", + "Gender": "Female", + "Locale": "de-CH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leni Online (Natural) - German (Switzerland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, AthinaNeural)", + "ShortName": "el-GR-AthinaNeural", + "Gender": "Female", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Athina Online (Natural) - Greek (Greece)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (el-GR, NestorasNeural)", + "ShortName": "el-GR-NestorasNeural", + "Gender": "Male", + "Locale": "el-GR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nestoras Online (Natural) - Greek (Greece)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, DhwaniNeural)", + "ShortName": "gu-IN-DhwaniNeural", + "Gender": "Female", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Dhwani Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (gu-IN, NiranjanNeural)", + "ShortName": "gu-IN-NiranjanNeural", + "Gender": "Male", + "Locale": "gu-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niranjan Online (Natural) - Gujarati (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, AvriNeural)", + "ShortName": "he-IL-AvriNeural", + "Gender": "Male", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Avri Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (he-IL, HilaNeural)", + "ShortName": "he-IL-HilaNeural", + "Gender": "Female", + "Locale": "he-IL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hila Online (Natural) - Hebrew (Israel)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, MadhurNeural)", + "ShortName": "hi-IN-MadhurNeural", + "Gender": "Male", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madhur Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hi-IN, SwaraNeural)", + "ShortName": "hi-IN-SwaraNeural", + "Gender": "Female", + "Locale": "hi-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Swara Online (Natural) - Hindi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, NoemiNeural)", + "ShortName": "hu-HU-NoemiNeural", + "Gender": "Female", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Noemi Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (hu-HU, TamasNeural)", + "ShortName": "hu-HU-TamasNeural", + "Gender": "Male", + "Locale": "hu-HU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tamas Online (Natural) - Hungarian (Hungary)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GudrunNeural)", + "ShortName": "is-IS-GudrunNeural", + "Gender": "Female", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gudrun Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + 
"VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (is-IS, GunnarNeural)", + "ShortName": "is-IS-GunnarNeural", + "Gender": "Male", + "Locale": "is-IS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gunnar Online (Natural) - Icelandic (Iceland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, ArdiNeural)", + "ShortName": "id-ID-ArdiNeural", + "Gender": "Male", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ardi Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (id-ID, GadisNeural)", + "ShortName": "id-ID-GadisNeural", + "Gender": "Female", + "Locale": "id-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gadis Online (Natural) - Indonesian (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, ColmNeural)", + "ShortName": "ga-IE-ColmNeural", + "Gender": "Male", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Colm Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ga-IE, OrlaNeural)", + "ShortName": "ga-IE-OrlaNeural", + "Gender": "Female", + "Locale": "ga-IE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Orla Online (Natural) - Irish (Ireland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, GiuseppeNeural)", + "ShortName": "it-IT-GiuseppeNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Giuseppe Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, DiegoNeural)", + "ShortName": "it-IT-DiegoNeural", + "Gender": "Male", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Diego Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, ElsaNeural)", + "ShortName": "it-IT-ElsaNeural", + "Gender": "Female", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elsa Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + 
"Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (it-IT, IsabellaNeural)", + "ShortName": "it-IT-IsabellaNeural", + "Gender": "Female", + "Locale": "it-IT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Isabella Online (Natural) - Italian (Italy)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, KeitaNeural)", + "ShortName": "ja-JP-KeitaNeural", + "Gender": "Male", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keita Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ja-JP, NanamiNeural)", + "ShortName": "ja-JP-NanamiNeural", + "Gender": "Female", + "Locale": "ja-JP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nanami Online (Natural) - Japanese (Japan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, DimasNeural)", + "ShortName": "jv-ID-DimasNeural", + "Gender": "Male", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dimas Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (jv-ID, SitiNeural)", + "ShortName": "jv-ID-SitiNeural", + "Gender": "Female", + "Locale": "jv-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Siti Online (Natural) - Javanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, GaganNeural)", + "ShortName": "kn-IN-GaganNeural", + "Gender": "Male", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gagan Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kn-IN, SapnaNeural)", + "ShortName": "kn-IN-SapnaNeural", + "Gender": "Female", + "Locale": "kn-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sapna Online (Natural) - Kannada (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (kk-KZ, AigulNeural)", + "ShortName": "kk-KZ-AigulNeural", + "Gender": "Female", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aigul Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice 
(kk-KZ, DauletNeural)", + "ShortName": "kk-KZ-DauletNeural", + "Gender": "Male", + "Locale": "kk-KZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daulet Online (Natural) - Kazakh (Kazakhstan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, PisethNeural)", + "ShortName": "km-KH-PisethNeural", + "Gender": "Male", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Piseth Online (Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (km-KH, SreymomNeural)", + "ShortName": "km-KH-SreymomNeural", + "Gender": "Female", + "Locale": "km-KH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sreymom Online (Natural) - Khmer (Cambodia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, HyunsuNeural)", + "ShortName": "ko-KR-HyunsuNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Hyunsu Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, InJoonNeural)", + "ShortName": "ko-KR-InJoonNeural", + "Gender": "Male", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft InJoon Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ko-KR, SunHiNeural)", + "ShortName": "ko-KR-SunHiNeural", + "Gender": "Female", + "Locale": "ko-KR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft SunHi Online (Natural) - Korean (Korea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, ChanthavongNeural)", + "ShortName": "lo-LA-ChanthavongNeural", + "Gender": "Male", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Chanthavong Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lo-LA, KeomanyNeural)", + "ShortName": "lo-LA-KeomanyNeural", + "Gender": "Female", + "Locale": "lo-LA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Keomany Online (Natural) - Lao (Laos)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, EveritaNeural)", + "ShortName": "lv-LV-EveritaNeural", + "Gender": 
"Female", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Everita Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lv-LV, NilsNeural)", + "ShortName": "lv-LV-NilsNeural", + "Gender": "Male", + "Locale": "lv-LV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nils Online (Natural) - Latvian (Latvia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, LeonasNeural)", + "ShortName": "lt-LT-LeonasNeural", + "Gender": "Male", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Leonas Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (lt-LT, OnaNeural)", + "ShortName": "lt-LT-OnaNeural", + "Gender": "Female", + "Locale": "lt-LT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ona Online (Natural) - Lithuanian (Lithuania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, AleksandarNeural)", + "ShortName": "mk-MK-AleksandarNeural", + "Gender": "Male", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aleksandar Online (Natural) - Macedonian (Republic of North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mk-MK, MarijaNeural)", + "ShortName": "mk-MK-MarijaNeural", + "Gender": "Female", + "Locale": "mk-MK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marija Online (Natural) - Macedonian (Republic of North Macedonia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, OsmanNeural)", + "ShortName": "ms-MY-OsmanNeural", + "Gender": "Male", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Osman Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ms-MY, YasminNeural)", + "ShortName": "ms-MY-YasminNeural", + "Gender": "Female", + "Locale": "ms-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yasmin Online (Natural) - Malay (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, MidhunNeural)", + "ShortName": "ml-IN-MidhunNeural", + "Gender": "Male", + "Locale": 
"ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Midhun Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ml-IN, SobhanaNeural)", + "ShortName": "ml-IN-SobhanaNeural", + "Gender": "Female", + "Locale": "ml-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sobhana Online (Natural) - Malayalam (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, GraceNeural)", + "ShortName": "mt-MT-GraceNeural", + "Gender": "Female", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Grace Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mt-MT, JosephNeural)", + "ShortName": "mt-MT-JosephNeural", + "Gender": "Male", + "Locale": "mt-MT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Joseph Online (Natural) - Maltese (Malta)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, AarohiNeural)", + "ShortName": "mr-IN-AarohiNeural", + "Gender": "Female", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Aarohi Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mr-IN, ManoharNeural)", + "ShortName": "mr-IN-ManoharNeural", + "Gender": "Male", + "Locale": "mr-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Manohar Online (Natural) - Marathi (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, BataaNeural)", + "ShortName": "mn-MN-BataaNeural", + "Gender": "Male", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Bataa Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (mn-MN, YesuiNeural)", + "ShortName": "mn-MN-YesuiNeural", + "Gender": "Female", + "Locale": "mn-MN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yesui Online (Natural) - Mongolian (Mongolia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, HemkalaNeural)", + "ShortName": "ne-NP-HemkalaNeural", + "Gender": "Female", + "Locale": "ne-NP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Hemkala Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ne-NP, SagarNeural)", + "ShortName": "ne-NP-SagarNeural", + "Gender": "Male", + "Locale": "ne-NP", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sagar Online (Natural) - Nepali (Nepal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, FinnNeural)", + "ShortName": "nb-NO-FinnNeural", + "Gender": "Male", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Finn Online (Natural) - Norwegian (Bokmål Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (nb-NO, PernilleNeural)", + "ShortName": "nb-NO-PernilleNeural", + "Gender": "Female", + "Locale": "nb-NO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pernille Online (Natural) - Norwegian (Bokmål, Norway)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, GulNawazNeural)", + "ShortName": "ps-AF-GulNawazNeural", + "Gender": "Male", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft GulNawaz Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ps-AF, LatifaNeural)", + "ShortName": "ps-AF-LatifaNeural", + "Gender": "Female", + "Locale": "ps-AF", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Latifa Online (Natural) - Pashto (Afghanistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, DilaraNeural)", + "ShortName": "fa-IR-DilaraNeural", + "Gender": "Female", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dilara Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (fa-IR, FaridNeural)", + "ShortName": "fa-IR-FaridNeural", + "Gender": "Male", + "Locale": "fa-IR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Farid Online (Natural) - Persian (Iran)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, MarekNeural)", + "ShortName": "pl-PL-MarekNeural", + "Gender": "Male", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marek Online (Natural) - Polish 
(Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pl-PL, ZofiaNeural)", + "ShortName": "pl-PL-ZofiaNeural", + "Gender": "Female", + "Locale": "pl-PL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zofia Online (Natural) - Polish (Poland)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, ThalitaNeural)", + "ShortName": "pt-BR-ThalitaNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thalita Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, AntonioNeural)", + "ShortName": "pt-BR-AntonioNeural", + "Gender": "Male", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Antonio Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-BR, FranciscaNeural)", + "ShortName": "pt-BR-FranciscaNeural", + "Gender": "Female", + "Locale": "pt-BR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Francisca Online (Natural) - Portuguese (Brazil)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, DuarteNeural)", + "ShortName": "pt-PT-DuarteNeural", + "Gender": "Male", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Duarte Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (pt-PT, RaquelNeural)", + "ShortName": "pt-PT-RaquelNeural", + "Gender": "Female", + "Locale": "pt-PT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Raquel Online (Natural) - Portuguese (Portugal)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, AlinaNeural)", + "ShortName": "ro-RO-AlinaNeural", + "Gender": "Female", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alina Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ro-RO, EmilNeural)", + "ShortName": "ro-RO-EmilNeural", + "Gender": "Male", + "Locale": "ro-RO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emil Online (Natural) - Romanian (Romania)", + "Status": "GA", + "VoiceTag": { + 
"ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, DmitryNeural)", + "ShortName": "ru-RU-DmitryNeural", + "Gender": "Male", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dmitry Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ru-RU, SvetlanaNeural)", + "ShortName": "ru-RU-SvetlanaNeural", + "Gender": "Female", + "Locale": "ru-RU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Svetlana Online (Natural) - Russian (Russia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, NicholasNeural)", + "ShortName": "sr-RS-NicholasNeural", + "Gender": "Male", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nicholas Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sr-RS, SophieNeural)", + "ShortName": "sr-RS-SophieNeural", + "Gender": "Female", + "Locale": "sr-RS", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sophie Online (Natural) - Serbian (Serbia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, SameeraNeural)", + "ShortName": "si-LK-SameeraNeural", + "Gender": "Male", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sameera Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (si-LK, ThiliniNeural)", + "ShortName": "si-LK-ThiliniNeural", + "Gender": "Female", + "Locale": "si-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thilini Online (Natural) - Sinhala (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, LukasNeural)", + "ShortName": "sk-SK-LukasNeural", + "Gender": "Male", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lukas Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sk-SK, ViktoriaNeural)", + "ShortName": "sk-SK-ViktoriaNeural", + "Gender": "Female", + "Locale": "sk-SK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Viktoria Online (Natural) - Slovak (Slovakia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, PetraNeural)", + "ShortName": "sl-SI-PetraNeural", + "Gender": "Female", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Petra Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sl-SI, RokNeural)", + "ShortName": "sl-SI-RokNeural", + "Gender": "Male", + "Locale": "sl-SI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rok Online (Natural) - Slovenian (Slovenia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, MuuseNeural)", + "ShortName": "so-SO-MuuseNeural", + "Gender": "Male", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Muuse Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (so-SO, UbaxNeural)", + "ShortName": "so-SO-UbaxNeural", + "Gender": "Female", + "Locale": "so-SO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ubax Online (Natural) - Somali (Somalia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, ElenaNeural)", + "ShortName": "es-AR-ElenaNeural", + "Gender": "Female", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elena Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-AR, TomasNeural)", + "ShortName": "es-AR-TomasNeural", + "Gender": "Male", + "Locale": "es-AR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tomas Online (Natural) - Spanish (Argentina)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, MarceloNeural)", + "ShortName": "es-BO-MarceloNeural", + "Gender": "Male", + "Locale": "es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marcelo Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-BO, SofiaNeural)", + "ShortName": "es-BO-SofiaNeural", + "Gender": "Female", + "Locale": "es-BO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofia Online (Natural) - Spanish (Bolivia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft 
Server Speech Text to Speech Voice (es-CL, CatalinaNeural)", + "ShortName": "es-CL-CatalinaNeural", + "Gender": "Female", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Catalina Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CL, LorenzoNeural)", + "ShortName": "es-CL-LorenzoNeural", + "Gender": "Male", + "Locale": "es-CL", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorenzo Online (Natural) - Spanish (Chile)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, XimenaNeural)", + "ShortName": "es-ES-XimenaNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ximena Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, GonzaloNeural)", + "ShortName": "es-CO-GonzaloNeural", + "Gender": "Male", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gonzalo Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CO, SalomeNeural)", + "ShortName": "es-CO-SalomeNeural", + "Gender": "Female", + "Locale": "es-CO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salome Online (Natural) - Spanish (Colombia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, JuanNeural)", + "ShortName": "es-CR-JuanNeural", + "Gender": "Male", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Juan Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CR, MariaNeural)", + "ShortName": "es-CR-MariaNeural", + "Gender": "Female", + "Locale": "es-CR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Maria Online (Natural) - Spanish (Costa Rica)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, BelkysNeural)", + "ShortName": "es-CU-BelkysNeural", + "Gender": "Female", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Belkys Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-CU, ManuelNeural)", + 
"ShortName": "es-CU-ManuelNeural", + "Gender": "Male", + "Locale": "es-CU", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Manuel Online (Natural) - Spanish (Cuba)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, EmilioNeural)", + "ShortName": "es-DO-EmilioNeural", + "Gender": "Male", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emilio Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-DO, RamonaNeural)", + "ShortName": "es-DO-RamonaNeural", + "Gender": "Female", + "Locale": "es-DO", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ramona Online (Natural) - Spanish (Dominican Republic)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, AndreaNeural)", + "ShortName": "es-EC-AndreaNeural", + "Gender": "Female", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andrea Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-EC, LuisNeural)", + "ShortName": "es-EC-LuisNeural", + "Gender": "Male", + "Locale": "es-EC", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Luis Online (Natural) - Spanish (Ecuador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, LorenaNeural)", + "ShortName": "es-SV-LorenaNeural", + "Gender": "Female", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Lorena Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-SV, RodrigoNeural)", + "ShortName": "es-SV-RodrigoNeural", + "Gender": "Male", + "Locale": "es-SV", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rodrigo Online (Natural) - Spanish (El Salvador)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, JavierNeural)", + "ShortName": "es-GQ-JavierNeural", + "Gender": "Male", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Javier Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GQ, TeresaNeural)", + "ShortName": "es-GQ-TeresaNeural", + 
"Gender": "Female", + "Locale": "es-GQ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Teresa Online (Natural) - Spanish (Equatorial Guinea)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, AndresNeural)", + "ShortName": "es-GT-AndresNeural", + "Gender": "Male", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Andres Online (Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-GT, MartaNeural)", + "ShortName": "es-GT-MartaNeural", + "Gender": "Female", + "Locale": "es-GT", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Marta Online (Natural) - Spanish (Guatemala)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, CarlosNeural)", + "ShortName": "es-HN-CarlosNeural", + "Gender": "Male", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Carlos Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-HN, KarlaNeural)", + "ShortName": "es-HN-KarlaNeural", + "Gender": "Female", + "Locale": "es-HN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karla Online (Natural) - Spanish (Honduras)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, DaliaNeural)", + "ShortName": "es-MX-DaliaNeural", + "Gender": "Female", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Dalia Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-MX, JorgeNeural)", + "ShortName": "es-MX-JorgeNeural", + "Gender": "Male", + "Locale": "es-MX", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jorge Online (Natural) - Spanish (Mexico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, FedericoNeural)", + "ShortName": "es-NI-FedericoNeural", + "Gender": "Male", + "Locale": "es-NI", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Federico Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-NI, YolandaNeural)", + "ShortName": "es-NI-YolandaNeural", + "Gender": "Female", + "Locale": "es-NI", + "SuggestedCodec": 
"audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Yolanda Online (Natural) - Spanish (Nicaragua)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, MargaritaNeural)", + "ShortName": "es-PA-MargaritaNeural", + "Gender": "Female", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Margarita Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PA, RobertoNeural)", + "ShortName": "es-PA-RobertoNeural", + "Gender": "Male", + "Locale": "es-PA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Roberto Online (Natural) - Spanish (Panama)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, MarioNeural)", + "ShortName": "es-PY-MarioNeural", + "Gender": "Male", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mario Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PY, TaniaNeural)", + "ShortName": "es-PY-TaniaNeural", + "Gender": "Female", + "Locale": "es-PY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tania Online (Natural) - Spanish (Paraguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, AlexNeural)", + "ShortName": "es-PE-AlexNeural", + "Gender": "Male", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alex Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PE, CamilaNeural)", + "ShortName": "es-PE-CamilaNeural", + "Gender": "Female", + "Locale": "es-PE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Camila Online (Natural) - Spanish (Peru)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, KarinaNeural)", + "ShortName": "es-PR-KarinaNeural", + "Gender": "Female", + "Locale": "es-PR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Karina Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-PR, VictorNeural)", + "ShortName": "es-PR-VictorNeural", + "Gender": "Male", + "Locale": "es-PR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Victor 
Online (Natural) - Spanish (Puerto Rico)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, AlvaroNeural)", + "ShortName": "es-ES-AlvaroNeural", + "Gender": "Male", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alvaro Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-ES, ElviraNeural)", + "ShortName": "es-ES-ElviraNeural", + "Gender": "Female", + "Locale": "es-ES", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Elvira Online (Natural) - Spanish (Spain)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, AlonsoNeural)", + "ShortName": "es-US-AlonsoNeural", + "Gender": "Male", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Alonso Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-US, PalomaNeural)", + "ShortName": "es-US-PalomaNeural", + "Gender": "Female", + "Locale": "es-US", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paloma Online (Natural) - Spanish (United States)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, MateoNeural)", + "ShortName": "es-UY-MateoNeural", + "Gender": "Male", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mateo Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-UY, ValentinaNeural)", + "ShortName": "es-UY-ValentinaNeural", + "Gender": "Female", + "Locale": "es-UY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valentina Online (Natural) - Spanish (Uruguay)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, PaolaNeural)", + "ShortName": "es-VE-PaolaNeural", + "Gender": "Female", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Paola Online (Natural) - Spanish (Venezuela)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (es-VE, SebastianNeural)", + "ShortName": "es-VE-SebastianNeural", + "Gender": "Male", + "Locale": "es-VE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sebastian Online (Natural) - Spanish (Venezuela)", + 
"Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, JajangNeural)", + "ShortName": "su-ID-JajangNeural", + "Gender": "Male", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Jajang Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (su-ID, TutiNeural)", + "ShortName": "su-ID-TutiNeural", + "Gender": "Female", + "Locale": "su-ID", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Tuti Online (Natural) - Sundanese (Indonesia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, RafikiNeural)", + "ShortName": "sw-KE-RafikiNeural", + "Gender": "Male", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rafiki Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-KE, ZuriNeural)", + "ShortName": "sw-KE-ZuriNeural", + "Gender": "Female", + "Locale": "sw-KE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Zuri Online (Natural) - Swahili (Kenya)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, DaudiNeural)", + "ShortName": "sw-TZ-DaudiNeural", + "Gender": "Male", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Daudi Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sw-TZ, RehemaNeural)", + "ShortName": "sw-TZ-RehemaNeural", + "Gender": "Female", + "Locale": "sw-TZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Rehema Online (Natural) - Swahili (Tanzania)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, MattiasNeural)", + "ShortName": "sv-SE-MattiasNeural", + "Gender": "Male", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mattias Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (sv-SE, SofieNeural)", + "ShortName": "sv-SE-SofieNeural", + "Gender": "Female", + "Locale": "sv-SE", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sofie Online (Natural) - Swedish (Sweden)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + 
"VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, PallaviNeural)", + "ShortName": "ta-IN-PallaviNeural", + "Gender": "Female", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Pallavi Online (Natural) - Tamil (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-IN, ValluvarNeural)", + "ShortName": "ta-IN-ValluvarNeural", + "Gender": "Male", + "Locale": "ta-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Valluvar Online (Natural) - Tamil (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, KaniNeural)", + "ShortName": "ta-MY-KaniNeural", + "Gender": "Female", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kani Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-MY, SuryaNeural)", + "ShortName": "ta-MY-SuryaNeural", + "Gender": "Male", + "Locale": "ta-MY", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Surya Online (Natural) - Tamil (Malaysia)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, AnbuNeural)", + "ShortName": "ta-SG-AnbuNeural", + "Gender": "Male", + "Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Anbu Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-SG, VenbaNeural)", + "ShortName": "ta-SG-VenbaNeural", + "Gender": "Female", + "Locale": "ta-SG", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Venba Online (Natural) - Tamil (Singapore)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, KumarNeural)", + "ShortName": "ta-LK-KumarNeural", + "Gender": "Male", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Kumar Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ta-LK, SaranyaNeural)", + "ShortName": "ta-LK-SaranyaNeural", + "Gender": "Female", + "Locale": "ta-LK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Saranya Online (Natural) - Tamil (Sri Lanka)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft 
Server Speech Text to Speech Voice (te-IN, MohanNeural)", + "ShortName": "te-IN-MohanNeural", + "Gender": "Male", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Mohan Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (te-IN, ShrutiNeural)", + "ShortName": "te-IN-ShrutiNeural", + "Gender": "Female", + "Locale": "te-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Shruti Online (Natural) - Telugu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, NiwatNeural)", + "ShortName": "th-TH-NiwatNeural", + "Gender": "Male", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Niwat Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (th-TH, PremwadeeNeural)", + "ShortName": "th-TH-PremwadeeNeural", + "Gender": "Female", + "Locale": "th-TH", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Premwadee Online (Natural) - Thai (Thailand)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, AhmetNeural)", + "ShortName": "tr-TR-AhmetNeural", + "Gender": "Male", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ahmet Online (Natural) - Turkish (Turkey)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (tr-TR, EmelNeural)", + "ShortName": "tr-TR-EmelNeural", + "Gender": "Female", + "Locale": "tr-TR", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Emel Online (Natural) - Turkish (Turkey)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, OstapNeural)", + "ShortName": "uk-UA-OstapNeural", + "Gender": "Male", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Ostap Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uk-UA, PolinaNeural)", + "ShortName": "uk-UA-PolinaNeural", + "Gender": "Female", + "Locale": "uk-UA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Polina Online (Natural) - Ukrainian (Ukraine)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, GulNeural)", + "ShortName": "ur-IN-GulNeural", + 
"Gender": "Female", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Gul Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-IN, SalmanNeural)", + "ShortName": "ur-IN-SalmanNeural", + "Gender": "Male", + "Locale": "ur-IN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Salman Online (Natural) - Urdu (India)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, AsadNeural)", + "ShortName": "ur-PK-AsadNeural", + "Gender": "Male", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Asad Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (ur-PK, UzmaNeural)", + "ShortName": "ur-PK-UzmaNeural", + "Gender": "Female", + "Locale": "ur-PK", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Uzma Online (Natural) - Urdu (Pakistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, MadinaNeural)", + "ShortName": "uz-UZ-MadinaNeural", + "Gender": "Female", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Madina Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (uz-UZ, SardorNeural)", + "ShortName": "uz-UZ-SardorNeural", + "Gender": "Male", + "Locale": "uz-UZ", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Sardor Online (Natural) - Uzbek (Uzbekistan)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, HoaiMyNeural)", + "ShortName": "vi-VN-HoaiMyNeural", + "Gender": "Female", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft HoaiMy Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (vi-VN, NamMinhNeural)", + "ShortName": "vi-VN-NamMinhNeural", + "Gender": "Male", + "Locale": "vi-VN", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft NamMinh Online (Natural) - Vietnamese (Vietnam)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, AledNeural)", + "ShortName": "cy-GB-AledNeural", + "Gender": "Male", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + 
"FriendlyName": "Microsoft Aled Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (cy-GB, NiaNeural)", + "ShortName": "cy-GB-NiaNeural", + "Gender": "Female", + "Locale": "cy-GB", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Nia Online (Natural) - Welsh (United Kingdom)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThandoNeural)", + "ShortName": "zu-ZA-ThandoNeural", + "Gender": "Female", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Thando Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + }, + { + "Name": "Microsoft Server Speech Text to Speech Voice (zu-ZA, ThembaNeural)", + "ShortName": "zu-ZA-ThembaNeural", + "Gender": "Male", + "Locale": "zu-ZA", + "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3", + "FriendlyName": "Microsoft Themba Online (Natural) - Zulu (South Africa)", + "Status": "GA", + "VoiceTag": { + "ContentCategories": [ + "General" + ], + "VoicePersonalities": [ + "Friendly", + "Positive" + ] + } + } +] \ No newline at end of file diff --git a/programs/applio_code/rvc/lib/utils.py b/programs/applio_code/rvc/lib/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c5683f2d3fd8d952e111107165a1cd71867260 --- /dev/null +++ b/programs/applio_code/rvc/lib/utils.py @@ -0,0 +1,116 @@ +import os, sys +import librosa +import soundfile as sf +import re +import unicodedata +import wget +from torch import nn + +import logging +from transformers import HubertModel +import warnings + +# Remove this to see warnings about transformers models +warnings.filterwarnings("ignore") + +logging.getLogger("fairseq").setLevel(logging.ERROR) +logging.getLogger("faiss.loader").setLevel(logging.ERROR) +logging.getLogger("transformers").setLevel(logging.ERROR) +logging.getLogger("torch").setLevel(logging.ERROR) + +now_dir = os.getcwd() +sys.path.append(now_dir) + +base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift") +stft = base_path + ".exe" if sys.platform == "win32" else base_path + + +class HubertModelWithFinalProj(HubertModel): + def __init__(self, config): + super().__init__(config) + self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size) + + +def load_audio(file, sample_rate): + try: + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + audio, sr = sf.read(file) + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.T) + if sr != sample_rate: + audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate) + except Exception as error: + raise RuntimeError(f"An error occurred loading the audio: {error}") + + return audio.flatten() + + +def load_audio_infer(file, sample_rate): + file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + if not os.path.isfile(file): + raise FileNotFoundError(f"File not found: {file}") + audio, sr = sf.read(file) + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.T) + if sr != sample_rate: + audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate) + return 
audio.flatten() + + +def format_title(title): + formatted_title = ( + unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8") + ) + formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title) + formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title) + formatted_title = re.sub(r"\s+", "_", formatted_title) + return formatted_title + + +def load_embedding(embedder_model, custom_embedder=None): + embedder_root = os.path.join( + now_dir, "programs", "applio_code", "rvc", "models", "embedders" + ) + embedding_list = { + "contentvec": os.path.join(embedder_root, "contentvec"), + "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"), + "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"), + "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"), + } + + online_embedders = { + "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin", + "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin", + "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin", + "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin", + } + + config_files = { + "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json", + "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json", + "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json", + "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json", + } + + if embedder_model == "custom": + if os.path.exists(custom_embedder): + model_path = custom_embedder + else: + print(f"Custom embedder not found: {custom_embedder}, using contentvec") + model_path = embedding_list["contentvec"] + else: + model_path = embedding_list[embedder_model] + bin_file = os.path.join(model_path, "pytorch_model.bin") + json_file = os.path.join(model_path, "config.json") + os.makedirs(model_path, exist_ok=True) + if not os.path.exists(bin_file): + url = online_embedders[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=bin_file) + if not os.path.exists(json_file): + url = config_files[embedder_model] + print(f"Downloading {url} to {model_path}...") + wget.download(url, out=json_file) + + models = HubertModelWithFinalProj.from_pretrained(model_path) + return models diff --git a/programs/applio_code/rvc/models/embedders/contentvec/config.json b/programs/applio_code/rvc/models/embedders/contentvec/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5186a71b15933aca2d9942db95e1aff02642d1f0 --- /dev/null +++ b/programs/applio_code/rvc/models/embedders/contentvec/config.json @@ -0,0 +1,71 @@ +{ + "activation_dropout": 0.1, + "apply_spec_augment": true, + "architectures": [ + "HubertModelWithFinalProj" + ], + "attention_dropout": 0.1, + "bos_token_id": 1, + "classifier_proj_size": 256, + "conv_bias": false, + "conv_dim": [ + 512, + 512, + 512, + 512, + 512, + 512, + 512 + ], + "conv_kernel": [ + 10, + 3, + 3, + 3, + 3, + 2, + 2 + ], + "conv_stride": [ + 5, + 2, + 2, + 2, + 2, + 2, + 2 + ], 
+ "ctc_loss_reduction": "sum", + "ctc_zero_infinity": false, + "do_stable_layer_norm": false, + "eos_token_id": 2, + "feat_extract_activation": "gelu", + "feat_extract_norm": "group", + "feat_proj_dropout": 0.0, + "feat_proj_layer_norm": true, + "final_dropout": 0.1, + "hidden_act": "gelu", + "hidden_dropout": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "layerdrop": 0.1, + "mask_feature_length": 10, + "mask_feature_min_masks": 0, + "mask_feature_prob": 0.0, + "mask_time_length": 10, + "mask_time_min_masks": 2, + "mask_time_prob": 0.05, + "model_type": "hubert", + "num_attention_heads": 12, + "num_conv_pos_embedding_groups": 16, + "num_conv_pos_embeddings": 128, + "num_feat_extract_layers": 7, + "num_hidden_layers": 12, + "pad_token_id": 0, + "torch_dtype": "float32", + "transformers_version": "4.27.3", + "use_weighted_layer_sum": false, + "vocab_size": 32 +} diff --git a/programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin b/programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f03cfb9aa43106344342ef928c119d5ad5f9aa5c --- /dev/null +++ b/programs/applio_code/rvc/models/embedders/contentvec/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e +size 378342945 diff --git a/programs/applio_code/rvc/models/predictors/fcpe.pt b/programs/applio_code/rvc/models/predictors/fcpe.pt new file mode 100644 index 0000000000000000000000000000000000000000..a35dba1e7ece5897aeceff4dea92afaa60ca8244 --- /dev/null +++ b/programs/applio_code/rvc/models/predictors/fcpe.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3a8dd2dbd51baf19ed295006f2ac25dba6dd60adc7ec578ae5fbd94970951da +size 69005189 diff --git a/programs/applio_code/rvc/models/predictors/rmvpe.pt b/programs/applio_code/rvc/models/predictors/rmvpe.pt new file mode 100644 index 0000000000000000000000000000000000000000..6362f060846875c3b5d7012adea5f97e47305e7e --- /dev/null +++ b/programs/applio_code/rvc/models/predictors/rmvpe.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d62215f4306e3ca278246188607209f09af3dc77ed4232efdd069798c4ec193 +size 181184272 diff --git a/programs/music_separation_code/ensemble.py b/programs/music_separation_code/ensemble.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad1ff9fc25aaa3b8b2f3f05b41a912cc0949495 --- /dev/null +++ b/programs/music_separation_code/ensemble.py @@ -0,0 +1,183 @@ +# coding: utf-8 +__author__ = "Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/" + +import os +import librosa +import soundfile as sf +import numpy as np +import argparse + + +def stft(wave, nfft, hl): + wave_left = np.asfortranarray(wave[0]) + wave_right = np.asfortranarray(wave[1]) + spec_left = librosa.stft(wave_left, n_fft=nfft, hop_length=hl) + spec_right = librosa.stft(wave_right, n_fft=nfft, hop_length=hl) + spec = np.asfortranarray([spec_left, spec_right]) + return spec + + +def istft(spec, hl, length): + spec_left = np.asfortranarray(spec[0]) + spec_right = np.asfortranarray(spec[1]) + wave_left = librosa.istft(spec_left, hop_length=hl, length=length) + wave_right = librosa.istft(spec_right, hop_length=hl, length=length) + wave = np.asfortranarray([wave_left, wave_right]) + return wave + + +def absmax(a, *, axis): + dims = list(a.shape) + dims.pop(axis) + indices = 
np.ogrid[tuple(slice(0, d) for d in dims)] + argmax = np.abs(a).argmax(axis=axis) + indices.insert((len(a.shape) + axis) % len(a.shape), argmax) + return a[tuple(indices)] + + +def absmin(a, *, axis): + dims = list(a.shape) + dims.pop(axis) + indices = np.ogrid[tuple(slice(0, d) for d in dims)] + argmax = np.abs(a).argmin(axis=axis) + indices.insert((len(a.shape) + axis) % len(a.shape), argmax) + return a[tuple(indices)] + + +def lambda_max(arr, axis=None, key=None, keepdims=False): + idxs = np.argmax(key(arr), axis) + if axis is not None: + idxs = np.expand_dims(idxs, axis) + result = np.take_along_axis(arr, idxs, axis) + if not keepdims: + result = np.squeeze(result, axis=axis) + return result + else: + return arr.flatten()[idxs] + + +def lambda_min(arr, axis=None, key=None, keepdims=False): + idxs = np.argmin(key(arr), axis) + if axis is not None: + idxs = np.expand_dims(idxs, axis) + result = np.take_along_axis(arr, idxs, axis) + if not keepdims: + result = np.squeeze(result, axis=axis) + return result + else: + return arr.flatten()[idxs] + + +def average_waveforms(pred_track, weights, algorithm): + """ + :param pred_track: shape = (num, channels, length) + :param weights: shape = (num, ) + :param algorithm: One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft + :return: averaged waveform in shape (channels, length) + """ + + pred_track = np.array(pred_track) + final_length = pred_track.shape[-1] + + mod_track = [] + for i in range(pred_track.shape[0]): + if algorithm == "avg_wave": + mod_track.append(pred_track[i] * weights[i]) + elif algorithm in ["median_wave", "min_wave", "max_wave"]: + mod_track.append(pred_track[i]) + elif algorithm in ["avg_fft", "min_fft", "max_fft", "median_fft"]: + spec = stft(pred_track[i], nfft=2048, hl=1024) + if algorithm in ["avg_fft"]: + mod_track.append(spec * weights[i]) + else: + mod_track.append(spec) + pred_track = np.array(mod_track) + + if algorithm in ["avg_wave"]: + pred_track = pred_track.sum(axis=0) + pred_track /= np.array(weights).sum().T + elif algorithm in ["median_wave"]: + pred_track = np.median(pred_track, axis=0) + elif algorithm in ["min_wave"]: + pred_track = np.array(pred_track) + pred_track = lambda_min(pred_track, axis=0, key=np.abs) + elif algorithm in ["max_wave"]: + pred_track = np.array(pred_track) + pred_track = lambda_max(pred_track, axis=0, key=np.abs) + elif algorithm in ["avg_fft"]: + pred_track = pred_track.sum(axis=0) + pred_track /= np.array(weights).sum() + pred_track = istft(pred_track, 1024, final_length) + elif algorithm in ["min_fft"]: + pred_track = np.array(pred_track) + pred_track = lambda_min(pred_track, axis=0, key=np.abs) + pred_track = istft(pred_track, 1024, final_length) + elif algorithm in ["max_fft"]: + pred_track = np.array(pred_track) + pred_track = absmax(pred_track, axis=0) + pred_track = istft(pred_track, 1024, final_length) + elif algorithm in ["median_fft"]: + pred_track = np.array(pred_track) + pred_track = np.median(pred_track, axis=0) + pred_track = istft(pred_track, 1024, final_length) + return pred_track + + +def ensemble_files(args): + parser = argparse.ArgumentParser() + parser.add_argument( + "--files", + type=str, + required=True, + nargs="+", + help="Path to all audio-files to ensemble", + ) + parser.add_argument( + "--type", + type=str, + default="avg_wave", + help="One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft", + ) + parser.add_argument( + "--weights", + type=float, + nargs="+", + help="Weights to create 
ensemble. Number of weights must be equal to number of files", + ) + parser.add_argument( + "--output", + default="res.wav", + type=str, + help="Path to wav file where ensemble result will be stored", + ) + if args is None: + args = parser.parse_args() + else: + args = parser.parse_args(args) + + print("Ensemble type: {}".format(args.type)) + print("Number of input files: {}".format(len(args.files))) + if args.weights is not None: + weights = args.weights + else: + weights = np.ones(len(args.files)) + print("Weights: {}".format(weights)) + print("Output file: {}".format(args.output)) + data = [] + for f in args.files: + if not os.path.isfile(f): + print("Error. Can't find file: {}. Check paths.".format(f)) + exit() + print("Reading file: {}".format(f)) + wav, sr = librosa.load(f, sr=None, mono=False) + # wav, sr = sf.read(f) + print("Waveform shape: {} sample rate: {}".format(wav.shape, sr)) + data.append(wav) + data = np.array(data) + res = average_waveforms(data, weights, args.type) + print("Result shape: {}".format(res.shape)) + sf.write(args.output, res.T, sr, "FLOAT") + + +if __name__ == "__main__": + ensemble_files(None) diff --git a/programs/music_separation_code/inference.py b/programs/music_separation_code/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..8c991d457cb6b45ed8df586976db05a6eb887aa9 --- /dev/null +++ b/programs/music_separation_code/inference.py @@ -0,0 +1,246 @@ +# coding: utf-8 +__author__ = "Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/" + +import argparse +import time +import librosa +from tqdm import tqdm +import sys +import os +import glob +import torch +import numpy as np +import soundfile as sf +import torch.nn as nn + +# Using the embedded version of Python can also correctly import the utils module. 
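+# A minimal invocation sketch (illustrative only: the config and checkpoint paths below
+# are assumptions, not files shipped in this change, and the import assumes this
+# directory is on sys.path):
+#
+#   from inference import proc_file
+#   proc_file([
+#       "--model_type", "mel_band_roformer",
+#       "--config_path", "configs/model.yaml",        # hypothetical path
+#       "--start_check_point", "weights/model.ckpt",  # hypothetical path
+#       "--input_file", "song.wav",
+#       "--store_dir", "separated",
+#       "--extract_instrumental",
+#   ])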
+current_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(current_dir) +from utils import demix, get_model_from_config + +import warnings + +warnings.filterwarnings("ignore") + + +class Args: + def __init__( + self, + input_file, + store_dir, + model_type, + extract_instrumental, + disable_detailed_pbar, + flac_file, + pcm_type, + use_tta, + ): + self.input_file = input_file + self.model_type = model_type + self.store_dir = store_dir + self.extract_instrumental = extract_instrumental + self.disable_detailed_pbar = disable_detailed_pbar + self.flac_file = flac_file + self.pcm_type = pcm_type + self.use_tta = use_tta + + +def run_file(model, args, config, device, verbose=False): + start_time = time.time() + model.eval() + + if not os.path.isfile(args.input_file): + print("File not found: {}".format(args.input_file)) + return + + instruments = config.training.instruments.copy() + if config.training.target_instrument is not None: + instruments = [config.training.target_instrument] + + if not os.path.isdir(args.store_dir): + os.mkdir(args.store_dir) + + print("Starting processing track: ", args.input_file) + try: + mix, sr = librosa.load(args.input_file, sr=44100, mono=False) + except Exception as e: + print("Cannot read track: {}".format(args.input_file)) + print("Error message: {}".format(str(e))) + return + + # Convert mono to stereo if needed + if len(mix.shape) == 1: + mix = np.stack([mix, mix], axis=0) + + mix_orig = mix.copy() + if "normalize" in config.inference: + if config.inference["normalize"] is True: + mono = mix.mean(0) + mean = mono.mean() + std = mono.std() + mix = (mix - mean) / std + + if args.use_tta: + # orig, channel inverse, polarity inverse + track_proc_list = [mix.copy(), mix[::-1].copy(), -1.0 * mix.copy()] + else: + track_proc_list = [mix.copy()] + + full_result = [] + for mix in track_proc_list: + waveforms = demix( + config, model, mix, device, pbar=verbose, model_type=args.model_type + ) + full_result.append(waveforms) + + # Average all values in single dict + waveforms = full_result[0] + for i in range(1, len(full_result)): + d = full_result[i] + for el in d: + if i == 2: + waveforms[el] += -1.0 * d[el] + elif i == 1: + waveforms[el] += d[el][::-1].copy() + else: + waveforms[el] += d[el] + for el in waveforms: + waveforms[el] = waveforms[el] / len(full_result) + + # Create a new `instr` in instruments list, 'instrumental' + if args.extract_instrumental: + instr = "vocals" if "vocals" in instruments else instruments[0] + instruments.append("instrumental") + # Output "instrumental", which is an inverse of 'vocals' or the first stem in list if 'vocals' absent + waveforms["instrumental"] = mix_orig - waveforms[instr] + + for instr in instruments: + estimates = waveforms[instr].T + if "normalize" in config.inference: + if config.inference["normalize"] is True: + estimates = estimates * std + mean + file_name, _ = os.path.splitext(os.path.basename(args.input_file)) + if args.flac_file: + output_file = os.path.join(args.store_dir, f"{file_name}_{instr}.flac") + subtype = "PCM_16" if args.pcm_type == "PCM_16" else "PCM_24" + sf.write(output_file, estimates, sr, subtype=subtype) + else: + output_file = os.path.join(args.store_dir, f"{file_name}_{instr}.wav") + sf.write(output_file, estimates, sr, subtype="FLOAT") + + time.sleep(1) + print("Elapsed time: {:.2f} sec".format(time.time() - start_time)) + + +def proc_file(args): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_type", + type=str, + default="mdx23c", + help="One of bandit, 
bandit_v2, bs_roformer, htdemucs, mdx23c, mel_band_roformer, scnet, scnet_unofficial, segm_models, swin_upernet, torchseg", + ) + parser.add_argument("--config_path", type=str, help="path to config file") + parser.add_argument( + "--start_check_point", + type=str, + default="", + help="Initial checkpoint to valid weights", + ) + parser.add_argument( + "--input_file", type=str, help="folder with mixtures to process" + ) + parser.add_argument( + "--store_dir", default="", type=str, help="path to store results as wav file" + ) + parser.add_argument( + "--device_ids", nargs="+", type=int, default=0, help="list of gpu ids" + ) + parser.add_argument( + "--extract_instrumental", + action="store_true", + help="invert vocals to get instrumental if provided", + ) + parser.add_argument( + "--disable_detailed_pbar", + action="store_true", + help="disable detailed progress bar", + ) + parser.add_argument( + "--force_cpu", + action="store_true", + help="Force the use of CPU even if CUDA is available", + ) + parser.add_argument( + "--flac_file", action="store_true", help="Output flac file instead of wav" + ) + parser.add_argument( + "--pcm_type", + type=str, + choices=["PCM_16", "PCM_24"], + default="PCM_24", + help="PCM type for FLAC files (PCM_16 or PCM_24)", + ) + parser.add_argument( + "--use_tta", + action="store_true", + help="Flag adds test time augmentation during inference (polarity and channel inverse). While this triples the runtime, it reduces noise and slightly improves prediction quality.", + ) + if args is None: + args = parser.parse_args() + else: + args = parser.parse_args(args) + + device = "cpu" + if args.force_cpu: + device = "cpu" + elif torch.cuda.is_available(): + print("CUDA is available, use --force_cpu to disable it.") + device = "cuda" + device = ( + f"cuda:{args.device_ids[0]}" + if type(args.device_ids) == list + else f"cuda:{args.device_ids}" + ) + elif torch.backends.mps.is_available(): + device = "mps" + + print("Using device: ", device) + + model_load_start_time = time.time() + torch.backends.cudnn.benchmark = True + + model, config = get_model_from_config(args.model_type, args.config_path) + if args.start_check_point != "": + print("Start from checkpoint: {}".format(args.start_check_point)) + if args.model_type == "htdemucs": + state_dict = torch.load( + args.start_check_point, map_location=device, weights_only=False + ) + # Fix for htdemucs pretrained models + if "state" in state_dict: + state_dict = state_dict["state"] + else: + state_dict = torch.load( + args.start_check_point, map_location=device, weights_only=True + ) + model.load_state_dict(state_dict) + print("Instruments: {}".format(config.training.instruments)) + + # in case multiple CUDA GPUs are used and --device_ids arg is passed + if ( + type(args.device_ids) == list + and len(args.device_ids) > 1 + and not args.force_cpu + ): + model = nn.DataParallel(model, device_ids=args.device_ids) + + model = model.to(device) + + print("Model load time: {:.2f} sec".format(time.time() - model_load_start_time)) + + run_file(model, args, config, device, verbose=True) + + +if __name__ == "__main__": + proc_file(None) diff --git a/programs/music_separation_code/models/bandit/core/__init__.py b/programs/music_separation_code/models/bandit/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..86e1557c945b44d8d74a714d3eff464524469be5 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/__init__.py @@ -0,0 +1,691 @@ +import os.path +from collections import defaultdict +from 
itertools import chain, combinations +from typing import Any, Dict, Iterator, Mapping, Optional, Tuple, Type, TypedDict + +import pytorch_lightning as pl +import torch +import torchaudio as ta +import torchmetrics as tm +from asteroid import losses as asteroid_losses + +# from deepspeed.ops.adam import DeepSpeedCPUAdam +# from geoopt import optim as gooptim +from pytorch_lightning.utilities.types import STEP_OUTPUT +from torch import nn, optim +from torch.optim import lr_scheduler +from torch.optim.lr_scheduler import LRScheduler + +from models.bandit.core import loss, metrics as metrics_, model +from models.bandit.core.data._types import BatchedDataDict +from models.bandit.core.data.augmentation import BaseAugmentor, StemAugmentor +from models.bandit.core.utils import audio as audio_ +from models.bandit.core.utils.audio import BaseFader + +# from pandas.io.json._normalize import nested_to_record + +ConfigDict = TypedDict("ConfigDict", {"name": str, "kwargs": Dict[str, Any]}) + + +class SchedulerConfigDict(ConfigDict): + monitor: str + + +OptimizerSchedulerConfigDict = TypedDict( + "OptimizerSchedulerConfigDict", + {"optimizer": ConfigDict, "scheduler": SchedulerConfigDict}, + total=False, +) + + +class LRSchedulerReturnDict(TypedDict, total=False): + scheduler: LRScheduler + monitor: str + + +class ConfigureOptimizerReturnDict(TypedDict, total=False): + optimizer: torch.optim.Optimizer + lr_scheduler: LRSchedulerReturnDict + + +OutputType = Dict[str, Any] +MetricsType = Dict[str, torch.Tensor] + + +def get_optimizer_class(name: str) -> Type[optim.Optimizer]: + + if name == "DeepSpeedCPUAdam": + return DeepSpeedCPUAdam + + for module in [optim, gooptim]: + if name in module.__dict__: + return module.__dict__[name] + + raise NameError + + +def parse_optimizer_config( + config: OptimizerSchedulerConfigDict, parameters: Iterator[nn.Parameter] +) -> ConfigureOptimizerReturnDict: + optim_class = get_optimizer_class(config["optimizer"]["name"]) + optimizer = optim_class(parameters, **config["optimizer"]["kwargs"]) + + optim_dict: ConfigureOptimizerReturnDict = { + "optimizer": optimizer, + } + + if "scheduler" in config: + + lr_scheduler_class_ = config["scheduler"]["name"] + lr_scheduler_class = lr_scheduler.__dict__[lr_scheduler_class_] + lr_scheduler_dict: LRSchedulerReturnDict = { + "scheduler": lr_scheduler_class(optimizer, **config["scheduler"]["kwargs"]) + } + + if lr_scheduler_class_ == "ReduceLROnPlateau": + lr_scheduler_dict["monitor"] = config["scheduler"]["monitor"] + + optim_dict["lr_scheduler"] = lr_scheduler_dict + + return optim_dict + + +def parse_model_config(config: ConfigDict) -> Any: + name = config["name"] + + for module in [model]: + if name in module.__dict__: + return module.__dict__[name](**config["kwargs"]) + + raise NameError + + +_LEGACY_LOSS_NAMES = ["HybridL1Loss"] + + +def _parse_legacy_loss_config(config: ConfigDict) -> nn.Module: + name = config["name"] + + if name == "HybridL1Loss": + return loss.TimeFreqL1Loss(**config["kwargs"]) + + raise NameError + + +def parse_loss_config(config: ConfigDict) -> nn.Module: + name = config["name"] + + if name in _LEGACY_LOSS_NAMES: + return _parse_legacy_loss_config(config) + + for module in [loss, nn.modules.loss, asteroid_losses]: + if name in module.__dict__: + # print(config["kwargs"]) + return module.__dict__[name](**config["kwargs"]) + + raise NameError + + +def get_metric(config: ConfigDict) -> tm.Metric: + name = config["name"] + + for module in [tm, metrics_]: + if name in module.__dict__: + return 
module.__dict__[name](**config["kwargs"]) + raise NameError + + +def parse_metric_config(config: Dict[str, ConfigDict]) -> tm.MetricCollection: + metrics = {} + + for metric in config: + metrics[metric] = get_metric(config[metric]) + + return tm.MetricCollection(metrics) + + +def parse_fader_config(config: ConfigDict) -> BaseFader: + name = config["name"] + + for module in [audio_]: + if name in module.__dict__: + return module.__dict__[name](**config["kwargs"]) + + raise NameError + + +class LightningSystem(pl.LightningModule): + _VOX_STEMS = ["speech", "vocals"] + _BG_STEMS = ["background", "effects", "mne"] + + def __init__( + self, config: Dict, loss_adjustment: float = 1.0, attach_fader: bool = False + ) -> None: + super().__init__() + self.optimizer_config = config["optimizer"] + self.model = parse_model_config(config["model"]) + self.loss = parse_loss_config(config["loss"]) + self.metrics = nn.ModuleDict( + { + stem: parse_metric_config(config["metrics"]["dev"]) + for stem in self.model.stems + } + ) + + self.metrics.disallow_fsdp = True + + self.test_metrics = nn.ModuleDict( + { + stem: parse_metric_config(config["metrics"]["test"]) + for stem in self.model.stems + } + ) + + self.test_metrics.disallow_fsdp = True + + self.fs = config["model"]["kwargs"]["fs"] + + self.fader_config = config["inference"]["fader"] + if attach_fader: + self.fader = parse_fader_config(config["inference"]["fader"]) + else: + self.fader = None + + self.augmentation: Optional[BaseAugmentor] + if config.get("augmentation", None) is not None: + self.augmentation = StemAugmentor(**config["augmentation"]) + else: + self.augmentation = None + + self.predict_output_path: Optional[str] = None + self.loss_adjustment = loss_adjustment + + self.val_prefix = None + self.test_prefix = None + + def configure_optimizers(self) -> Any: + return parse_optimizer_config( + self.optimizer_config, self.trainer.model.parameters() + ) + + def compute_loss( + self, batch: BatchedDataDict, output: OutputType + ) -> Dict[str, torch.Tensor]: + return {"loss": self.loss(output, batch)} + + def update_metrics( + self, batch: BatchedDataDict, output: OutputType, mode: str + ) -> None: + + if mode == "test": + metrics = self.test_metrics + else: + metrics = self.metrics + + for stem, metric in metrics.items(): + + if stem == "mne:+": + stem = "mne" + + # print(f"matching for {stem}") + if mode == "train": + metric.update( + output["audio"][stem], # .cpu(), + batch["audio"][stem], # .cpu() + ) + else: + if stem not in batch["audio"]: + matched = False + if stem in self._VOX_STEMS: + for bstem in self._VOX_STEMS: + if bstem in batch["audio"]: + batch["audio"][stem] = batch["audio"][bstem] + matched = True + break + elif stem in self._BG_STEMS: + for bstem in self._BG_STEMS: + if bstem in batch["audio"]: + batch["audio"][stem] = batch["audio"][bstem] + matched = True + break + else: + matched = True + + # print(batch["audio"].keys()) + + if matched: + # print(f"matched {stem}!") + if stem == "mne" and "mne" not in output["audio"]: + output["audio"]["mne"] = ( + output["audio"]["music"] + output["audio"]["effects"] + ) + + metric.update( + output["audio"][stem], # .cpu(), + batch["audio"][stem], # .cpu(), + ) + + # print(metric.compute()) + + def compute_metrics(self, mode: str = "dev") -> Dict[str, torch.Tensor]: + + if mode == "test": + metrics = self.test_metrics + else: + metrics = self.metrics + + metric_dict = {} + + for stem, metric in metrics.items(): + md = metric.compute() + metric_dict.update({f"{stem}/{k}": v for k, v in 
md.items()}) + + self.log_dict(metric_dict, prog_bar=True, logger=False) + + return metric_dict + + def reset_metrics(self, test_mode: bool = False) -> None: + + if test_mode: + metrics = self.test_metrics + else: + metrics = self.metrics + + for _, metric in metrics.items(): + metric.reset() + + def forward(self, batch: BatchedDataDict) -> Any: + batch, output = self.model(batch) + + return batch, output + + def common_step(self, batch: BatchedDataDict, mode: str) -> Any: + batch, output = self.forward(batch) + # print(batch) + # print(output) + loss_dict = self.compute_loss(batch, output) + + with torch.no_grad(): + self.update_metrics(batch, output, mode=mode) + + if mode == "train": + self.log("loss", loss_dict["loss"], prog_bar=True) + + return output, loss_dict + + def training_step(self, batch: BatchedDataDict) -> Dict[str, Any]: + + if self.augmentation is not None: + with torch.no_grad(): + batch = self.augmentation(batch) + + _, loss_dict = self.common_step(batch, mode="train") + + with torch.inference_mode(): + self.log_dict_with_prefix( + loss_dict, "train", batch_size=batch["audio"]["mixture"].shape[0] + ) + + loss_dict["loss"] *= self.loss_adjustment + + return loss_dict + + def on_train_batch_end( + self, outputs: STEP_OUTPUT, batch: BatchedDataDict, batch_idx: int + ) -> None: + + metric_dict = self.compute_metrics() + self.log_dict_with_prefix(metric_dict, "train") + self.reset_metrics() + + def validation_step( + self, batch: BatchedDataDict, batch_idx: int, dataloader_idx: int = 0 + ) -> Dict[str, Any]: + + with torch.inference_mode(): + curr_val_prefix = f"val{dataloader_idx}" if dataloader_idx > 0 else "val" + + if curr_val_prefix != self.val_prefix: + # print(f"Switching to validation dataloader {dataloader_idx}") + if self.val_prefix is not None: + self._on_validation_epoch_end() + self.val_prefix = curr_val_prefix + _, loss_dict = self.common_step(batch, mode="val") + + self.log_dict_with_prefix( + loss_dict, + self.val_prefix, + batch_size=batch["audio"]["mixture"].shape[0], + prog_bar=True, + add_dataloader_idx=False, + ) + + return loss_dict + + def on_validation_epoch_end(self) -> None: + self._on_validation_epoch_end() + + def _on_validation_epoch_end(self) -> None: + metric_dict = self.compute_metrics() + self.log_dict_with_prefix( + metric_dict, self.val_prefix, prog_bar=True, add_dataloader_idx=False + ) + # self.logger.save() + # print(self.val_prefix, "Validation metrics:", metric_dict) + self.reset_metrics() + + def old_predtest_step( + self, batch: BatchedDataDict, batch_idx: int, dataloader_idx: int = 0 + ) -> Tuple[BatchedDataDict, OutputType]: + + audio_batch = batch["audio"]["mixture"] + track_batch = batch.get("track", ["" for _ in range(len(audio_batch))]) + + output_list_of_dicts = [ + self.fader(audio[None, ...], lambda a: self.test_forward(a, track)) + for audio, track in zip(audio_batch, track_batch) + ] + + output_dict_of_lists = defaultdict(list) + + for output_dict in output_list_of_dicts: + for stem, audio in output_dict.items(): + output_dict_of_lists[stem].append(audio) + + output = { + "audio": { + stem: torch.concat(output_list, dim=0) + for stem, output_list in output_dict_of_lists.items() + } + } + + return batch, output + + def predtest_step( + self, batch: BatchedDataDict, batch_idx: int = -1, dataloader_idx: int = 0 + ) -> Tuple[BatchedDataDict, OutputType]: + + if getattr(self.model, "bypass_fader", False): + batch, output = self.model(batch) + else: + audio_batch = batch["audio"]["mixture"] + output = self.fader( + audio_batch, 
lambda a: self.test_forward(a, "", batch=batch) + ) + + return batch, output + + def test_forward( + self, audio: torch.Tensor, track: str = "", batch: BatchedDataDict = None + ) -> torch.Tensor: + + if self.fader is None: + self.attach_fader() + + cond = batch.get("condition", None) + + if cond is not None and cond.shape[0] == 1: + cond = cond.repeat(audio.shape[0], 1) + + _, output = self.forward( + { + "audio": {"mixture": audio}, + "track": track, + "condition": cond, + } + ) # TODO: support track properly + + return output["audio"] + + def on_test_epoch_start(self) -> None: + self.attach_fader(force_reattach=True) + + def test_step( + self, batch: BatchedDataDict, batch_idx: int, dataloader_idx: int = 0 + ) -> Any: + curr_test_prefix = f"test{dataloader_idx}" + + # print(batch["audio"].keys()) + + if curr_test_prefix != self.test_prefix: + # print(f"Switching to test dataloader {dataloader_idx}") + if self.test_prefix is not None: + self._on_test_epoch_end() + self.test_prefix = curr_test_prefix + + with torch.inference_mode(): + _, output = self.predtest_step(batch, batch_idx, dataloader_idx) + # print(output) + self.update_metrics(batch, output, mode="test") + + return output + + def on_test_epoch_end(self) -> None: + self._on_test_epoch_end() + + def _on_test_epoch_end(self) -> None: + metric_dict = self.compute_metrics(mode="test") + self.log_dict_with_prefix( + metric_dict, self.test_prefix, prog_bar=True, add_dataloader_idx=False + ) + # self.logger.save() + # print(self.test_prefix, "Test metrics:", metric_dict) + self.reset_metrics() + + def predict_step( + self, + batch: BatchedDataDict, + batch_idx: int = 0, + dataloader_idx: int = 0, + include_track_name: Optional[bool] = None, + get_no_vox_combinations: bool = True, + get_residual: bool = False, + treat_batch_as_channels: bool = False, + fs: Optional[int] = None, + ) -> Any: + assert self.predict_output_path is not None + + batch_size = batch["audio"]["mixture"].shape[0] + + if include_track_name is None: + include_track_name = batch_size > 1 + + with torch.inference_mode(): + batch, output = self.predtest_step(batch, batch_idx, dataloader_idx) + print("Pred test finished...") + torch.cuda.empty_cache() + metric_dict = {} + + if get_residual: + mixture = batch["audio"]["mixture"] + extracted = sum([output["audio"][stem] for stem in output["audio"]]) + residual = mixture - extracted + print(extracted.shape, mixture.shape, residual.shape) + + output["audio"]["residual"] = residual + + if get_no_vox_combinations: + no_vox_stems = [ + stem for stem in output["audio"] if stem not in self._VOX_STEMS + ] + no_vox_combinations = chain.from_iterable( + combinations(no_vox_stems, r) for r in range(2, len(no_vox_stems) + 1) + ) + + for combination in no_vox_combinations: + combination_ = list(combination) + output["audio"]["+".join(combination_)] = sum( + [output["audio"][stem] for stem in combination_] + ) + + if treat_batch_as_channels: + for stem in output["audio"]: + output["audio"][stem] = output["audio"][stem].reshape( + 1, -1, output["audio"][stem].shape[-1] + ) + batch_size = 1 + + for b in range(batch_size): + print("!!", b) + for stem in output["audio"]: + print(f"Saving audio for {stem} to {self.predict_output_path}") + track_name = batch["track"][b].split("/")[-1] + + if batch.get("audio", {}).get(stem, None) is not None: + self.test_metrics[stem].reset() + metrics = self.test_metrics[stem]( + batch["audio"][stem][[b], ...], output["audio"][stem][[b], ...] 
+ ) + snr = metrics["snr"] + sisnr = metrics["sisnr"] + sdr = metrics["sdr"] + metric_dict[stem] = metrics + print( + track_name, + f"snr={snr:2.2f} dB", + f"sisnr={sisnr:2.2f}", + f"sdr={sdr:2.2f} dB", + ) + filename = f"{stem} - snr={snr:2.2f}dB - sdr={sdr:2.2f}dB.wav" + else: + filename = f"{stem}.wav" + + if include_track_name: + output_dir = os.path.join(self.predict_output_path, track_name) + else: + output_dir = self.predict_output_path + + os.makedirs(output_dir, exist_ok=True) + + if fs is None: + fs = self.fs + + ta.save( + os.path.join(output_dir, filename), + output["audio"][stem][b, ...].cpu(), + fs, + ) + + return metric_dict + + def get_stems( + self, + batch: BatchedDataDict, + batch_idx: int = 0, + dataloader_idx: int = 0, + include_track_name: Optional[bool] = None, + get_no_vox_combinations: bool = True, + get_residual: bool = False, + treat_batch_as_channels: bool = False, + fs: Optional[int] = None, + ) -> Any: + assert self.predict_output_path is not None + + batch_size = batch["audio"]["mixture"].shape[0] + + if include_track_name is None: + include_track_name = batch_size > 1 + + with torch.inference_mode(): + batch, output = self.predtest_step(batch, batch_idx, dataloader_idx) + torch.cuda.empty_cache() + metric_dict = {} + + if get_residual: + mixture = batch["audio"]["mixture"] + extracted = sum([output["audio"][stem] for stem in output["audio"]]) + residual = mixture - extracted + # print(extracted.shape, mixture.shape, residual.shape) + + output["audio"]["residual"] = residual + + if get_no_vox_combinations: + no_vox_stems = [ + stem for stem in output["audio"] if stem not in self._VOX_STEMS + ] + no_vox_combinations = chain.from_iterable( + combinations(no_vox_stems, r) for r in range(2, len(no_vox_stems) + 1) + ) + + for combination in no_vox_combinations: + combination_ = list(combination) + output["audio"]["+".join(combination_)] = sum( + [output["audio"][stem] for stem in combination_] + ) + + if treat_batch_as_channels: + for stem in output["audio"]: + output["audio"][stem] = output["audio"][stem].reshape( + 1, -1, output["audio"][stem].shape[-1] + ) + batch_size = 1 + + result = {} + for b in range(batch_size): + for stem in output["audio"]: + track_name = batch["track"][b].split("/")[-1] + + if batch.get("audio", {}).get(stem, None) is not None: + self.test_metrics[stem].reset() + metrics = self.test_metrics[stem]( + batch["audio"][stem][[b], ...], output["audio"][stem][[b], ...] 
+ ) + snr = metrics["snr"] + sisnr = metrics["sisnr"] + sdr = metrics["sdr"] + metric_dict[stem] = metrics + print( + track_name, + f"snr={snr:2.2f} dB", + f"sisnr={sisnr:2.2f}", + f"sdr={sdr:2.2f} dB", + ) + filename = f"{stem} - snr={snr:2.2f}dB - sdr={sdr:2.2f}dB.wav" + else: + filename = f"{stem}.wav" + + if include_track_name: + output_dir = os.path.join(self.predict_output_path, track_name) + else: + output_dir = self.predict_output_path + + os.makedirs(output_dir, exist_ok=True) + + if fs is None: + fs = self.fs + + result[stem] = output["audio"][stem][b, ...].cpu().numpy() + + return result + + def load_state_dict( + self, state_dict: Mapping[str, Any], strict: bool = False + ) -> Any: + + return super().load_state_dict(state_dict, strict=False) + + def set_predict_output_path(self, path: str) -> None: + self.predict_output_path = path + os.makedirs(self.predict_output_path, exist_ok=True) + + self.attach_fader() + + def attach_fader(self, force_reattach=False) -> None: + if self.fader is None or force_reattach: + self.fader = parse_fader_config(self.fader_config) + self.fader.to(self.device) + + def log_dict_with_prefix( + self, + dict_: Dict[str, torch.Tensor], + prefix: str, + batch_size: Optional[int] = None, + **kwargs: Any, + ) -> None: + self.log_dict( + {f"{prefix}/{k}": v for k, v in dict_.items()}, + batch_size=batch_size, + logger=True, + sync_dist=True, + **kwargs, + ) diff --git a/programs/music_separation_code/models/bandit/core/data/__init__.py b/programs/music_separation_code/models/bandit/core/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9d4d672bd3b6ad90a26e19ee6c26e02ee3be84c --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/__init__.py @@ -0,0 +1,2 @@ +from .dnr.datamodule import DivideAndRemasterDataModule +from .musdb.datamodule import MUSDB18DataModule diff --git a/programs/music_separation_code/models/bandit/core/data/_types.py b/programs/music_separation_code/models/bandit/core/data/_types.py new file mode 100644 index 0000000000000000000000000000000000000000..65e4607a558e6b6a65ee68de883b69e282f8fcf4 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/_types.py @@ -0,0 +1,17 @@ +from typing import Dict, Sequence, TypedDict + +import torch + +AudioDict = Dict[str, torch.Tensor] + +DataDict = TypedDict("DataDict", {"audio": AudioDict, "track": str}) + +BatchedDataDict = TypedDict( + "BatchedDataDict", {"audio": AudioDict, "track": Sequence[str]} +) + + +class DataDictWithLanguage(TypedDict): + audio: AudioDict + track: str + language: str diff --git a/programs/music_separation_code/models/bandit/core/data/augmentation.py b/programs/music_separation_code/models/bandit/core/data/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa2a9cfd1171524d67bd4bbb94a09599ea6d9e2 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/augmentation.py @@ -0,0 +1,102 @@ +from abc import ABC +from typing import Any, Dict, Union + +import torch +import torch_audiomentations as tam +from torch import nn + +from models.bandit.core.data._types import BatchedDataDict, DataDict + + +class BaseAugmentor(nn.Module, ABC): + def forward( + self, item: Union[DataDict, BatchedDataDict] + ) -> Union[DataDict, BatchedDataDict]: + raise NotImplementedError + + +class StemAugmentor(BaseAugmentor): + def __init__( + self, + audiomentations: Dict[str, Dict[str, Any]], + fix_clipping: bool = True, + scaler_margin: float = 0.5, + 
apply_both_default_and_common: bool = False, + ) -> None: + super().__init__() + + augmentations = {} + + self.has_default = "[default]" in audiomentations + self.has_common = "[common]" in audiomentations + self.apply_both_default_and_common = apply_both_default_and_common + + for stem in audiomentations: + if audiomentations[stem]["name"] == "Compose": + augmentations[stem] = getattr(tam, audiomentations[stem]["name"])( + [ + getattr(tam, aug["name"])(**aug["kwargs"]) + for aug in audiomentations[stem]["kwargs"]["transforms"] + ], + **audiomentations[stem]["kwargs"]["kwargs"], + ) + else: + augmentations[stem] = getattr(tam, audiomentations[stem]["name"])( + **audiomentations[stem]["kwargs"] + ) + + self.augmentations = nn.ModuleDict(augmentations) + self.fix_clipping = fix_clipping + self.scaler_margin = scaler_margin + + def check_and_fix_clipping( + self, item: Union[DataDict, BatchedDataDict] + ) -> Union[DataDict, BatchedDataDict]: + max_abs = [] + + for stem in item["audio"]: + max_abs.append(item["audio"][stem].abs().max().item()) + + if max(max_abs) > 1.0: + scaler = 1.0 / ( + max(max_abs) + + torch.rand((1,), device=item["audio"]["mixture"].device) + * self.scaler_margin + ) + + for stem in item["audio"]: + item["audio"][stem] *= scaler + + return item + + def forward( + self, item: Union[DataDict, BatchedDataDict] + ) -> Union[DataDict, BatchedDataDict]: + + for stem in item["audio"]: + if stem == "mixture": + continue + + if self.has_common: + item["audio"][stem] = self.augmentations["[common]"]( + item["audio"][stem] + ).samples + + if stem in self.augmentations: + item["audio"][stem] = self.augmentations[stem]( + item["audio"][stem] + ).samples + elif self.has_default: + if not self.has_common or self.apply_both_default_and_common: + item["audio"][stem] = self.augmentations["[default]"]( + item["audio"][stem] + ).samples + + item["audio"]["mixture"] = sum( + [item["audio"][stem] for stem in item["audio"] if stem != "mixture"] + ) # type: ignore[call-overload, assignment] + + if self.fix_clipping: + item = self.check_and_fix_clipping(item) + + return item diff --git a/programs/music_separation_code/models/bandit/core/data/augmented.py b/programs/music_separation_code/models/bandit/core/data/augmented.py new file mode 100644 index 0000000000000000000000000000000000000000..3c0524409bf99b009605989eba5d5f46f0560f2e --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/augmented.py @@ -0,0 +1,34 @@ +import warnings +from typing import Dict, Optional, Union + +import torch +from torch import nn +from torch.utils import data + + +class AugmentedDataset(data.Dataset): + def __init__( + self, + dataset: data.Dataset, + augmentation: nn.Module = nn.Identity(), + target_length: Optional[int] = None, + ) -> None: + warnings.warn( + "This class is no longer used. 
Attach augmentation to " + "the LightningSystem instead.", + DeprecationWarning, + ) + + self.dataset = dataset + self.augmentation = augmentation + + self.ds_length: int = len(dataset) # type: ignore[arg-type] + self.length = target_length if target_length is not None else self.ds_length + + def __getitem__(self, index: int) -> Dict[str, Union[str, Dict[str, torch.Tensor]]]: + item = self.dataset[index % self.ds_length] + item = self.augmentation(item) + return item + + def __len__(self) -> int: + return self.length diff --git a/programs/music_separation_code/models/bandit/core/data/base.py b/programs/music_separation_code/models/bandit/core/data/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c51d57f5d48836b788b3bcc708e3f8596461d56f --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/base.py @@ -0,0 +1,60 @@ +import os +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + +import numpy as np +import pedalboard as pb +import torch +import torchaudio as ta +from torch.utils import data + +from models.bandit.core.data._types import AudioDict, DataDict + + +class BaseSourceSeparationDataset(data.Dataset, ABC): + def __init__( + self, + split: str, + stems: List[str], + files: List[str], + data_path: str, + fs: int, + npy_memmap: bool, + recompute_mixture: bool, + ): + self.split = split + self.stems = stems + self.stems_no_mixture = [s for s in stems if s != "mixture"] + self.files = files + self.data_path = data_path + self.fs = fs + self.npy_memmap = npy_memmap + self.recompute_mixture = recompute_mixture + + @abstractmethod + def get_stem(self, *, stem: str, identifier: Dict[str, Any]) -> torch.Tensor: + raise NotImplementedError + + def _get_audio(self, stems, identifier: Dict[str, Any]): + audio = {} + for stem in stems: + audio[stem] = self.get_stem(stem=stem, identifier=identifier) + + return audio + + def get_audio(self, identifier: Dict[str, Any]) -> AudioDict: + + if self.recompute_mixture: + audio = self._get_audio(self.stems_no_mixture, identifier=identifier) + audio["mixture"] = self.compute_mixture(audio) + return audio + else: + return self._get_audio(self.stems, identifier=identifier) + + @abstractmethod + def get_identifier(self, index: int) -> Dict[str, Any]: + pass + + def compute_mixture(self, audio: AudioDict) -> torch.Tensor: + + return sum(audio[stem] for stem in audio if stem != "mixture") diff --git a/programs/music_separation_code/models/bandit/core/data/dnr/__init__.py b/programs/music_separation_code/models/bandit/core/data/dnr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/programs/music_separation_code/models/bandit/core/data/dnr/datamodule.py b/programs/music_separation_code/models/bandit/core/data/dnr/datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..2971d419d433e335668f9e52cf54afda55a48f88 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/dnr/datamodule.py @@ -0,0 +1,68 @@ +import os +from typing import Mapping, Optional + +import pytorch_lightning as pl + +from .dataset import ( + DivideAndRemasterDataset, + DivideAndRemasterDeterministicChunkDataset, + DivideAndRemasterRandomChunkDataset, + DivideAndRemasterRandomChunkDatasetWithSpeechReverb, +) + + +def DivideAndRemasterDataModule( + data_root: str = "$DATA_ROOT/DnR/v2", + batch_size: int = 2, + num_workers: int = 8, + train_kwargs: Optional[Mapping] = None, + val_kwargs: 
Optional[Mapping] = None, + test_kwargs: Optional[Mapping] = None, + datamodule_kwargs: Optional[Mapping] = None, + use_speech_reverb: bool = False, + # augmentor=None +) -> pl.LightningDataModule: + if train_kwargs is None: + train_kwargs = {} + + if val_kwargs is None: + val_kwargs = {} + + if test_kwargs is None: + test_kwargs = {} + + if datamodule_kwargs is None: + datamodule_kwargs = {} + + if num_workers is None: + num_workers = os.cpu_count() + + if num_workers is None: + num_workers = 32 + + num_workers = min(num_workers, 64) + + if use_speech_reverb: + train_cls = DivideAndRemasterRandomChunkDatasetWithSpeechReverb + else: + train_cls = DivideAndRemasterRandomChunkDataset + + train_dataset = train_cls(data_root, "train", **train_kwargs) + + # if augmentor is not None: + # train_dataset = AugmentedDataset(train_dataset, augmentor) + + datamodule = pl.LightningDataModule.from_datasets( + train_dataset=train_dataset, + val_dataset=DivideAndRemasterDeterministicChunkDataset( + data_root, "val", **val_kwargs + ), + test_dataset=DivideAndRemasterDataset(data_root, "test", **test_kwargs), + batch_size=batch_size, + num_workers=num_workers, + **datamodule_kwargs + ) + + datamodule.predict_dataloader = datamodule.test_dataloader # type: ignore[method-assign] + + return datamodule diff --git a/programs/music_separation_code/models/bandit/core/data/dnr/dataset.py b/programs/music_separation_code/models/bandit/core/data/dnr/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..00142c7b96c4d04da138005055acda6a99aa1df1 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/dnr/dataset.py @@ -0,0 +1,366 @@ +import os +from abc import ABC +from typing import Any, Dict, List, Optional + +import numpy as np +import pedalboard as pb +import torch +import torchaudio as ta +from torch.utils import data + +from models.bandit.core.data._types import AudioDict, DataDict +from models.bandit.core.data.base import BaseSourceSeparationDataset + + +class DivideAndRemasterBaseDataset(BaseSourceSeparationDataset, ABC): + ALLOWED_STEMS = ["mixture", "speech", "music", "effects", "mne"] + STEM_NAME_MAP = { + "mixture": "mix", + "speech": "speech", + "music": "music", + "effects": "sfx", + } + SPLIT_NAME_MAP = {"train": "tr", "val": "cv", "test": "tt"} + + FULL_TRACK_LENGTH_SECOND = 60 + FULL_TRACK_LENGTH_SAMPLES = FULL_TRACK_LENGTH_SECOND * 44100 + + def __init__( + self, + split: str, + stems: List[str], + files: List[str], + data_path: str, + fs: int = 44100, + npy_memmap: bool = True, + recompute_mixture: bool = False, + ) -> None: + super().__init__( + split=split, + stems=stems, + files=files, + data_path=data_path, + fs=fs, + npy_memmap=npy_memmap, + recompute_mixture=recompute_mixture, + ) + + def get_stem(self, *, stem: str, identifier: Dict[str, Any]) -> torch.Tensor: + + if stem == "mne": + return self.get_stem(stem="music", identifier=identifier) + self.get_stem( + stem="effects", identifier=identifier + ) + + track = identifier["track"] + path = os.path.join(self.data_path, track) + + if self.npy_memmap: + audio = np.load( + os.path.join(path, f"{self.STEM_NAME_MAP[stem]}.npy"), mmap_mode="r" + ) + else: + # noinspection PyUnresolvedReferences + audio, _ = ta.load(os.path.join(path, f"{self.STEM_NAME_MAP[stem]}.wav")) + + return audio + + def get_identifier(self, index): + return dict(track=self.files[index]) + + def __getitem__(self, index: int) -> DataDict: + identifier = self.get_identifier(index) + audio = self.get_audio(identifier) + + return 
{"audio": audio, "track": f"{self.split}/{identifier['track']}"} + + +class DivideAndRemasterDataset(DivideAndRemasterBaseDataset): + def __init__( + self, + data_root: str, + split: str, + stems: Optional[List[str]] = None, + fs: int = 44100, + npy_memmap: bool = True, + ) -> None: + + if stems is None: + stems = self.ALLOWED_STEMS + self.stems = stems + + data_path = os.path.join(data_root, self.SPLIT_NAME_MAP[split]) + + files = sorted(os.listdir(data_path)) + files = [ + f + for f in files + if (not f.startswith(".")) and os.path.isdir(os.path.join(data_path, f)) + ] + # pprint(list(enumerate(files))) + if split == "train": + assert len(files) == 3406, len(files) + elif split == "val": + assert len(files) == 487, len(files) + elif split == "test": + assert len(files) == 973, len(files) + + self.n_tracks = len(files) + + super().__init__( + data_path=data_path, + split=split, + stems=stems, + files=files, + fs=fs, + npy_memmap=npy_memmap, + ) + + def __len__(self) -> int: + return self.n_tracks + + +class DivideAndRemasterRandomChunkDataset(DivideAndRemasterBaseDataset): + def __init__( + self, + data_root: str, + split: str, + target_length: int, + chunk_size_second: float, + stems: Optional[List[str]] = None, + fs: int = 44100, + npy_memmap: bool = True, + ) -> None: + + if stems is None: + stems = self.ALLOWED_STEMS + self.stems = stems + + data_path = os.path.join(data_root, self.SPLIT_NAME_MAP[split]) + + files = sorted(os.listdir(data_path)) + files = [ + f + for f in files + if (not f.startswith(".")) and os.path.isdir(os.path.join(data_path, f)) + ] + + if split == "train": + assert len(files) == 3406, len(files) + elif split == "val": + assert len(files) == 487, len(files) + elif split == "test": + assert len(files) == 973, len(files) + + self.n_tracks = len(files) + + self.target_length = target_length + self.chunk_size = int(chunk_size_second * fs) + + super().__init__( + data_path=data_path, + split=split, + stems=stems, + files=files, + fs=fs, + npy_memmap=npy_memmap, + ) + + def __len__(self) -> int: + return self.target_length + + def get_identifier(self, index): + return super().get_identifier(index % self.n_tracks) + + def get_stem( + self, + *, + stem: str, + identifier: Dict[str, Any], + chunk_here: bool = False, + ) -> torch.Tensor: + + stem = super().get_stem(stem=stem, identifier=identifier) + + if chunk_here: + start = np.random.randint( + 0, self.FULL_TRACK_LENGTH_SAMPLES - self.chunk_size + ) + end = start + self.chunk_size + + stem = stem[:, start:end] + + return stem + + def __getitem__(self, index: int) -> DataDict: + identifier = self.get_identifier(index) + # self.index_lock = index + audio = self.get_audio(identifier) + # self.index_lock = None + + start = np.random.randint(0, self.FULL_TRACK_LENGTH_SAMPLES - self.chunk_size) + end = start + self.chunk_size + + audio = {k: v[:, start:end] for k, v in audio.items()} + + return {"audio": audio, "track": f"{self.split}/{identifier['track']}"} + + +class DivideAndRemasterDeterministicChunkDataset(DivideAndRemasterBaseDataset): + def __init__( + self, + data_root: str, + split: str, + chunk_size_second: float, + hop_size_second: float, + stems: Optional[List[str]] = None, + fs: int = 44100, + npy_memmap: bool = True, + ) -> None: + + if stems is None: + stems = self.ALLOWED_STEMS + self.stems = stems + + data_path = os.path.join(data_root, self.SPLIT_NAME_MAP[split]) + + files = sorted(os.listdir(data_path)) + files = [ + f + for f in files + if (not f.startswith(".")) and os.path.isdir(os.path.join(data_path, 
f)) + ] + # pprint(list(enumerate(files))) + if split == "train": + assert len(files) == 3406, len(files) + elif split == "val": + assert len(files) == 487, len(files) + elif split == "test": + assert len(files) == 973, len(files) + + self.n_tracks = len(files) + + self.chunk_size = int(chunk_size_second * fs) + self.hop_size = int(hop_size_second * fs) + self.n_chunks_per_track = int( + (self.FULL_TRACK_LENGTH_SECOND - chunk_size_second) / hop_size_second + ) + + self.length = self.n_tracks * self.n_chunks_per_track + + super().__init__( + data_path=data_path, + split=split, + stems=stems, + files=files, + fs=fs, + npy_memmap=npy_memmap, + ) + + def get_identifier(self, index): + return super().get_identifier(index % self.n_tracks) + + def __len__(self) -> int: + return self.length + + def __getitem__(self, item: int) -> DataDict: + + index = item % self.n_tracks + chunk = item // self.n_tracks + + data_ = super().__getitem__(index) + + audio = data_["audio"] + + start = chunk * self.hop_size + end = start + self.chunk_size + + for stem in self.stems: + data_["audio"][stem] = audio[stem][:, start:end] + + return data_ + + +class DivideAndRemasterRandomChunkDatasetWithSpeechReverb( + DivideAndRemasterRandomChunkDataset +): + def __init__( + self, + data_root: str, + split: str, + target_length: int, + chunk_size_second: float, + stems: Optional[List[str]] = None, + fs: int = 44100, + npy_memmap: bool = True, + ) -> None: + + if stems is None: + stems = self.ALLOWED_STEMS + + stems_no_mixture = [s for s in stems if s != "mixture"] + + super().__init__( + data_root=data_root, + split=split, + target_length=target_length, + chunk_size_second=chunk_size_second, + stems=stems_no_mixture, + fs=fs, + npy_memmap=npy_memmap, + ) + + self.stems = stems + self.stems_no_mixture = stems_no_mixture + + def __getitem__(self, index: int) -> DataDict: + + data_ = super().__getitem__(index) + + dry = data_["audio"]["speech"][:] + n_samples = dry.shape[-1] + + wet_level = np.random.rand() + + speech = pb.Reverb( + room_size=np.random.rand(), + damping=np.random.rand(), + wet_level=wet_level, + dry_level=(1 - wet_level), + width=np.random.rand(), + ).process(dry, self.fs, buffer_size=8192 * 4)[..., :n_samples] + + data_["audio"]["speech"] = speech + + data_["audio"]["mixture"] = sum( + [data_["audio"][s] for s in self.stems_no_mixture] + ) + + return data_ + + def __len__(self) -> int: + return super().__len__() + + +if __name__ == "__main__": + + from pprint import pprint + from tqdm import tqdm + + for split_ in ["train", "val", "test"]: + ds = DivideAndRemasterRandomChunkDatasetWithSpeechReverb( + data_root="$DATA_ROOT/DnR/v2np", + split=split_, + target_length=100, + chunk_size_second=6.0, + ) + + print(split_, len(ds)) + + for track_ in tqdm(ds): # type: ignore + pprint(track_) + track_["audio"] = {k: v.shape for k, v in track_["audio"].items()} + pprint(track_) + # break + + break diff --git a/programs/music_separation_code/models/bandit/core/data/dnr/preprocess.py b/programs/music_separation_code/models/bandit/core/data/dnr/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..18d68b18fbe963647df1253190625ea639035572 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/dnr/preprocess.py @@ -0,0 +1,51 @@ +import glob +import os +from typing import Tuple + +import numpy as np +import torchaudio as ta +from tqdm.contrib.concurrent import process_map + + +def process_one(inputs: Tuple[str, str, int]) -> None: + infile, outfile, target_fs = inputs + + dir = 
os.path.dirname(outfile) + os.makedirs(dir, exist_ok=True) + + data, fs = ta.load(infile) + + if fs != target_fs: + data = ta.functional.resample( + data, fs, target_fs, resampling_method="sinc_interp_kaiser" + ) + fs = target_fs + + data = data.numpy() + data = data.astype(np.float32) + + if os.path.exists(outfile): + data_ = np.load(outfile) + if np.allclose(data, data_): + return + + np.save(outfile, data) + + +def preprocess(data_path: str, output_path: str, fs: int) -> None: + files = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True) + print(files) + outfiles = [ + f.replace(data_path, output_path).replace(".wav", ".npy") for f in files + ] + + os.makedirs(output_path, exist_ok=True) + inputs = list(zip(files, outfiles, [fs] * len(files))) + + process_map(process_one, inputs, chunksize=32) + + +if __name__ == "__main__": + import fire + + fire.Fire() diff --git a/programs/music_separation_code/models/bandit/core/data/musdb/__init__.py b/programs/music_separation_code/models/bandit/core/data/musdb/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/programs/music_separation_code/models/bandit/core/data/musdb/datamodule.py b/programs/music_separation_code/models/bandit/core/data/musdb/datamodule.py new file mode 100644 index 0000000000000000000000000000000000000000..7b3c25e5a3fba968753e6296889843fb87c30d22 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/musdb/datamodule.py @@ -0,0 +1,75 @@ +import os.path +from typing import Mapping, Optional + +import pytorch_lightning as pl + +from models.bandit.core.data.musdb.dataset import ( + MUSDB18BaseDataset, + MUSDB18FullTrackDataset, + MUSDB18SadDataset, + MUSDB18SadOnTheFlyAugmentedDataset, +) + + +def MUSDB18DataModule( + data_root: str = "$DATA_ROOT/MUSDB18/HQ", + target_stem: str = "vocals", + batch_size: int = 2, + num_workers: int = 8, + train_kwargs: Optional[Mapping] = None, + val_kwargs: Optional[Mapping] = None, + test_kwargs: Optional[Mapping] = None, + datamodule_kwargs: Optional[Mapping] = None, + use_on_the_fly: bool = True, + npy_memmap: bool = True, +) -> pl.LightningDataModule: + if train_kwargs is None: + train_kwargs = {} + + if val_kwargs is None: + val_kwargs = {} + + if test_kwargs is None: + test_kwargs = {} + + if datamodule_kwargs is None: + datamodule_kwargs = {} + + train_dataset: MUSDB18BaseDataset + + if use_on_the_fly: + train_dataset = MUSDB18SadOnTheFlyAugmentedDataset( + data_root=os.path.join(data_root, "saded-np"), + split="train", + target_stem=target_stem, + **train_kwargs + ) + else: + train_dataset = MUSDB18SadDataset( + data_root=os.path.join(data_root, "saded-np"), + split="train", + target_stem=target_stem, + **train_kwargs + ) + + datamodule = pl.LightningDataModule.from_datasets( + train_dataset=train_dataset, + val_dataset=MUSDB18SadDataset( + data_root=os.path.join(data_root, "saded-np"), + split="val", + target_stem=target_stem, + **val_kwargs + ), + test_dataset=MUSDB18FullTrackDataset( + data_root=os.path.join(data_root, "canonical"), split="test", **test_kwargs + ), + batch_size=batch_size, + num_workers=num_workers, + **datamodule_kwargs + ) + + datamodule.predict_dataloader = ( # type: ignore[method-assign] + datamodule.test_dataloader + ) + + return datamodule diff --git a/programs/music_separation_code/models/bandit/core/data/musdb/dataset.py b/programs/music_separation_code/models/bandit/core/data/musdb/dataset.py new file mode 100644 index 
0000000000000000000000000000000000000000..f66319f03c7ab57c479ec9918b5d0c05ded0a646 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/musdb/dataset.py @@ -0,0 +1,273 @@ +import os +from abc import ABC +from typing import List, Optional, Tuple + +import numpy as np +import torch +import torchaudio as ta +from torch.utils import data + +from models.bandit.core.data._types import AudioDict, DataDict +from models.bandit.core.data.base import BaseSourceSeparationDataset + + +class MUSDB18BaseDataset(BaseSourceSeparationDataset, ABC): + + ALLOWED_STEMS = ["mixture", "vocals", "bass", "drums", "other"] + + def __init__( + self, + split: str, + stems: List[str], + files: List[str], + data_path: str, + fs: int = 44100, + npy_memmap=False, + ) -> None: + super().__init__( + split=split, + stems=stems, + files=files, + data_path=data_path, + fs=fs, + npy_memmap=npy_memmap, + recompute_mixture=False, + ) + + def get_stem(self, *, stem: str, identifier) -> torch.Tensor: + track = identifier["track"] + path = os.path.join(self.data_path, track) + # noinspection PyUnresolvedReferences + + if self.npy_memmap: + audio = np.load(os.path.join(path, f"{stem}.wav.npy"), mmap_mode="r") + else: + audio, _ = ta.load(os.path.join(path, f"{stem}.wav")) + + return audio + + def get_identifier(self, index): + return dict(track=self.files[index]) + + def __getitem__(self, index: int) -> DataDict: + identifier = self.get_identifier(index) + audio = self.get_audio(identifier) + + return {"audio": audio, "track": f"{self.split}/{identifier['track']}"} + + +class MUSDB18FullTrackDataset(MUSDB18BaseDataset): + + N_TRAIN_TRACKS = 100 + N_TEST_TRACKS = 50 + VALIDATION_FILES = [ + "Actions - One Minute Smile", + "Clara Berry And Wooldog - Waltz For My Victims", + "Johnny Lokke - Promises & Lies", + "Patrick Talbot - A Reason To Leave", + "Triviul - Angelsaint", + "Alexander Ross - Goodbye Bolero", + "Fergessen - Nos Palpitants", + "Leaf - Summerghost", + "Skelpolu - Human Mistakes", + "Young Griffo - Pennies", + "ANiMAL - Rockshow", + "James May - On The Line", + "Meaxic - Take A Step", + "Traffic Experiment - Sirens", + ] + + def __init__( + self, data_root: str, split: str, stems: Optional[List[str]] = None + ) -> None: + + if stems is None: + stems = self.ALLOWED_STEMS + self.stems = stems + + if split == "test": + subset = "test" + elif split in ["train", "val"]: + subset = "train" + else: + raise NameError + + data_path = os.path.join(data_root, subset) + + files = sorted(os.listdir(data_path)) + files = [f for f in files if not f.startswith(".")] + # pprint(list(enumerate(files))) + if subset == "train": + assert len(files) == 100, len(files) + if split == "train": + files = [f for f in files if f not in self.VALIDATION_FILES] + assert len(files) == 100 - len(self.VALIDATION_FILES) + else: + files = [f for f in files if f in self.VALIDATION_FILES] + assert len(files) == len(self.VALIDATION_FILES) + else: + split = "test" + assert len(files) == 50 + + self.n_tracks = len(files) + + super().__init__(data_path=data_path, split=split, stems=stems, files=files) + + def __len__(self) -> int: + return self.n_tracks + + +class MUSDB18SadDataset(MUSDB18BaseDataset): + def __init__( + self, + data_root: str, + split: str, + target_stem: str, + stems: Optional[List[str]] = None, + target_length: Optional[int] = None, + npy_memmap=False, + ) -> None: + + if stems is None: + stems = self.ALLOWED_STEMS + + data_path = os.path.join(data_root, target_stem, split) + + files = sorted(os.listdir(data_path)) + files 
= [f for f in files if not f.startswith(".")] + + super().__init__( + data_path=data_path, + split=split, + stems=stems, + files=files, + npy_memmap=npy_memmap, + ) + self.n_segments = len(files) + self.target_stem = target_stem + self.target_length = ( + target_length if target_length is not None else self.n_segments + ) + + def __len__(self) -> int: + return self.target_length + + def __getitem__(self, index: int) -> DataDict: + + index = index % self.n_segments + + return super().__getitem__(index) + + def get_identifier(self, index): + return super().get_identifier(index % self.n_segments) + + +class MUSDB18SadOnTheFlyAugmentedDataset(MUSDB18SadDataset): + def __init__( + self, + data_root: str, + split: str, + target_stem: str, + stems: Optional[List[str]] = None, + target_length: int = 20000, + apply_probability: Optional[float] = None, + chunk_size_second: float = 3.0, + random_scale_range_db: Tuple[float, float] = (-10, 10), + drop_probability: float = 0.1, + rescale: bool = True, + ) -> None: + super().__init__(data_root, split, target_stem, stems) + + if apply_probability is None: + apply_probability = (target_length - self.n_segments) / target_length + + self.apply_probability = apply_probability + self.drop_probability = drop_probability + self.chunk_size_second = chunk_size_second + self.random_scale_range_db = random_scale_range_db + self.rescale = rescale + + self.chunk_size_sample = int(self.chunk_size_second * self.fs) + self.target_length = target_length + + def __len__(self) -> int: + return self.target_length + + def __getitem__(self, index: int) -> DataDict: + + index = index % self.n_segments + + # if np.random.rand() > self.apply_probability: + # return super().__getitem__(index) + + audio = {} + identifier = self.get_identifier(index) + + # assert self.target_stem in self.stems_no_mixture + for stem in self.stems_no_mixture: + if stem == self.target_stem: + identifier_ = identifier + else: + if np.random.rand() < self.apply_probability: + index_ = np.random.randint(self.n_segments) + identifier_ = self.get_identifier(index_) + else: + identifier_ = identifier + + audio[stem] = self.get_stem(stem=stem, identifier=identifier_) + + # if stem == self.target_stem: + + if self.chunk_size_sample < audio[stem].shape[-1]: + chunk_start = np.random.randint( + audio[stem].shape[-1] - self.chunk_size_sample + ) + else: + chunk_start = 0 + + if np.random.rand() < self.drop_probability: + # db_scale = "-inf" + linear_scale = 0.0 + else: + db_scale = np.random.uniform(*self.random_scale_range_db) + linear_scale = np.power(10, db_scale / 20) + # db_scale = f"{db_scale:+2.1f}" + # print(linear_scale) + audio[stem][..., chunk_start : chunk_start + self.chunk_size_sample] = ( + linear_scale + * audio[stem][..., chunk_start : chunk_start + self.chunk_size_sample] + ) + + audio["mixture"] = self.compute_mixture(audio) + + if self.rescale: + max_abs_val = max( + [torch.max(torch.abs(audio[stem])) for stem in self.stems] + ) # type: ignore[type-var] + if max_abs_val > 1: + audio = {k: v / max_abs_val for k, v in audio.items()} + + track = identifier["track"] + + return {"audio": audio, "track": f"{self.split}/{track}"} + + +# if __name__ == "__main__": +# +# from pprint import pprint +# from tqdm import tqdm +# +# for split_ in ["train", "val", "test"]: +# ds = MUSDB18SadOnTheFlyAugmentedDataset( +# data_root="$DATA_ROOT/MUSDB18/HQ/saded", +# split=split_, +# target_stem="vocals" +# ) +# +# print(split_, len(ds)) +# +# for track_ in tqdm(ds): +# track_["audio"] = { +# k: v.shape for k, v 
in track_["audio"].items() +# } +# pprint(track_) diff --git a/programs/music_separation_code/models/bandit/core/data/musdb/preprocess.py b/programs/music_separation_code/models/bandit/core/data/musdb/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..bbc02b14b89b1b48bc8fa38f1f06d1944e7616e6 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/musdb/preprocess.py @@ -0,0 +1,226 @@ +import glob +import os + +import numpy as np +import torch +import torchaudio as ta +from torch import nn +from torch.nn import functional as F +from tqdm.contrib.concurrent import process_map + +from core.data._types import DataDict +from core.data.musdb.dataset import MUSDB18FullTrackDataset +import pyloudnorm as pyln + + +class SourceActivityDetector(nn.Module): + def __init__( + self, + analysis_stem: str, + output_path: str, + fs: int = 44100, + segment_length_second: float = 6.0, + hop_length_second: float = 3.0, + n_chunks: int = 10, + chunk_epsilon: float = 1e-5, + energy_threshold_quantile: float = 0.15, + segment_epsilon: float = 1e-3, + salient_proportion_threshold: float = 0.5, + target_lufs: float = -24, + ) -> None: + super().__init__() + + self.fs = fs + self.segment_length = int(segment_length_second * self.fs) + self.hop_length = int(hop_length_second * self.fs) + self.n_chunks = n_chunks + assert self.segment_length % self.n_chunks == 0 + self.chunk_size = self.segment_length // self.n_chunks + self.chunk_epsilon = chunk_epsilon + self.energy_threshold_quantile = energy_threshold_quantile + self.segment_epsilon = segment_epsilon + self.salient_proportion_threshold = salient_proportion_threshold + self.analysis_stem = analysis_stem + + self.meter = pyln.Meter(self.fs) + self.target_lufs = target_lufs + + self.output_path = output_path + + def forward(self, data: DataDict) -> None: + + stem_ = self.analysis_stem if (self.analysis_stem != "none") else "mixture" + + x = data["audio"][stem_] + + xnp = x.numpy() + loudness = self.meter.integrated_loudness(xnp.T) + + for stem in data["audio"]: + s = data["audio"][stem] + s = pyln.normalize.loudness(s.numpy().T, loudness, self.target_lufs).T + s = torch.as_tensor(s) + data["audio"][stem] = s + + if x.ndim == 3: + assert x.shape[0] == 1 + x = x[0] + + n_chan, n_samples = x.shape + + n_segments = ( + int(np.ceil((n_samples - self.segment_length) / self.hop_length)) + 1 + ) + + segments = torch.zeros((n_segments, n_chan, self.segment_length)) + for i in range(n_segments): + start = i * self.hop_length + end = start + self.segment_length + end = min(end, n_samples) + + xseg = x[:, start:end] + + if end - start < self.segment_length: + xseg = F.pad( + xseg, pad=(0, self.segment_length - (end - start)), value=torch.nan + ) + + segments[i, :, :] = xseg + + chunks = segments.reshape((n_segments, n_chan, self.n_chunks, self.chunk_size)) + + if self.analysis_stem != "none": + chunk_energies = torch.mean(torch.square(chunks), dim=(1, 3)) + chunk_energies = torch.nan_to_num(chunk_energies, nan=0) + chunk_energies[chunk_energies == 0] = self.chunk_epsilon + + energy_threshold = torch.nanquantile( + chunk_energies, q=self.energy_threshold_quantile + ) + + if energy_threshold < self.segment_epsilon: + energy_threshold = self.segment_epsilon # type: ignore[assignment] + + chunks_above_threshold = chunk_energies > energy_threshold + n_chunks_above_threshold = torch.mean( + chunks_above_threshold.to(torch.float), dim=-1 + ) + + segment_above_threshold = ( + n_chunks_above_threshold > self.salient_proportion_threshold 
+ ) + + if torch.sum(segment_above_threshold) == 0: + return + + else: + segment_above_threshold = torch.ones((n_segments,)) + + for i in range(n_segments): + if not segment_above_threshold[i]: + continue + + outpath = os.path.join( + self.output_path, + self.analysis_stem, + f"{data['track']} - {self.analysis_stem}{i:03d}", + ) + os.makedirs(outpath, exist_ok=True) + + for stem in data["audio"]: + if stem == self.analysis_stem: + segment = torch.nan_to_num(segments[i, :, :], nan=0) + else: + start = i * self.hop_length + end = start + self.segment_length + end = min(n_samples, end) + + segment = data["audio"][stem][:, start:end] + + if end - start < self.segment_length: + segment = F.pad( + segment, (0, self.segment_length - (end - start)) + ) + + assert segment.shape[-1] == self.segment_length, segment.shape + + # ta.save(os.path.join(outpath, f"{stem}.wav"), segment, self.fs) + + np.save(os.path.join(outpath, f"{stem}.wav"), segment) + + +def preprocess( + analysis_stem: str, + output_path: str = "/data/MUSDB18/HQ/saded-np", + fs: int = 44100, + segment_length_second: float = 6.0, + hop_length_second: float = 3.0, + n_chunks: int = 10, + chunk_epsilon: float = 1e-5, + energy_threshold_quantile: float = 0.15, + segment_epsilon: float = 1e-3, + salient_proportion_threshold: float = 0.5, +) -> None: + + sad = SourceActivityDetector( + analysis_stem=analysis_stem, + output_path=output_path, + fs=fs, + segment_length_second=segment_length_second, + hop_length_second=hop_length_second, + n_chunks=n_chunks, + chunk_epsilon=chunk_epsilon, + energy_threshold_quantile=energy_threshold_quantile, + segment_epsilon=segment_epsilon, + salient_proportion_threshold=salient_proportion_threshold, + ) + + for split in ["train", "val", "test"]: + ds = MUSDB18FullTrackDataset( + data_root="/data/MUSDB18/HQ/canonical", + split=split, + ) + + tracks = [] + for i, track in enumerate(tqdm(ds, total=len(ds))): + if i % 32 == 0 and tracks: + process_map(sad, tracks, max_workers=8) + tracks = [] + tracks.append(track) + process_map(sad, tracks, max_workers=8) + + +def loudness_norm_one(inputs): + infile, outfile, target_lufs = inputs + + audio, fs = ta.load(infile) + audio = audio.mean(dim=0, keepdim=True).numpy().T + + meter = pyln.Meter(fs) + loudness = meter.integrated_loudness(audio) + audio = pyln.normalize.loudness(audio, loudness, target_lufs) + + os.makedirs(os.path.dirname(outfile), exist_ok=True) + np.save(outfile, audio.T) + + +def loudness_norm( + data_path: str, + # output_path: str, + target_lufs=-17.0, +): + files = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True) + + outfiles = [f.replace(".wav", ".npy").replace("saded", "saded-np") for f in files] + + files = [(f, o, target_lufs) for f, o in zip(files, outfiles)] + + process_map(loudness_norm_one, files, chunksize=2) + + +if __name__ == "__main__": + + from tqdm import tqdm + import fire + + fire.Fire() diff --git a/programs/music_separation_code/models/bandit/core/data/musdb/validation.yaml b/programs/music_separation_code/models/bandit/core/data/musdb/validation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f8752478d285d1d13d5e842225af1de95cae57a --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/data/musdb/validation.yaml @@ -0,0 +1,15 @@ +validation: + - 'Actions - One Minute Smile' + - 'Clara Berry And Wooldog - Waltz For My Victims' + - 'Johnny Lokke - Promises & Lies' + - 'Patrick Talbot - A Reason To Leave' + - 'Triviul - Angelsaint' + - 'Alexander Ross - Goodbye Bolero' + - 
'Fergessen - Nos Palpitants' + - 'Leaf - Summerghost' + - 'Skelpolu - Human Mistakes' + - 'Young Griffo - Pennies' + - 'ANiMAL - Rockshow' + - 'James May - On The Line' + - 'Meaxic - Take A Step' + - 'Traffic Experiment - Sirens' \ No newline at end of file diff --git a/programs/music_separation_code/models/bandit/core/loss/__init__.py b/programs/music_separation_code/models/bandit/core/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..993be521fa7ab8f06a2a012beabdb9fdd6cd0a80 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/loss/__init__.py @@ -0,0 +1,8 @@ +from ._multistem import MultiStemWrapperFromConfig +from ._timefreq import ( + ReImL1Loss, + ReImL2Loss, + TimeFreqL1Loss, + TimeFreqL2Loss, + TimeFreqSignalNoisePNormRatioLoss, +) diff --git a/programs/music_separation_code/models/bandit/core/loss/_complex.py b/programs/music_separation_code/models/bandit/core/loss/_complex.py new file mode 100644 index 0000000000000000000000000000000000000000..68c82f204709d07cba013f1582ca985bcf66dde6 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/loss/_complex.py @@ -0,0 +1,27 @@ +from typing import Any + +import torch +from torch import nn +from torch.nn.modules import loss as _loss +from torch.nn.modules.loss import _Loss + + +class ReImLossWrapper(_Loss): + def __init__(self, module: _Loss) -> None: + super().__init__() + self.module = module + + def forward(self, preds: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + return self.module(torch.view_as_real(preds), torch.view_as_real(target)) + + +class ReImL1Loss(ReImLossWrapper): + def __init__(self, **kwargs: Any) -> None: + l1_loss = _loss.L1Loss(**kwargs) + super().__init__(module=(l1_loss)) + + +class ReImL2Loss(ReImLossWrapper): + def __init__(self, **kwargs: Any) -> None: + l2_loss = _loss.MSELoss(**kwargs) + super().__init__(module=(l2_loss)) diff --git a/programs/music_separation_code/models/bandit/core/loss/_multistem.py b/programs/music_separation_code/models/bandit/core/loss/_multistem.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c4a4f776a318b4450cc98c820a057983e2f9a3 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/loss/_multistem.py @@ -0,0 +1,43 @@ +from typing import Any, Dict + +import torch +from asteroid import losses as asteroid_losses +from torch import nn +from torch.nn.modules.loss import _Loss + +from . 
import snr + + +def parse_loss(name: str, kwargs: Dict[str, Any]) -> _Loss: + + for module in [nn.modules.loss, snr, asteroid_losses, asteroid_losses.sdr]: + if name in module.__dict__: + return module.__dict__[name](**kwargs) + + raise NameError + + +class MultiStemWrapper(_Loss): + def __init__(self, module: _Loss, modality: str = "audio") -> None: + super().__init__() + self.loss = module + self.modality = modality + + def forward( + self, + preds: Dict[str, Dict[str, torch.Tensor]], + target: Dict[str, Dict[str, torch.Tensor]], + ) -> torch.Tensor: + loss = { + stem: self.loss(preds[self.modality][stem], target[self.modality][stem]) + for stem in preds[self.modality] + if stem in target[self.modality] + } + + return sum(list(loss.values())) + + +class MultiStemWrapperFromConfig(MultiStemWrapper): + def __init__(self, name: str, kwargs: Any, modality: str = "audio") -> None: + loss = parse_loss(name, kwargs) + super().__init__(module=loss, modality=modality) diff --git a/programs/music_separation_code/models/bandit/core/loss/_timefreq.py b/programs/music_separation_code/models/bandit/core/loss/_timefreq.py new file mode 100644 index 0000000000000000000000000000000000000000..96080e85b617d74f839a176ba07f5d5150d28822 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/loss/_timefreq.py @@ -0,0 +1,95 @@ +from typing import Any, Dict, Optional + +import torch +from torch import nn +from torch.nn.modules.loss import _Loss + +from models.bandit.core.loss._multistem import MultiStemWrapper +from models.bandit.core.loss._complex import ReImL1Loss, ReImL2Loss, ReImLossWrapper +from models.bandit.core.loss.snr import SignalNoisePNormRatio + + +class TimeFreqWrapper(_Loss): + def __init__( + self, + time_module: _Loss, + freq_module: Optional[_Loss] = None, + time_weight: float = 1.0, + freq_weight: float = 1.0, + multistem: bool = True, + ) -> None: + super().__init__() + + if freq_module is None: + freq_module = time_module + + if multistem: + time_module = MultiStemWrapper(time_module, modality="audio") + freq_module = MultiStemWrapper(freq_module, modality="spectrogram") + + self.time_module = time_module + self.freq_module = freq_module + + self.time_weight = time_weight + self.freq_weight = freq_weight + + # TODO: add better type hints + def forward(self, preds: Any, target: Any) -> torch.Tensor: + + return self.time_weight * self.time_module( + preds, target + ) + self.freq_weight * self.freq_module(preds, target) + + +class TimeFreqL1Loss(TimeFreqWrapper): + def __init__( + self, + time_weight: float = 1.0, + freq_weight: float = 1.0, + tkwargs: Optional[Dict[str, Any]] = None, + fkwargs: Optional[Dict[str, Any]] = None, + multistem: bool = True, + ) -> None: + if tkwargs is None: + tkwargs = {} + if fkwargs is None: + fkwargs = {} + time_module = nn.L1Loss(**tkwargs) + freq_module = ReImL1Loss(**fkwargs) + super().__init__(time_module, freq_module, time_weight, freq_weight, multistem) + + +class TimeFreqL2Loss(TimeFreqWrapper): + def __init__( + self, + time_weight: float = 1.0, + freq_weight: float = 1.0, + tkwargs: Optional[Dict[str, Any]] = None, + fkwargs: Optional[Dict[str, Any]] = None, + multistem: bool = True, + ) -> None: + if tkwargs is None: + tkwargs = {} + if fkwargs is None: + fkwargs = {} + time_module = nn.MSELoss(**tkwargs) + freq_module = ReImL2Loss(**fkwargs) + super().__init__(time_module, freq_module, time_weight, freq_weight, multistem) + + +class TimeFreqSignalNoisePNormRatioLoss(TimeFreqWrapper): + def __init__( + self, + time_weight: float = 1.0, 
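+        # Illustrative note (inferred from TimeFreqWrapper and MultiStemWrapper above,
+        # not an original comment): with multistem=True the time branch reads
+        # preds["audio"][stem] and the freq branch reads preds["spectrogram"][stem];
+        # the total is time_weight * SNR_p(audio) + freq_weight * SNR_p(spectrogram),
+        # both computed by SignalNoisePNormRatio below.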
+ freq_weight: float = 1.0, + tkwargs: Optional[Dict[str, Any]] = None, + fkwargs: Optional[Dict[str, Any]] = None, + multistem: bool = True, + ) -> None: + if tkwargs is None: + tkwargs = {} + if fkwargs is None: + fkwargs = {} + time_module = SignalNoisePNormRatio(**tkwargs) + freq_module = SignalNoisePNormRatio(**fkwargs) + super().__init__(time_module, freq_module, time_weight, freq_weight, multistem) diff --git a/programs/music_separation_code/models/bandit/core/loss/snr.py b/programs/music_separation_code/models/bandit/core/loss/snr.py new file mode 100644 index 0000000000000000000000000000000000000000..8d712a525027417198c7072ef571166ef0c02afa --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/loss/snr.py @@ -0,0 +1,139 @@ +import torch +from torch.nn.modules.loss import _Loss +from torch.nn import functional as F + + +class SignalNoisePNormRatio(_Loss): + def __init__( + self, + p: float = 1.0, + scale_invariant: bool = False, + zero_mean: bool = False, + take_log: bool = True, + reduction: str = "mean", + EPS: float = 1e-3, + ) -> None: + assert reduction != "sum", NotImplementedError + super().__init__(reduction=reduction) + assert not zero_mean + + self.p = p + + self.EPS = EPS + self.take_log = take_log + + self.scale_invariant = scale_invariant + + def forward(self, est_target: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + + target_ = target + if self.scale_invariant: + ndim = target.ndim + dot = torch.sum(est_target * torch.conj(target), dim=-1, keepdim=True) + s_target_energy = torch.sum( + target * torch.conj(target), dim=-1, keepdim=True + ) + + if ndim > 2: + dot = torch.sum(dot, dim=list(range(1, ndim)), keepdim=True) + s_target_energy = torch.sum( + s_target_energy, dim=list(range(1, ndim)), keepdim=True + ) + + target_scaler = (dot + 1e-8) / (s_target_energy + 1e-8) + target = target_ * target_scaler + + if torch.is_complex(est_target): + est_target = torch.view_as_real(est_target) + target = torch.view_as_real(target) + + batch_size = est_target.shape[0] + est_target = est_target.reshape(batch_size, -1) + target = target.reshape(batch_size, -1) + # target_ = target_.reshape(batch_size, -1) + + if self.p == 1: + e_error = torch.abs(est_target - target).mean(dim=-1) + e_target = torch.abs(target).mean(dim=-1) + elif self.p == 2: + e_error = torch.square(est_target - target).mean(dim=-1) + e_target = torch.square(target).mean(dim=-1) + else: + raise NotImplementedError + + if self.take_log: + loss = 10 * ( + torch.log10(e_error + self.EPS) - torch.log10(e_target + self.EPS) + ) + else: + loss = (e_error + self.EPS) / (e_target + self.EPS) + + if self.reduction == "mean": + loss = loss.mean() + elif self.reduction == "sum": + loss = loss.sum() + + return loss + + +class MultichannelSingleSrcNegSDR(_Loss): + def __init__( + self, + sdr_type: str, + p: float = 2.0, + zero_mean: bool = True, + take_log: bool = True, + reduction: str = "mean", + EPS: float = 1e-8, + ) -> None: + assert reduction != "sum", NotImplementedError + super().__init__(reduction=reduction) + + assert sdr_type in ["snr", "sisdr", "sdsdr"] + self.sdr_type = sdr_type + self.zero_mean = zero_mean + self.take_log = take_log + self.EPS = 1e-8 + + self.p = p + + def forward(self, est_target: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + if target.size() != est_target.size() or target.ndim != 3: + raise TypeError( + f"Inputs must be of shape [batch, time], got {target.size()} and {est_target.size()} instead" + ) + # Step 1. 
Zero-mean norm + if self.zero_mean: + mean_source = torch.mean(target, dim=[1, 2], keepdim=True) + mean_estimate = torch.mean(est_target, dim=[1, 2], keepdim=True) + target = target - mean_source + est_target = est_target - mean_estimate + # Step 2. Pair-wise SI-SDR. + if self.sdr_type in ["sisdr", "sdsdr"]: + # [batch, 1] + dot = torch.sum(est_target * target, dim=[1, 2], keepdim=True) + # [batch, 1] + s_target_energy = torch.sum(target**2, dim=[1, 2], keepdim=True) + self.EPS + # [batch, time] + scaled_target = dot * target / s_target_energy + else: + # [batch, time] + scaled_target = target + if self.sdr_type in ["sdsdr", "snr"]: + e_noise = est_target - target + else: + e_noise = est_target - scaled_target + # [batch] + + if self.p == 2.0: + losses = torch.sum(scaled_target**2, dim=[1, 2]) / ( + torch.sum(e_noise**2, dim=[1, 2]) + self.EPS + ) + else: + losses = torch.norm(scaled_target, p=self.p, dim=[1, 2]) / ( + torch.linalg.vector_norm(e_noise, p=self.p, dim=[1, 2]) + self.EPS + ) + if self.take_log: + losses = 10 * torch.log10(losses + self.EPS) + losses = losses.mean() if self.reduction == "mean" else losses + return -losses diff --git a/programs/music_separation_code/models/bandit/core/metrics/__init__.py b/programs/music_separation_code/models/bandit/core/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c638b4df585ad6c3c6490d9e67b7fc197f0d06f4 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/metrics/__init__.py @@ -0,0 +1,9 @@ +from .snr import ( + ChunkMedianScaleInvariantSignalDistortionRatio, + ChunkMedianScaleInvariantSignalNoiseRatio, + ChunkMedianSignalDistortionRatio, + ChunkMedianSignalNoiseRatio, + SafeSignalDistortionRatio, +) + +# from .mushra import EstimatedMushraScore diff --git a/programs/music_separation_code/models/bandit/core/metrics/_squim.py b/programs/music_separation_code/models/bandit/core/metrics/_squim.py new file mode 100644 index 0000000000000000000000000000000000000000..71c993a2b6cb3da36849c2f87ef7bb7443a9095c --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/metrics/_squim.py @@ -0,0 +1,443 @@ +from dataclasses import dataclass + +from torchaudio._internal import load_state_dict_from_url + +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def transform_wb_pesq_range(x: float) -> float: + """The metric defined by ITU-T P.862 is often called 'PESQ score', which is defined + for narrow-band signals and has a value range of [-0.5, 4.5] exactly. Here, we use the metric + defined by ITU-T P.862.2, commonly known as 'wide-band PESQ' and will be referred to as "PESQ score". + + Args: + x (float): Narrow-band PESQ score. + + Returns: + (float): Wide-band PESQ score. + """ + return 0.999 + (4.999 - 0.999) / (1 + math.exp(-1.3669 * x + 3.8224)) + + +PESQRange: Tuple[float, float] = ( + 1.0, # P.862.2 uses a different input filter than P.862, and the lower bound of + # the raw score is not -0.5 anymore. It's hard to figure out the true lower bound. + # We are using 1.0 as a reasonable approximation. 
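+    # Rough worked value (derived from transform_wb_pesq_range above, not an
+    # original comment): 0.999 + 4.0 / (1 + exp(-1.3669 * 4.5 + 3.8224))
+    # = 0.999 + 4.0 / (1 + exp(-2.329)) ~ 4.64, so the range used here is
+    # approximately (1.0, 4.64).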
+ transform_wb_pesq_range(4.5), +) + + +class RangeSigmoid(nn.Module): + def __init__(self, val_range: Tuple[float, float] = (0.0, 1.0)) -> None: + super(RangeSigmoid, self).__init__() + assert isinstance(val_range, tuple) and len(val_range) == 2 + self.val_range: Tuple[float, float] = val_range + self.sigmoid: nn.modules.Module = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = ( + self.sigmoid(x) * (self.val_range[1] - self.val_range[0]) + + self.val_range[0] + ) + return out + + +class Encoder(nn.Module): + """Encoder module that transform 1D waveform to 2D representations. + + Args: + feat_dim (int, optional): The feature dimension after Encoder module. (Default: 512) + win_len (int, optional): kernel size in the Conv1D layer. (Default: 32) + """ + + def __init__(self, feat_dim: int = 512, win_len: int = 32) -> None: + super(Encoder, self).__init__() + + self.conv1d = nn.Conv1d(1, feat_dim, win_len, stride=win_len // 2, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply waveforms to convolutional layer and ReLU layer. + + Args: + x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`. + + Returns: + (torch,Tensor): Feature Tensor with dimensions `(batch, channel, frame)`. + """ + out = x.unsqueeze(dim=1) + out = F.relu(self.conv1d(out)) + return out + + +class SingleRNN(nn.Module): + def __init__( + self, rnn_type: str, input_size: int, hidden_size: int, dropout: float = 0.0 + ) -> None: + super(SingleRNN, self).__init__() + + self.rnn_type = rnn_type + self.input_size = input_size + self.hidden_size = hidden_size + + self.rnn: nn.modules.Module = getattr(nn, rnn_type)( + input_size, + hidden_size, + 1, + dropout=dropout, + batch_first=True, + bidirectional=True, + ) + + self.proj = nn.Linear(hidden_size * 2, input_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # input shape: batch, seq, dim + out, _ = self.rnn(x) + out = self.proj(out) + return out + + +class DPRNN(nn.Module): + """*Dual-path recurrent neural networks (DPRNN)* :cite:`luo2020dual`. + + Args: + feat_dim (int, optional): The feature dimension after Encoder module. (Default: 64) + hidden_dim (int, optional): Hidden dimension in the RNN layer of DPRNN. (Default: 128) + num_blocks (int, optional): Number of DPRNN layers. (Default: 6) + rnn_type (str, optional): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. (Default: "LSTM") + d_model (int, optional): The number of expected features in the input. (Default: 256) + chunk_size (int, optional): Chunk size of input for DPRNN. (Default: 100) + chunk_stride (int, optional): Stride of chunk input for DPRNN. 
(Default: 50) + """ + + def __init__( + self, + feat_dim: int = 64, + hidden_dim: int = 128, + num_blocks: int = 6, + rnn_type: str = "LSTM", + d_model: int = 256, + chunk_size: int = 100, + chunk_stride: int = 50, + ) -> None: + super(DPRNN, self).__init__() + + self.num_blocks = num_blocks + + self.row_rnn = nn.ModuleList([]) + self.col_rnn = nn.ModuleList([]) + self.row_norm = nn.ModuleList([]) + self.col_norm = nn.ModuleList([]) + for _ in range(num_blocks): + self.row_rnn.append(SingleRNN(rnn_type, feat_dim, hidden_dim)) + self.col_rnn.append(SingleRNN(rnn_type, feat_dim, hidden_dim)) + self.row_norm.append(nn.GroupNorm(1, feat_dim, eps=1e-8)) + self.col_norm.append(nn.GroupNorm(1, feat_dim, eps=1e-8)) + self.conv = nn.Sequential( + nn.Conv2d(feat_dim, d_model, 1), + nn.PReLU(), + ) + self.chunk_size = chunk_size + self.chunk_stride = chunk_stride + + def pad_chunk(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]: + # input shape: (B, N, T) + seq_len = x.shape[-1] + + rest = ( + self.chunk_size + - (self.chunk_stride + seq_len % self.chunk_size) % self.chunk_size + ) + out = F.pad(x, [self.chunk_stride, rest + self.chunk_stride]) + + return out, rest + + def chunking(self, x: torch.Tensor) -> Tuple[torch.Tensor, int]: + out, rest = self.pad_chunk(x) + batch_size, feat_dim, seq_len = out.shape + + segments1 = ( + out[:, :, : -self.chunk_stride] + .contiguous() + .view(batch_size, feat_dim, -1, self.chunk_size) + ) + segments2 = ( + out[:, :, self.chunk_stride :] + .contiguous() + .view(batch_size, feat_dim, -1, self.chunk_size) + ) + out = torch.cat([segments1, segments2], dim=3) + out = ( + out.view(batch_size, feat_dim, -1, self.chunk_size) + .transpose(2, 3) + .contiguous() + ) + + return out, rest + + def merging(self, x: torch.Tensor, rest: int) -> torch.Tensor: + batch_size, dim, _, _ = x.shape + out = ( + x.transpose(2, 3) + .contiguous() + .view(batch_size, dim, -1, self.chunk_size * 2) + ) + out1 = ( + out[:, :, :, : self.chunk_size] + .contiguous() + .view(batch_size, dim, -1)[:, :, self.chunk_stride :] + ) + out2 = ( + out[:, :, :, self.chunk_size :] + .contiguous() + .view(batch_size, dim, -1)[:, :, : -self.chunk_stride] + ) + out = out1 + out2 + if rest > 0: + out = out[:, :, :-rest] + out = out.contiguous() + return out + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, rest = self.chunking(x) + batch_size, _, dim1, dim2 = x.shape + out = x + for row_rnn, row_norm, col_rnn, col_norm in zip( + self.row_rnn, self.row_norm, self.col_rnn, self.col_norm + ): + row_in = ( + out.permute(0, 3, 2, 1) + .contiguous() + .view(batch_size * dim2, dim1, -1) + .contiguous() + ) + row_out = row_rnn(row_in) + row_out = ( + row_out.view(batch_size, dim2, dim1, -1) + .permute(0, 3, 2, 1) + .contiguous() + ) + row_out = row_norm(row_out) + out = out + row_out + + col_in = ( + out.permute(0, 2, 3, 1) + .contiguous() + .view(batch_size * dim1, dim2, -1) + .contiguous() + ) + col_out = col_rnn(col_in) + col_out = ( + col_out.view(batch_size, dim1, dim2, -1) + .permute(0, 3, 1, 2) + .contiguous() + ) + col_out = col_norm(col_out) + out = out + col_out + out = self.conv(out) + out = self.merging(out, rest) + out = out.transpose(1, 2).contiguous() + return out + + +class AutoPool(nn.Module): + def __init__(self, pool_dim: int = 1) -> None: + super(AutoPool, self).__init__() + self.pool_dim: int = pool_dim + self.softmax: nn.modules.Module = nn.Softmax(dim=pool_dim) + self.register_parameter("alpha", nn.Parameter(torch.ones(1))) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + 
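+        # Descriptive note (inferred from the code below, not an original comment):
+        # this is softmax-weighted "auto-pooling" with a learnable scalar alpha.
+        # alpha = 0 gives uniform weights (mean pooling over pool_dim); large alpha
+        # concentrates the weights on the largest activations, approaching max pooling.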
weight = self.softmax(torch.mul(x, self.alpha)) + out = torch.sum(torch.mul(x, weight), dim=self.pool_dim) + return out + + +class SquimObjective(nn.Module): + """Speech Quality and Intelligibility Measures (SQUIM) model that predicts **objective** metric scores + for speech enhancement (e.g., STOI, PESQ, and SI-SDR). + + Args: + encoder (torch.nn.Module): Encoder module to transform 1D waveform to 2D feature representation. + dprnn (torch.nn.Module): DPRNN module to model sequential feature. + branches (torch.nn.ModuleList): Transformer branches in which each branch estimate one objective metirc score. + """ + + def __init__( + self, + encoder: nn.Module, + dprnn: nn.Module, + branches: nn.ModuleList, + ): + super(SquimObjective, self).__init__() + self.encoder = encoder + self.dprnn = dprnn + self.branches = branches + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + """ + Args: + x (torch.Tensor): Input waveforms. Tensor with dimensions `(batch, time)`. + + Returns: + List(torch.Tensor): List of score Tenosrs. Each Tensor is with dimension `(batch,)`. + """ + if x.ndim != 2: + raise ValueError( + f"The input must be a 2D Tensor. Found dimension {x.ndim}." + ) + x = x / (torch.mean(x**2, dim=1, keepdim=True) ** 0.5 * 20) + out = self.encoder(x) + out = self.dprnn(out) + scores = [] + for branch in self.branches: + scores.append(branch(out).squeeze(dim=1)) + return scores + + +def _create_branch(d_model: int, nhead: int, metric: str) -> nn.modules.Module: + """Create branch module after DPRNN model for predicting metric score. + + Args: + d_model (int): The number of expected features in the input. + nhead (int): Number of heads in the multi-head attention model. + metric (str): The metric name to predict. + + Returns: + (nn.Module): Returned module to predict corresponding metric score. + """ + layer1 = nn.TransformerEncoderLayer( + d_model, nhead, d_model * 4, dropout=0.0, batch_first=True + ) + layer2 = AutoPool() + if metric == "stoi": + layer3 = nn.Sequential( + nn.Linear(d_model, d_model), + nn.PReLU(), + nn.Linear(d_model, 1), + RangeSigmoid(), + ) + elif metric == "pesq": + layer3 = nn.Sequential( + nn.Linear(d_model, d_model), + nn.PReLU(), + nn.Linear(d_model, 1), + RangeSigmoid(val_range=PESQRange), + ) + else: + layer3: nn.modules.Module = nn.Sequential( + nn.Linear(d_model, d_model), nn.PReLU(), nn.Linear(d_model, 1) + ) + return nn.Sequential(layer1, layer2, layer3) + + +def squim_objective_model( + feat_dim: int, + win_len: int, + d_model: int, + nhead: int, + hidden_dim: int, + num_blocks: int, + rnn_type: str, + chunk_size: int, + chunk_stride: Optional[int] = None, +) -> SquimObjective: + """Build a custome :class:`torchaudio.prototype.models.SquimObjective` model. + + Args: + feat_dim (int, optional): The feature dimension after Encoder module. + win_len (int): Kernel size in the Encoder module. + d_model (int): The number of expected features in the input. + nhead (int): Number of heads in the multi-head attention model. + hidden_dim (int): Hidden dimension in the RNN layer of DPRNN. + num_blocks (int): Number of DPRNN layers. + rnn_type (str): Type of RNN in DPRNN. Valid options are ["RNN", "LSTM", "GRU"]. + chunk_size (int): Chunk size of input for DPRNN. + chunk_stride (int or None, optional): Stride of chunk input for DPRNN. 
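+            If ``None``, defaults to ``chunk_size // 2``.
+
+    Example (illustrative; same values as :func:`squim_objective_base`)::
+
+        >>> model = squim_objective_model(
+        ...     feat_dim=256, win_len=64, d_model=256, nhead=4,
+        ...     hidden_dim=256, num_blocks=2, rnn_type="LSTM", chunk_size=71,
+        ... )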
+ """ + if chunk_stride is None: + chunk_stride = chunk_size // 2 + encoder = Encoder(feat_dim, win_len) + dprnn = DPRNN( + feat_dim, hidden_dim, num_blocks, rnn_type, d_model, chunk_size, chunk_stride + ) + branches = nn.ModuleList( + [ + _create_branch(d_model, nhead, "stoi"), + _create_branch(d_model, nhead, "pesq"), + _create_branch(d_model, nhead, "sisdr"), + ] + ) + return SquimObjective(encoder, dprnn, branches) + + +def squim_objective_base() -> SquimObjective: + """Build :class:`torchaudio.prototype.models.SquimObjective` model with default arguments.""" + return squim_objective_model( + feat_dim=256, + win_len=64, + d_model=256, + nhead=4, + hidden_dim=256, + num_blocks=2, + rnn_type="LSTM", + chunk_size=71, + ) + + +@dataclass +class SquimObjectiveBundle: + + _path: str + _sample_rate: float + + def _get_state_dict(self, dl_kwargs): + url = f"https://download.pytorch.org/torchaudio/models/{self._path}" + dl_kwargs = {} if dl_kwargs is None else dl_kwargs + state_dict = load_state_dict_from_url(url, **dl_kwargs) + return state_dict + + def get_model(self, *, dl_kwargs=None) -> SquimObjective: + """Construct the SquimObjective model, and load the pretrained weight. + + The weight file is downloaded from the internet and cached with + :func:`torch.hub.load_state_dict_from_url` + + Args: + dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`. + + Returns: + Variation of :py:class:`~torchaudio.models.SquimObjective`. + """ + model = squim_objective_base() + model.load_state_dict(self._get_state_dict(dl_kwargs)) + model.eval() + return model + + @property + def sample_rate(self): + """Sample rate of the audio that the model is trained on. + + :type: float + """ + return self._sample_rate + + +SQUIM_OBJECTIVE = SquimObjectiveBundle( + "squim_objective_dns2020.pth", + _sample_rate=16000, +) +SQUIM_OBJECTIVE.__doc__ = """SquimObjective pipeline trained using approach described in + :cite:`kumar2023torchaudio` on the *DNS 2020 Dataset* :cite:`reddy2020interspeech`. + + The underlying model is constructed by :py:func:`torchaudio.models.squim_objective_base`. + The weights are under `Creative Commons Attribution 4.0 International License + `__. + + Please refer to :py:class:`SquimObjectiveBundle` for usage instructions. 
+ """ diff --git a/programs/music_separation_code/models/bandit/core/metrics/snr.py b/programs/music_separation_code/models/bandit/core/metrics/snr.py new file mode 100644 index 0000000000000000000000000000000000000000..6b7a168756204ddd8044c7c74e31c15dc1643aa1 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/metrics/snr.py @@ -0,0 +1,127 @@ +from typing import Any, Callable + +import numpy as np +import torch +import torchmetrics as tm +from torch._C import _LinAlgError +from torchmetrics import functional as tmF + + +class SafeSignalDistortionRatio(tm.SignalDistortionRatio): + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def update(self, *args, **kwargs) -> Any: + try: + super().update(*args, **kwargs) + except: + pass + + def compute(self) -> Any: + if self.total == 0: + return torch.tensor(torch.nan) + return super().compute() + + +class BaseChunkMedianSignalRatio(tm.Metric): + def __init__( + self, + func: Callable, + window_size: int, + hop_size: int = None, + zero_mean: bool = False, + ) -> None: + super().__init__() + + # self.zero_mean = zero_mean + self.func = func + self.window_size = window_size + if hop_size is None: + hop_size = window_size + self.hop_size = hop_size + + self.add_state("sum_snr", default=torch.tensor(0.0), dist_reduce_fx="sum") + self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") + + def update(self, preds: torch.Tensor, target: torch.Tensor) -> None: + + n_samples = target.shape[-1] + + n_chunks = int(np.ceil((n_samples - self.window_size) / self.hop_size) + 1) + + snr_chunk = [] + + for i in range(n_chunks): + start = i * self.hop_size + + if n_samples - start < self.window_size: + continue + + end = start + self.window_size + + try: + chunk_snr = self.func(preds[..., start:end], target[..., start:end]) + + # print(preds.shape, chunk_snr.shape) + + if torch.all(torch.isfinite(chunk_snr)): + snr_chunk.append(chunk_snr) + except _LinAlgError: + pass + + snr_chunk = torch.stack(snr_chunk, dim=-1) + snr_batch, _ = torch.nanmedian(snr_chunk, dim=-1) + + self.sum_snr += snr_batch.sum() + self.total += snr_batch.numel() + + def compute(self) -> Any: + return self.sum_snr / self.total + + +class ChunkMedianSignalNoiseRatio(BaseChunkMedianSignalRatio): + def __init__( + self, window_size: int, hop_size: int = None, zero_mean: bool = False + ) -> None: + super().__init__( + func=tmF.signal_noise_ratio, + window_size=window_size, + hop_size=hop_size, + zero_mean=zero_mean, + ) + + +class ChunkMedianScaleInvariantSignalNoiseRatio(BaseChunkMedianSignalRatio): + def __init__( + self, window_size: int, hop_size: int = None, zero_mean: bool = False + ) -> None: + super().__init__( + func=tmF.scale_invariant_signal_noise_ratio, + window_size=window_size, + hop_size=hop_size, + zero_mean=zero_mean, + ) + + +class ChunkMedianSignalDistortionRatio(BaseChunkMedianSignalRatio): + def __init__( + self, window_size: int, hop_size: int = None, zero_mean: bool = False + ) -> None: + super().__init__( + func=tmF.signal_distortion_ratio, + window_size=window_size, + hop_size=hop_size, + zero_mean=zero_mean, + ) + + +class ChunkMedianScaleInvariantSignalDistortionRatio(BaseChunkMedianSignalRatio): + def __init__( + self, window_size: int, hop_size: int = None, zero_mean: bool = False + ) -> None: + super().__init__( + func=tmF.scale_invariant_signal_distortion_ratio, + window_size=window_size, + hop_size=hop_size, + zero_mean=zero_mean, + ) diff --git a/programs/music_separation_code/models/bandit/core/model/__init__.py 
b/programs/music_separation_code/models/bandit/core/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..54ac48eb69d6f844ba5b73b213eae4cfab157cac --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/__init__.py @@ -0,0 +1,3 @@ +from .bsrnn.wrapper import ( + MultiMaskMultiSourceBandSplitRNNSimple, +) diff --git a/programs/music_separation_code/models/bandit/core/model/_spectral.py b/programs/music_separation_code/models/bandit/core/model/_spectral.py new file mode 100644 index 0000000000000000000000000000000000000000..6af5cbd0dcb6ed0a4babd6b8554184d91c406655 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/_spectral.py @@ -0,0 +1,54 @@ +from typing import Dict, Optional + +import torch +import torchaudio as ta +from torch import nn + + +class _SpectralComponent(nn.Module): + def __init__( + self, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + **kwargs, + ) -> None: + super().__init__() + + assert power is None + + window_fn = torch.__dict__[window_fn] + + self.stft = ta.transforms.Spectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + pad_mode=pad_mode, + pad=0, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + normalized=normalized, + center=center, + onesided=onesided, + ) + + self.istft = ta.transforms.InverseSpectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + pad_mode=pad_mode, + pad=0, + window_fn=window_fn, + wkwargs=wkwargs, + normalized=normalized, + center=center, + onesided=onesided, + ) diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/__init__.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c27826197fc8f4eb7a7036d8037966a58d8b38d4 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/__init__.py @@ -0,0 +1,23 @@ +from abc import ABC +from typing import Iterable, Mapping, Union + +from torch import nn + +from models.bandit.core.model.bsrnn.bandsplit import BandSplitModule +from models.bandit.core.model.bsrnn.tfmodel import ( + SeqBandModellingModule, + TransformerTimeFreqModule, +) + + +class BandsplitCoreBase(nn.Module, ABC): + band_split: nn.Module + tf_model: nn.Module + mask_estim: Union[nn.Module, Mapping[str, nn.Module], Iterable[nn.Module]] + + def __init__(self) -> None: + super().__init__() + + @staticmethod + def mask(x, m): + return x * m diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/bandsplit.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/bandsplit.py new file mode 100644 index 0000000000000000000000000000000000000000..43217655bf9a20e67775b0d5914ecd08a9794801 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/bandsplit.py @@ -0,0 +1,135 @@ +from typing import List, Tuple + +import torch +from torch import nn + +from models.bandit.core.model.bsrnn.utils import ( + band_widths_from_specs, + check_no_gap, + check_no_overlap, + check_nonzero_bandwidth, +) + + +class NormFC(nn.Module): + def __init__( + self, + emb_dim: int, + bandwidth: int, + in_channel: int, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + ) -> None: + 
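+        # Descriptive note (inferred from the code below, not an original comment):
+        # with the default treat_channel_as_feature=True, each band arrives as
+        # (batch, n_time, in_chan, 2 * band_width) real/imag features, is flattened,
+        # LayerNorm-ed over in_chan * 2 * band_width, and projected to emb_dim by a
+        # single Linear layer.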
super().__init__() + + self.treat_channel_as_feature = treat_channel_as_feature + + if normalize_channel_independently: + raise NotImplementedError + + reim = 2 + + self.norm = nn.LayerNorm(in_channel * bandwidth * reim) + + fc_in = bandwidth * reim + + if treat_channel_as_feature: + fc_in *= in_channel + else: + assert emb_dim % in_channel == 0 + emb_dim = emb_dim // in_channel + + self.fc = nn.Linear(fc_in, emb_dim) + + def forward(self, xb): + # xb = (batch, n_time, in_chan, reim * band_width) + + batch, n_time, in_chan, ribw = xb.shape + xb = self.norm(xb.reshape(batch, n_time, in_chan * ribw)) + # (batch, n_time, in_chan * reim * band_width) + + if not self.treat_channel_as_feature: + xb = xb.reshape(batch, n_time, in_chan, ribw) + # (batch, n_time, in_chan, reim * band_width) + + zb = self.fc(xb) + # (batch, n_time, emb_dim) + # OR + # (batch, n_time, in_chan, emb_dim_per_chan) + + if not self.treat_channel_as_feature: + batch, n_time, in_chan, emb_dim_per_chan = zb.shape + # (batch, n_time, in_chan, emb_dim_per_chan) + zb = zb.reshape((batch, n_time, in_chan * emb_dim_per_chan)) + + return zb # (batch, n_time, emb_dim) + + +class BandSplitModule(nn.Module): + def __init__( + self, + band_specs: List[Tuple[float, float]], + emb_dim: int, + in_channel: int, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + ) -> None: + super().__init__() + + check_nonzero_bandwidth(band_specs) + + if require_no_gap: + check_no_gap(band_specs) + + if require_no_overlap: + check_no_overlap(band_specs) + + self.band_specs = band_specs + # list of [fstart, fend) in index. + # Note that fend is exclusive. + self.band_widths = band_widths_from_specs(band_specs) + self.n_bands = len(band_specs) + self.emb_dim = emb_dim + + self.norm_fc_modules = nn.ModuleList( + [ # type: ignore + ( + NormFC( + emb_dim=emb_dim, + bandwidth=bw, + in_channel=in_channel, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + ) + ) + for bw in self.band_widths + ] + ) + + def forward(self, x: torch.Tensor): + # x = complex spectrogram (batch, in_chan, n_freq, n_time) + + batch, in_chan, _, n_time = x.shape + + z = torch.zeros( + size=(batch, self.n_bands, n_time, self.emb_dim), device=x.device + ) + + xr = torch.view_as_real(x) # batch, in_chan, n_freq, n_time, 2 + xr = torch.permute(xr, (0, 3, 1, 4, 2)) # batch, n_time, in_chan, 2, n_freq + batch, n_time, in_chan, reim, band_width = xr.shape + for i, nfm in enumerate(self.norm_fc_modules): + # print(f"bandsplit/band{i:02d}") + fstart, fend = self.band_specs[i] + xb = xr[..., fstart:fend] + # (batch, n_time, in_chan, reim, band_width) + xb = torch.reshape(xb, (batch, n_time, in_chan, -1)) + # (batch, n_time, in_chan, reim * band_width) + # z.append(nfm(xb)) # (batch, n_time, emb_dim) + z[:, i, :, :] = nfm(xb.contiguous()) + + # z = torch.stack(z, dim=1) + + return z diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/core.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/core.py new file mode 100644 index 0000000000000000000000000000000000000000..1dbfb32b33ce3aef3e07e8f4c4a68711e80579a6 --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/core.py @@ -0,0 +1,651 @@ +from typing import Dict, List, Optional, Tuple + +import torch +from torch import nn +from torch.nn import functional as F + +from models.bandit.core.model.bsrnn import 
BandsplitCoreBase +from models.bandit.core.model.bsrnn.bandsplit import BandSplitModule +from models.bandit.core.model.bsrnn.maskestim import ( + MaskEstimationModule, + OverlappingMaskEstimationModule, +) +from models.bandit.core.model.bsrnn.tfmodel import ( + ConvolutionalTimeFreqModule, + SeqBandModellingModule, + TransformerTimeFreqModule, +) + + +class MultiMaskBandSplitCoreBase(BandsplitCoreBase): + def __init__(self) -> None: + super().__init__() + + def forward(self, x, cond=None, compute_residual: bool = True): + # x = complex spectrogram (batch, in_chan, n_freq, n_time) + # print(x.shape) + batch, in_chan, n_freq, n_time = x.shape + x = torch.reshape(x, (-1, 1, n_freq, n_time)) + + z = self.band_split(x) # (batch, emb_dim, n_band, n_time) + + # if torch.any(torch.isnan(z)): + # raise ValueError("z nan") + + # print(z) + q = self.tf_model(z) # (batch, emb_dim, n_band, n_time) + # print(q) + + # if torch.any(torch.isnan(q)): + # raise ValueError("q nan") + + out = {} + + for stem, mem in self.mask_estim.items(): + m = mem(q, cond=cond) + + # if torch.any(torch.isnan(m)): + # raise ValueError("m nan", stem) + + s = self.mask(x, m) + s = torch.reshape(s, (batch, in_chan, n_freq, n_time)) + out[stem] = s + + return {"spectrogram": out} + + def instantiate_mask_estim( + self, + in_channel: int, + stems: List[str], + band_specs: List[Tuple[float, float]], + emb_dim: int, + mlp_dim: int, + cond_dim: int, + hidden_activation: str, + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + overlapping_band: bool = False, + freq_weights: Optional[List[torch.Tensor]] = None, + n_freq: Optional[int] = None, + use_freq_weights: bool = True, + mult_add_mask: bool = False, + ): + if hidden_activation_kwargs is None: + hidden_activation_kwargs = {} + + if "mne:+" in stems: + stems = [s for s in stems if s != "mne:+"] + + if overlapping_band: + assert freq_weights is not None + assert n_freq is not None + + if mult_add_mask: + + self.mask_estim = nn.ModuleDict( + { + stem: MultAddMaskEstimationModule( + band_specs=band_specs, + freq_weights=freq_weights, + n_freq=n_freq, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + use_freq_weights=use_freq_weights, + ) + for stem in stems + } + ) + else: + self.mask_estim = nn.ModuleDict( + { + stem: OverlappingMaskEstimationModule( + band_specs=band_specs, + freq_weights=freq_weights, + n_freq=n_freq, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + use_freq_weights=use_freq_weights, + ) + for stem in stems + } + ) + else: + self.mask_estim = nn.ModuleDict( + { + stem: MaskEstimationModule( + band_specs=band_specs, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + for stem in stems + } + ) + + def instantiate_bandsplit( + self, + in_channel: int, + band_specs: List[Tuple[float, float]], + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + emb_dim: int = 128, + ): + self.band_split = BandSplitModule( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + 
require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + + +class SingleMaskBandsplitCoreBase(BandsplitCoreBase): + def __init__(self, **kwargs) -> None: + super().__init__() + + def forward(self, x): + # x = complex spectrogram (batch, in_chan, n_freq, n_time) + z = self.band_split(x) # (batch, emb_dim, n_band, n_time) + q = self.tf_model(z) # (batch, emb_dim, n_band, n_time) + m = self.mask_estim(q) # (batch, in_chan, n_freq, n_time) + + s = self.mask(x, m) + + return s + + +class SingleMaskBandsplitCoreRNN( + SingleMaskBandsplitCoreBase, +): + def __init__( + self, + in_channel: int, + band_specs: List[Tuple[float, float]], + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + ) -> None: + super().__init__() + self.band_split = BandSplitModule( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + self.tf_model = SeqBandModellingModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ) + self.mask_estim = MaskEstimationModule( + in_channel=in_channel, + band_specs=band_specs, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + + +class SingleMaskBandsplitCoreTransformer( + SingleMaskBandsplitCoreBase, +): + def __init__( + self, + in_channel: int, + band_specs: List[Tuple[float, float]], + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + tf_dropout: float = 0.0, + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + ) -> None: + super().__init__() + self.band_split = BandSplitModule( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + self.tf_model = TransformerTimeFreqModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + dropout=tf_dropout, + ) + self.mask_estim = MaskEstimationModule( + in_channel=in_channel, + band_specs=band_specs, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + + +class MultiSourceMultiMaskBandSplitCoreRNN(MultiMaskBandSplitCoreBase): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: List[Tuple[float, float]], + require_no_overlap: bool = False, + require_no_gap: bool = True, + 
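+        # Note (inferred from BandSplitModule, not an original comment): the two
+        # require_* flags above are forwarded via instantiate_bandsplit to
+        # BandSplitModule, which validates band_specs with check_no_gap /
+        # check_no_overlap before building one NormFC per band.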
normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + cond_dim: int = 0, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + overlapping_band: bool = False, + freq_weights: Optional[List[torch.Tensor]] = None, + n_freq: Optional[int] = None, + use_freq_weights: bool = True, + mult_add_mask: bool = False, + ) -> None: + + super().__init__() + self.instantiate_bandsplit( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + + self.tf_model = SeqBandModellingModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ) + + self.mult_add_mask = mult_add_mask + + self.instantiate_mask_estim( + in_channel=in_channel, + stems=stems, + band_specs=band_specs, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=overlapping_band, + freq_weights=freq_weights, + n_freq=n_freq, + use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + @staticmethod + def _mult_add_mask(x, m): + + assert m.ndim == 5 + + mm = m[..., 0] + am = m[..., 1] + + # print(mm.shape, am.shape, x.shape, m.shape) + + return x * mm + am + + def mask(self, x, m): + if self.mult_add_mask: + + return self._mult_add_mask(x, m) + else: + return super().mask(x, m) + + +class MultiSourceMultiMaskBandSplitCoreTransformer( + MultiMaskBandSplitCoreBase, +): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: List[Tuple[float, float]], + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + tf_dropout: float = 0.0, + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + overlapping_band: bool = False, + freq_weights: Optional[List[torch.Tensor]] = None, + n_freq: Optional[int] = None, + use_freq_weights: bool = True, + rnn_type: str = "LSTM", + cond_dim: int = 0, + mult_add_mask: bool = False, + ) -> None: + super().__init__() + self.instantiate_bandsplit( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + self.tf_model = TransformerTimeFreqModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + dropout=tf_dropout, + ) + + self.instantiate_mask_estim( + in_channel=in_channel, + stems=stems, + band_specs=band_specs, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=overlapping_band, + freq_weights=freq_weights, + n_freq=n_freq, + 
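+            # Note (inferred from OverlappingMaskEstimationModule, not an original
+            # comment): when overlapping_band=True, freq_weights holds one weight
+            # vector per band; each band's mask is scaled by those weights (if
+            # use_freq_weights) and the scaled masks are summed into a full
+            # n_freq-bin mask.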
use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + +class MultiSourceMultiMaskBandSplitCoreConv( + MultiMaskBandSplitCoreBase, +): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: List[Tuple[float, float]], + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + tf_dropout: float = 0.0, + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + overlapping_band: bool = False, + freq_weights: Optional[List[torch.Tensor]] = None, + n_freq: Optional[int] = None, + use_freq_weights: bool = True, + rnn_type: str = "LSTM", + cond_dim: int = 0, + mult_add_mask: bool = False, + ) -> None: + super().__init__() + self.instantiate_bandsplit( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + self.tf_model = ConvolutionalTimeFreqModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + dropout=tf_dropout, + ) + + self.instantiate_mask_estim( + in_channel=in_channel, + stems=stems, + band_specs=band_specs, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=overlapping_band, + freq_weights=freq_weights, + n_freq=n_freq, + use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + +class PatchingMaskBandsplitCoreBase(MultiMaskBandSplitCoreBase): + def __init__(self) -> None: + super().__init__() + + def mask(self, x, m): + # x.shape = (batch, n_channel, n_freq, n_time) + # m.shape = (kernel_freq, kernel_time, batch, n_channel, n_freq, n_time) + + _, n_channel, kernel_freq, kernel_time, n_freq, n_time = m.shape + padding = ((kernel_freq - 1) // 2, (kernel_time - 1) // 2) + + xf = F.unfold( + x, + kernel_size=(kernel_freq, kernel_time), + padding=padding, + stride=(1, 1), + ) + + xf = xf.view( + -1, + n_channel, + kernel_freq, + kernel_time, + n_freq, + n_time, + ) + + sf = xf * m + + sf = sf.view( + -1, + n_channel * kernel_freq * kernel_time, + n_freq * n_time, + ) + + s = F.fold( + sf, + output_size=(n_freq, n_time), + kernel_size=(kernel_freq, kernel_time), + padding=padding, + stride=(1, 1), + ).view( + -1, + n_channel, + n_freq, + n_time, + ) + + return s + + def old_mask(self, x, m): + # x.shape = (batch, n_channel, n_freq, n_time) + # m.shape = (kernel_freq, kernel_time, batch, n_channel, n_freq, n_time) + + s = torch.zeros_like(x) + + _, n_channel, n_freq, n_time = x.shape + kernel_freq, kernel_time, _, _, _, _ = m.shape + + # print(x.shape, m.shape) + + kernel_freq_half = (kernel_freq - 1) // 2 + kernel_time_half = (kernel_time - 1) // 2 + + for ifreq in range(kernel_freq): + for itime in range(kernel_time): + df, dt = kernel_freq_half - ifreq, kernel_time_half - itime + x = x.roll(shifts=(df, dt), dims=(2, 3)) + + # if `df` > 0: + # x[:, :, :df, :] = 0 + # elif `df` < 0: + # x[:, :, df:, :] = 0 + + # if `dt` > 0: + # x[:, :, :, :dt] = 0 + # elif `dt` < 0: + # x[:, :, :, dt:] = 0 + + fslice = slice(max(0, df), min(n_freq, n_freq + df)) + tslice = 
slice(max(0, dt), min(n_time, n_time + dt)) + + s[:, :, fslice, tslice] += ( + x[:, :, fslice, tslice] * m[ifreq, itime, :, :, fslice, tslice] + ) + + return s + + +class MultiSourceMultiPatchingMaskBandSplitCoreRNN(PatchingMaskBandsplitCoreBase): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: List[Tuple[float, float]], + mask_kernel_freq: int, + mask_kernel_time: int, + conv_kernel_freq: int, + conv_kernel_time: int, + kernel_norm_mlp_version: int, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + overlapping_band: bool = False, + freq_weights: Optional[List[torch.Tensor]] = None, + n_freq: Optional[int] = None, + ) -> None: + + super().__init__() + self.band_split = BandSplitModule( + in_channel=in_channel, + band_specs=band_specs, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + + self.tf_model = SeqBandModellingModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ) + + if hidden_activation_kwargs is None: + hidden_activation_kwargs = {} + + if overlapping_band: + assert freq_weights is not None + assert n_freq is not None + self.mask_estim = nn.ModuleDict( + { + stem: PatchingMaskEstimationModule( + band_specs=band_specs, + freq_weights=freq_weights, + n_freq=n_freq, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + mask_kernel_freq=mask_kernel_freq, + mask_kernel_time=mask_kernel_time, + conv_kernel_freq=conv_kernel_freq, + conv_kernel_time=conv_kernel_time, + kernel_norm_mlp_version=kernel_norm_mlp_version, + ) + for stem in stems + } + ) + else: + raise NotImplementedError diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/maskestim.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/maskestim.py new file mode 100644 index 0000000000000000000000000000000000000000..2e40d4faf497c07ed104382b325fe06662f6db7d --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/maskestim.py @@ -0,0 +1,351 @@ +import warnings +from typing import Dict, List, Optional, Tuple, Type + +import torch +from torch import nn +from torch.nn.modules import activation + +from models.bandit.core.model.bsrnn.utils import ( + band_widths_from_specs, + check_no_gap, + check_no_overlap, + check_nonzero_bandwidth, +) + + +class BaseNormMLP(nn.Module): + def __init__( + self, + emb_dim: int, + mlp_dim: int, + bandwidth: int, + in_channel: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs=None, + complex_mask: bool = True, + ): + + super().__init__() + if hidden_activation_kwargs is None: + hidden_activation_kwargs = {} + self.hidden_activation_kwargs = hidden_activation_kwargs + self.norm = nn.LayerNorm(emb_dim) + self.hidden = torch.jit.script( + nn.Sequential( + nn.Linear(in_features=emb_dim, out_features=mlp_dim), + 
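+                # Note (not an original comment): the hidden activation is resolved
+                # by class name from torch.nn.modules.activation, e.g.
+                # hidden_activation="Tanh" yields nn.Tanh(**hidden_activation_kwargs).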
activation.__dict__[hidden_activation](**self.hidden_activation_kwargs), + ) + ) + + self.bandwidth = bandwidth + self.in_channel = in_channel + + self.complex_mask = complex_mask + self.reim = 2 if complex_mask else 1 + self.glu_mult = 2 + + +class NormMLP(BaseNormMLP): + def __init__( + self, + emb_dim: int, + mlp_dim: int, + bandwidth: int, + in_channel: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs=None, + complex_mask: bool = True, + ) -> None: + super().__init__( + emb_dim=emb_dim, + mlp_dim=mlp_dim, + bandwidth=bandwidth, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + + self.output = torch.jit.script( + nn.Sequential( + nn.Linear( + in_features=mlp_dim, + out_features=bandwidth * in_channel * self.reim * 2, + ), + nn.GLU(dim=-1), + ) + ) + + def reshape_output(self, mb): + # print(mb.shape) + batch, n_time, _ = mb.shape + if self.complex_mask: + mb = mb.reshape( + batch, n_time, self.in_channel, self.bandwidth, self.reim + ).contiguous() + # print(mb.shape) + mb = torch.view_as_complex(mb) # (batch, n_time, in_channel, bandwidth) + else: + mb = mb.reshape(batch, n_time, self.in_channel, self.bandwidth) + + mb = torch.permute(mb, (0, 2, 3, 1)) # (batch, in_channel, bandwidth, n_time) + + return mb + + def forward(self, qb): + # qb = (batch, n_time, emb_dim) + + # if torch.any(torch.isnan(qb)): + # raise ValueError("qb0") + + qb = self.norm(qb) # (batch, n_time, emb_dim) + + # if torch.any(torch.isnan(qb)): + # raise ValueError("qb1") + + qb = self.hidden(qb) # (batch, n_time, mlp_dim) + # if torch.any(torch.isnan(qb)): + # raise ValueError("qb2") + mb = self.output(qb) # (batch, n_time, bandwidth * in_channel * reim) + # if torch.any(torch.isnan(qb)): + # raise ValueError("mb") + mb = self.reshape_output(mb) # (batch, in_channel, bandwidth, n_time) + + return mb + + +class MultAddNormMLP(NormMLP): + def __init__( + self, + emb_dim: int, + mlp_dim: int, + bandwidth: int, + in_channel: "int | None", + hidden_activation: str = "Tanh", + hidden_activation_kwargs=None, + complex_mask: bool = True, + ) -> None: + super().__init__( + emb_dim, + mlp_dim, + bandwidth, + in_channel, + hidden_activation, + hidden_activation_kwargs, + complex_mask, + ) + + self.output2 = torch.jit.script( + nn.Sequential( + nn.Linear( + in_features=mlp_dim, + out_features=bandwidth * in_channel * self.reim * 2, + ), + nn.GLU(dim=-1), + ) + ) + + def forward(self, qb): + + qb = self.norm(qb) # (batch, n_time, emb_dim) + qb = self.hidden(qb) # (batch, n_time, mlp_dim) + mmb = self.output(qb) # (batch, n_time, bandwidth * in_channel * reim) + mmb = self.reshape_output(mmb) # (batch, in_channel, bandwidth, n_time) + amb = self.output2(qb) # (batch, n_time, bandwidth * in_channel * reim) + amb = self.reshape_output(amb) # (batch, in_channel, bandwidth, n_time) + + return mmb, amb + + +class MaskEstimationModuleSuperBase(nn.Module): + pass + + +class MaskEstimationModuleBase(MaskEstimationModuleSuperBase): + def __init__( + self, + band_specs: List[Tuple[float, float]], + emb_dim: int, + mlp_dim: int, + in_channel: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict = None, + complex_mask: bool = True, + norm_mlp_cls: Type[nn.Module] = NormMLP, + norm_mlp_kwargs: Dict = None, + ) -> None: + super().__init__() + + self.band_widths = band_widths_from_specs(band_specs) + self.n_bands = len(band_specs) + + if hidden_activation_kwargs is None: + 
hidden_activation_kwargs = {} + + if norm_mlp_kwargs is None: + norm_mlp_kwargs = {} + + self.norm_mlp = nn.ModuleList( + [ + ( + norm_mlp_cls( + bandwidth=self.band_widths[b], + emb_dim=emb_dim, + mlp_dim=mlp_dim, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + **norm_mlp_kwargs, + ) + ) + for b in range(self.n_bands) + ] + ) + + def compute_masks(self, q): + batch, n_bands, n_time, emb_dim = q.shape + + masks = [] + + for b, nmlp in enumerate(self.norm_mlp): + # print(f"maskestim/{b:02d}") + qb = q[:, b, :, :] + mb = nmlp(qb) + masks.append(mb) + + return masks + + +class OverlappingMaskEstimationModule(MaskEstimationModuleBase): + def __init__( + self, + in_channel: int, + band_specs: List[Tuple[float, float]], + freq_weights: List[torch.Tensor], + n_freq: int, + emb_dim: int, + mlp_dim: int, + cond_dim: int = 0, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict = None, + complex_mask: bool = True, + norm_mlp_cls: Type[nn.Module] = NormMLP, + norm_mlp_kwargs: Dict = None, + use_freq_weights: bool = True, + ) -> None: + check_nonzero_bandwidth(band_specs) + check_no_gap(band_specs) + + # if cond_dim > 0: + # raise NotImplementedError + + super().__init__( + band_specs=band_specs, + emb_dim=emb_dim + cond_dim, + mlp_dim=mlp_dim, + in_channel=in_channel, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + norm_mlp_cls=norm_mlp_cls, + norm_mlp_kwargs=norm_mlp_kwargs, + ) + + self.n_freq = n_freq + self.band_specs = band_specs + self.in_channel = in_channel + + if freq_weights is not None: + for i, fw in enumerate(freq_weights): + self.register_buffer(f"freq_weights/{i}", fw) + + self.use_freq_weights = use_freq_weights + else: + self.use_freq_weights = False + + self.cond_dim = cond_dim + + def forward(self, q, cond=None): + # q = (batch, n_bands, n_time, emb_dim) + + batch, n_bands, n_time, emb_dim = q.shape + + if cond is not None: + print(cond) + if cond.ndim == 2: + cond = cond[:, None, None, :].expand(-1, n_bands, n_time, -1) + elif cond.ndim == 3: + assert cond.shape[1] == n_time + else: + raise ValueError(f"Invalid cond shape: {cond.shape}") + + q = torch.cat([q, cond], dim=-1) + elif self.cond_dim > 0: + cond = torch.ones( + (batch, n_bands, n_time, self.cond_dim), + device=q.device, + dtype=q.dtype, + ) + q = torch.cat([q, cond], dim=-1) + else: + pass + + mask_list = self.compute_masks( + q + ) # [n_bands * (batch, in_channel, bandwidth, n_time)] + + masks = torch.zeros( + (batch, self.in_channel, self.n_freq, n_time), + device=q.device, + dtype=mask_list[0].dtype, + ) + + for im, mask in enumerate(mask_list): + fstart, fend = self.band_specs[im] + if self.use_freq_weights: + fw = self.get_buffer(f"freq_weights/{im}")[:, None] + mask = mask * fw + masks[:, :, fstart:fend, :] += mask + + return masks + + +class MaskEstimationModule(OverlappingMaskEstimationModule): + def __init__( + self, + band_specs: List[Tuple[float, float]], + emb_dim: int, + mlp_dim: int, + in_channel: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict = None, + complex_mask: bool = True, + **kwargs, + ) -> None: + check_nonzero_bandwidth(band_specs) + check_no_gap(band_specs) + check_no_overlap(band_specs) + super().__init__( + in_channel=in_channel, + band_specs=band_specs, + freq_weights=None, + n_freq=None, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + 
hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + + def forward(self, q, cond=None): + # q = (batch, n_bands, n_time, emb_dim) + + masks = self.compute_masks( + q + ) # [n_bands * (batch, in_channel, bandwidth, n_time)] + + # TODO: currently this requires band specs to have no gap and no overlap + masks = torch.concat(masks, dim=2) # (batch, in_channel, n_freq, n_time) + + return masks diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/tfmodel.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/tfmodel.py new file mode 100644 index 0000000000000000000000000000000000000000..f482a118f5a9ac7f9f2d5bc36725155b3d8049db --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/tfmodel.py @@ -0,0 +1,320 @@ +import warnings + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.modules import rnn + +import torch.backends.cuda + + +class TimeFrequencyModellingModule(nn.Module): + def __init__(self) -> None: + super().__init__() + + +class ResidualRNN(nn.Module): + def __init__( + self, + emb_dim: int, + rnn_dim: int, + bidirectional: bool = True, + rnn_type: str = "LSTM", + use_batch_trick: bool = True, + use_layer_norm: bool = True, + ) -> None: + # n_group is the size of the 2nd dim + super().__init__() + + self.use_layer_norm = use_layer_norm + if use_layer_norm: + self.norm = nn.LayerNorm(emb_dim) + else: + self.norm = nn.GroupNorm(num_groups=emb_dim, num_channels=emb_dim) + + self.rnn = rnn.__dict__[rnn_type]( + input_size=emb_dim, + hidden_size=rnn_dim, + num_layers=1, + batch_first=True, + bidirectional=bidirectional, + ) + + self.fc = nn.Linear( + in_features=rnn_dim * (2 if bidirectional else 1), out_features=emb_dim + ) + + self.use_batch_trick = use_batch_trick + if not self.use_batch_trick: + warnings.warn("NOT USING BATCH TRICK IS EXTREMELY SLOW!!") + + def forward(self, z): + # z = (batch, n_uncrossed, n_across, emb_dim) + + z0 = torch.clone(z) + + # print(z.device) + + if self.use_layer_norm: + z = self.norm(z) # (batch, n_uncrossed, n_across, emb_dim) + else: + z = torch.permute( + z, (0, 3, 1, 2) + ) # (batch, emb_dim, n_uncrossed, n_across) + + z = self.norm(z) # (batch, emb_dim, n_uncrossed, n_across) + + z = torch.permute( + z, (0, 2, 3, 1) + ) # (batch, n_uncrossed, n_across, emb_dim) + + batch, n_uncrossed, n_across, emb_dim = z.shape + + if self.use_batch_trick: + z = torch.reshape(z, (batch * n_uncrossed, n_across, emb_dim)) + + z = self.rnn(z.contiguous())[ + 0 + ] # (batch * n_uncrossed, n_across, dir_rnn_dim) + + z = torch.reshape(z, (batch, n_uncrossed, n_across, -1)) + # (batch, n_uncrossed, n_across, dir_rnn_dim) + else: + # Note: this is EXTREMELY SLOW + zlist = [] + for i in range(n_uncrossed): + zi = self.rnn(z[:, i, :, :])[0] # (batch, n_across, emb_dim) + zlist.append(zi) + + z = torch.stack(zlist, dim=1) # (batch, n_uncrossed, n_across, dir_rnn_dim) + + z = self.fc(z) # (batch, n_uncrossed, n_across, emb_dim) + + z = z + z0 + + return z + + +class SeqBandModellingModule(TimeFrequencyModellingModule): + def __init__( + self, + n_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + parallel_mode=False, + ) -> None: + super().__init__() + self.seqband = nn.ModuleList([]) + + if parallel_mode: + for _ in range(n_modules): + self.seqband.append( + nn.ModuleList( + [ + ResidualRNN( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, 
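+                            # first RNN of the pair runs across time; its twin below runs across bands (see forward)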
+ ), + ResidualRNN( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ), + ] + ) + ) + else: + + for _ in range(2 * n_modules): + self.seqband.append( + ResidualRNN( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ) + ) + + self.parallel_mode = parallel_mode + + def forward(self, z): + # z = (batch, n_bands, n_time, emb_dim) + + if self.parallel_mode: + for sbm_pair in self.seqband: + # z: (batch, n_bands, n_time, emb_dim) + sbm_t, sbm_f = sbm_pair[0], sbm_pair[1] + zt = sbm_t(z) # (batch, n_bands, n_time, emb_dim) + zf = sbm_f(z.transpose(1, 2)) # (batch, n_time, n_bands, emb_dim) + z = zt + zf.transpose(1, 2) + else: + for sbm in self.seqband: + z = sbm(z) + z = z.transpose(1, 2) + + # (batch, n_bands, n_time, emb_dim) + # --> (batch, n_time, n_bands, emb_dim) + # OR + # (batch, n_time, n_bands, emb_dim) + # --> (batch, n_bands, n_time, emb_dim) + + q = z + return q # (batch, n_bands, n_time, emb_dim) + + +class ResidualTransformer(nn.Module): + def __init__( + self, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + dropout: float = 0.0, + ) -> None: + # n_group is the size of the 2nd dim + super().__init__() + + self.tf = nn.TransformerEncoderLayer( + d_model=emb_dim, nhead=4, dim_feedforward=rnn_dim, batch_first=True + ) + + self.is_causal = not bidirectional + self.dropout = dropout + + def forward(self, z): + batch, n_uncrossed, n_across, emb_dim = z.shape + z = torch.reshape(z, (batch * n_uncrossed, n_across, emb_dim)) + z = self.tf( + z, is_causal=self.is_causal + ) # (batch, n_uncrossed, n_across, emb_dim) + z = torch.reshape(z, (batch, n_uncrossed, n_across, emb_dim)) + + return z + + +class TransformerTimeFreqModule(TimeFrequencyModellingModule): + def __init__( + self, + n_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + dropout: float = 0.0, + ) -> None: + super().__init__() + self.norm = nn.LayerNorm(emb_dim) + self.seqband = nn.ModuleList([]) + + for _ in range(2 * n_modules): + self.seqband.append( + ResidualTransformer( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + dropout=dropout, + ) + ) + + def forward(self, z): + # z = (batch, n_bands, n_time, emb_dim) + z = self.norm(z) # (batch, n_bands, n_time, emb_dim) + + for sbm in self.seqband: + z = sbm(z) + z = z.transpose(1, 2) + + # (batch, n_bands, n_time, emb_dim) + # --> (batch, n_time, n_bands, emb_dim) + # OR + # (batch, n_time, n_bands, emb_dim) + # --> (batch, n_bands, n_time, emb_dim) + + q = z + return q # (batch, n_bands, n_time, emb_dim) + + +class ResidualConvolution(nn.Module): + def __init__( + self, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + dropout: float = 0.0, + ) -> None: + # n_group is the size of the 2nd dim + super().__init__() + self.norm = nn.InstanceNorm2d(emb_dim, affine=True) + + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=emb_dim, + out_channels=rnn_dim, + kernel_size=(3, 3), + padding="same", + stride=(1, 1), + ), + nn.Tanhshrink(), + ) + + self.is_causal = not bidirectional + self.dropout = dropout + + self.fc = nn.Conv2d( + in_channels=rnn_dim, + out_channels=emb_dim, + kernel_size=(1, 1), + padding="same", + stride=(1, 1), + ) + + def forward(self, z): + # z = (batch, n_uncrossed, n_across, emb_dim) + + z0 = torch.clone(z) + + z = self.norm(z) # (batch, n_uncrossed, n_across, emb_dim) + z = self.conv(z) # (batch, n_uncrossed, n_across, emb_dim) + z = self.fc(z) # (batch, 
n_uncrossed, n_across, emb_dim) + z = z + z0 + + return z + + +class ConvolutionalTimeFreqModule(TimeFrequencyModellingModule): + def __init__( + self, + n_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + dropout: float = 0.0, + ) -> None: + super().__init__() + self.seqband = torch.jit.script( + nn.Sequential( + *[ + ResidualConvolution( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + dropout=dropout, + ) + for _ in range(2 * n_modules) + ] + ) + ) + + def forward(self, z): + # z = (batch, n_bands, n_time, emb_dim) + + z = torch.permute(z, (0, 3, 1, 2)) # (batch, emb_dim, n_bands, n_time) + + z = self.seqband(z) # (batch, emb_dim, n_bands, n_time) + + z = torch.permute(z, (0, 2, 3, 1)) # (batch, n_bands, n_time, emb_dim) + + return z diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/utils.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0f470b180303c48e2e5a02b1319bff743c0c83db --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/utils.py @@ -0,0 +1,525 @@ +import os +from abc import abstractmethod +from typing import Any, Callable + +import numpy as np +import torch +from librosa import hz_to_midi, midi_to_hz +from torch import Tensor +from torchaudio import functional as taF +from spafe.fbanks import bark_fbanks +from spafe.utils.converters import erb2hz, hz2bark, hz2erb +from torchaudio.functional.functional import _create_triangular_filterbank + + +def band_widths_from_specs(band_specs): + return [e - i for i, e in band_specs] + + +def check_nonzero_bandwidth(band_specs): + # pprint(band_specs) + for fstart, fend in band_specs: + if fend - fstart <= 0: + raise ValueError("Bands cannot be zero-width") + + +def check_no_overlap(band_specs): + fend_prev = -1 + for fstart_curr, fend_curr in band_specs: + if fstart_curr <= fend_prev: + raise ValueError("Bands cannot overlap") + + +def check_no_gap(band_specs): + fstart, _ = band_specs[0] + assert fstart == 0 + + fend_prev = -1 + for fstart_curr, fend_curr in band_specs: + if fstart_curr - fend_prev > 1: + raise ValueError("Bands cannot leave gap") + fend_prev = fend_curr + + +class BandsplitSpecification: + def __init__(self, nfft: int, fs: int) -> None: + self.fs = fs + self.nfft = nfft + self.nyquist = fs / 2 + self.max_index = nfft // 2 + 1 + + self.split500 = self.hertz_to_index(500) + self.split1k = self.hertz_to_index(1000) + self.split2k = self.hertz_to_index(2000) + self.split4k = self.hertz_to_index(4000) + self.split8k = self.hertz_to_index(8000) + self.split16k = self.hertz_to_index(16000) + self.split20k = self.hertz_to_index(20000) + + self.above20k = [(self.split20k, self.max_index)] + self.above16k = [(self.split16k, self.split20k)] + self.above20k + + def index_to_hertz(self, index: int): + return index * self.fs / self.nfft + + def hertz_to_index(self, hz: float, round: bool = True): + index = hz * self.nfft / self.fs + + if round: + index = int(np.round(index)) + + return index + + def get_band_specs_with_bandwidth(self, start_index, end_index, bandwidth_hz): + band_specs = [] + lower = start_index + + while lower < end_index: + upper = int(np.floor(lower + self.hertz_to_index(bandwidth_hz))) + upper = min(upper, end_index) + + band_specs.append((lower, upper)) + lower = upper + + return band_specs + + @abstractmethod + def get_band_specs(self): + raise NotImplementedError + + +class 
VocalBandsplitSpecification(BandsplitSpecification): + def __init__(self, nfft: int, fs: int, version: str = "7") -> None: + super().__init__(nfft=nfft, fs=fs) + + self.version = version + + def get_band_specs(self): + return getattr(self, f"version{self.version}")() + + @property + def version1(self): + return self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.max_index, bandwidth_hz=1000 + ) + + def version2(self): + below16k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split16k, bandwidth_hz=1000 + ) + below20k = self.get_band_specs_with_bandwidth( + start_index=self.split16k, end_index=self.split20k, bandwidth_hz=2000 + ) + + return below16k + below20k + self.above20k + + def version3(self): + below8k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + + return below8k + below16k + self.above16k + + def version4(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + + return below1k + below8k + below16k + self.above16k + + def version5(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split16k, bandwidth_hz=1000 + ) + below20k = self.get_band_specs_with_bandwidth( + start_index=self.split16k, end_index=self.split20k, bandwidth_hz=2000 + ) + return below1k + below16k + below20k + self.above20k + + def version6(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split4k, bandwidth_hz=500 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + return below1k + below4k + below8k + below16k + self.above16k + + def version7(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split4k, bandwidth_hz=250 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=500 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=1000 + ) + below20k = self.get_band_specs_with_bandwidth( + start_index=self.split16k, end_index=self.split20k, bandwidth_hz=2000 + ) + return below1k + below4k + below8k + below16k + below20k + self.above20k + + +class OtherBandsplitSpecification(VocalBandsplitSpecification): + def __init__(self, nfft: int, fs: int) -> None: + super().__init__(nfft=nfft, fs=fs, version="7") + + +class BassBandsplitSpecification(BandsplitSpecification): + def __init__(self, nfft: int, fs: int, version: str = "7") -> None: + super().__init__(nfft=nfft, fs=fs) + + def get_band_specs(self): + below500 = 
self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split500, bandwidth_hz=50 + ) + below1k = self.get_band_specs_with_bandwidth( + start_index=self.split500, end_index=self.split1k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split4k, bandwidth_hz=500 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + above16k = [(self.split16k, self.max_index)] + + return below500 + below1k + below4k + below8k + below16k + above16k + + +class DrumBandsplitSpecification(BandsplitSpecification): + def __init__(self, nfft: int, fs: int) -> None: + super().__init__(nfft=nfft, fs=fs) + + def get_band_specs(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=50 + ) + below2k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split2k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split2k, end_index=self.split4k, bandwidth_hz=250 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=500 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=1000 + ) + above16k = [(self.split16k, self.max_index)] + + return below1k + below2k + below4k + below8k + below16k + above16k + + +class PerceptualBandsplitSpecification(BandsplitSpecification): + def __init__( + self, + nfft: int, + fs: int, + fbank_fn: Callable[[int, int, float, float, int], torch.Tensor], + n_bands: int, + f_min: float = 0.0, + f_max: float = None, + ) -> None: + super().__init__(nfft=nfft, fs=fs) + self.n_bands = n_bands + if f_max is None: + f_max = fs / 2 + + self.filterbank = fbank_fn(n_bands, fs, f_min, f_max, self.max_index) + + weight_per_bin = torch.sum(self.filterbank, dim=0, keepdim=True) # (1, n_freqs) + normalized_mel_fb = self.filterbank / weight_per_bin # (n_mels, n_freqs) + + freq_weights = [] + band_specs = [] + for i in range(self.n_bands): + active_bins = torch.nonzero(self.filterbank[i, :]).squeeze().tolist() + if isinstance(active_bins, int): + active_bins = (active_bins, active_bins) + if len(active_bins) == 0: + continue + start_index = active_bins[0] + end_index = active_bins[-1] + 1 + band_specs.append((start_index, end_index)) + freq_weights.append(normalized_mel_fb[i, start_index:end_index]) + + self.freq_weights = freq_weights + self.band_specs = band_specs + + def get_band_specs(self): + return self.band_specs + + def get_freq_weights(self): + return self.freq_weights + + def save_to_file(self, dir_path: str) -> None: + + os.makedirs(dir_path, exist_ok=True) + + import pickle + + with open(os.path.join(dir_path, "mel_bandsplit_spec.pkl"), "wb") as f: + pickle.dump( + { + "band_specs": self.band_specs, + "freq_weights": self.freq_weights, + "filterbank": self.filterbank, + }, + f, + ) + + +def mel_filterbank(n_bands, fs, f_min, f_max, n_freqs): + fb = taF.melscale_fbanks( + n_mels=n_bands, + sample_rate=fs, + f_min=f_min, + f_max=f_max, + n_freqs=n_freqs, + ).T + + fb[0, 0] = 1.0 + + return fb + + +class MelBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + 
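+            # band layout comes from a normalized torchaudio mel filterbank, so bands may overlap and carry per-bin frequency weights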
fbank_fn=mel_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +def musical_filterbank(n_bands, fs, f_min, f_max, n_freqs, scale="constant"): + + nfft = 2 * (n_freqs - 1) + df = fs / nfft + # init freqs + f_max = f_max or fs / 2 + f_min = f_min or 0 + f_min = fs / nfft + + n_octaves = np.log2(f_max / f_min) + n_octaves_per_band = n_octaves / n_bands + bandwidth_mult = np.power(2.0, n_octaves_per_band) + + low_midi = max(0, hz_to_midi(f_min)) + high_midi = hz_to_midi(f_max) + midi_points = np.linspace(low_midi, high_midi, n_bands) + hz_pts = midi_to_hz(midi_points) + + low_pts = hz_pts / bandwidth_mult + high_pts = hz_pts * bandwidth_mult + + low_bins = np.floor(low_pts / df).astype(int) + high_bins = np.ceil(high_pts / df).astype(int) + + fb = np.zeros((n_bands, n_freqs)) + + for i in range(n_bands): + fb[i, low_bins[i] : high_bins[i] + 1] = 1.0 + + fb[0, : low_bins[0]] = 1.0 + fb[-1, high_bins[-1] + 1 :] = 1.0 + + return torch.as_tensor(fb) + + +class MusicalBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=musical_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +def bark_filterbank(n_bands, fs, f_min, f_max, n_freqs): + nfft = 2 * (n_freqs - 1) + fb, _ = bark_fbanks.bark_filter_banks( + nfilts=n_bands, + nfft=nfft, + fs=fs, + low_freq=f_min, + high_freq=f_max, + scale="constant", + ) + + return torch.as_tensor(fb) + + +class BarkBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=bark_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +def triangular_bark_filterbank(n_bands, fs, f_min, f_max, n_freqs): + + all_freqs = torch.linspace(0, fs // 2, n_freqs) + + # calculate mel freq bins + m_min = hz2bark(f_min) + m_max = hz2bark(f_max) + + m_pts = torch.linspace(m_min, m_max, n_bands + 2) + f_pts = 600 * torch.sinh(m_pts / 6) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + fb = fb.T + + first_active_band = torch.nonzero(torch.sum(fb, dim=-1))[0, 0] + first_active_bin = torch.nonzero(fb[first_active_band, :])[0, 0] + + fb[first_active_band, :first_active_bin] = 1.0 + + return fb + + +class TriangularBarkBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=triangular_bark_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +def minibark_filterbank(n_bands, fs, f_min, f_max, n_freqs): + fb = bark_filterbank(n_bands, fs, f_min, f_max, n_freqs) + + fb[fb < np.sqrt(0.5)] = 0.0 + + return fb + + +class MiniBarkBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=minibark_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +def erb_filterbank( + n_bands: int, + fs: int, + f_min: float, + f_max: float, + n_freqs: int, +) -> Tensor: + # freq bins + A = (1000 * np.log(10)) / (24.7 * 4.37) + all_freqs = torch.linspace(0, fs // 2, n_freqs) + + # calculate mel freq bins + m_min = 
hz2erb(f_min) + m_max = hz2erb(f_max) + + m_pts = torch.linspace(m_min, m_max, n_bands + 2) + f_pts = (torch.pow(10, (m_pts / A)) - 1) / 0.00437 + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + fb = fb.T + + first_active_band = torch.nonzero(torch.sum(fb, dim=-1))[0, 0] + first_active_bin = torch.nonzero(fb[first_active_band, :])[0, 0] + + fb[first_active_band, :first_active_bin] = 1.0 + + return fb + + +class EquivalentRectangularBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=erb_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +if __name__ == "__main__": + import pandas as pd + + band_defs = [] + + for bands in [VocalBandsplitSpecification]: + band_name = bands.__name__.replace("BandsplitSpecification", "") + + mbs = bands(nfft=2048, fs=44100).get_band_specs() + + for i, (f_min, f_max) in enumerate(mbs): + band_defs.append( + {"band": band_name, "band_index": i, "f_min": f_min, "f_max": f_max} + ) + + df = pd.DataFrame(band_defs) + df.to_csv("vox7bands.csv", index=False) diff --git a/programs/music_separation_code/models/bandit/core/model/bsrnn/wrapper.py b/programs/music_separation_code/models/bandit/core/model/bsrnn/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..554ea9724c2c8cbc32fbcda46e66947fc0cf34fb --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/model/bsrnn/wrapper.py @@ -0,0 +1,829 @@ +from pprint import pprint +from typing import Dict, List, Optional, Tuple, Union + +import torch +from torch import nn + +from models.bandit.core.model._spectral import _SpectralComponent +from models.bandit.core.model.bsrnn.utils import ( + BarkBandsplitSpecification, + BassBandsplitSpecification, + DrumBandsplitSpecification, + EquivalentRectangularBandsplitSpecification, + MelBandsplitSpecification, + MusicalBandsplitSpecification, + OtherBandsplitSpecification, + TriangularBarkBandsplitSpecification, + VocalBandsplitSpecification, +) +from .core import ( + MultiSourceMultiMaskBandSplitCoreConv, + MultiSourceMultiMaskBandSplitCoreRNN, + MultiSourceMultiMaskBandSplitCoreTransformer, + MultiSourceMultiPatchingMaskBandSplitCoreRNN, + SingleMaskBandsplitCoreRNN, + SingleMaskBandsplitCoreTransformer, +) + +import pytorch_lightning as pl + + +def get_band_specs(band_specs, n_fft, fs, n_bands=None): + if band_specs in ["dnr:speech", "dnr:vox7", "musdb:vocals", "musdb:vox7"]: + bsm = VocalBandsplitSpecification(nfft=n_fft, fs=fs).get_band_specs() + freq_weights = None + overlapping_band = False + elif "tribark" in band_specs: + assert n_bands is not None + specs = TriangularBarkBandsplitSpecification(nfft=n_fft, fs=fs, n_bands=n_bands) + bsm = specs.get_band_specs() + freq_weights = specs.get_freq_weights() + overlapping_band = True + elif "bark" in band_specs: + assert n_bands is not None + specs = BarkBandsplitSpecification(nfft=n_fft, fs=fs, n_bands=n_bands) + bsm = specs.get_band_specs() + freq_weights = specs.get_freq_weights() + overlapping_band = True + elif "erb" in band_specs: + assert n_bands is not None + specs = EquivalentRectangularBandsplitSpecification( + nfft=n_fft, fs=fs, n_bands=n_bands + ) + bsm = specs.get_band_specs() + freq_weights = specs.get_freq_weights() + overlapping_band = True + elif "musical" in band_specs: + assert n_bands is not None + specs = MusicalBandsplitSpecification(nfft=n_fft, fs=fs, 
n_bands=n_bands) + bsm = specs.get_band_specs() + freq_weights = specs.get_freq_weights() + overlapping_band = True + elif band_specs == "dnr:mel" or "mel" in band_specs: + assert n_bands is not None + specs = MelBandsplitSpecification(nfft=n_fft, fs=fs, n_bands=n_bands) + bsm = specs.get_band_specs() + freq_weights = specs.get_freq_weights() + overlapping_band = True + else: + raise NameError + + return bsm, freq_weights, overlapping_band + + +def get_band_specs_map(band_specs_map, n_fft, fs, n_bands=None): + if band_specs_map == "musdb:all": + bsm = { + "vocals": VocalBandsplitSpecification(nfft=n_fft, fs=fs).get_band_specs(), + "drums": DrumBandsplitSpecification(nfft=n_fft, fs=fs).get_band_specs(), + "bass": BassBandsplitSpecification(nfft=n_fft, fs=fs).get_band_specs(), + "other": OtherBandsplitSpecification(nfft=n_fft, fs=fs).get_band_specs(), + } + freq_weights = None + overlapping_band = False + elif band_specs_map == "dnr:vox7": + bsm_, freq_weights, overlapping_band = get_band_specs( + "dnr:speech", n_fft, fs, n_bands + ) + bsm = {"speech": bsm_, "music": bsm_, "effects": bsm_} + elif "dnr:vox7:" in band_specs_map: + stem = band_specs_map.split(":")[-1] + bsm_, freq_weights, overlapping_band = get_band_specs( + "dnr:speech", n_fft, fs, n_bands + ) + bsm = {stem: bsm_} + else: + raise NameError + + return bsm, freq_weights, overlapping_band + + +class BandSplitWrapperBase(pl.LightningModule): + bsrnn: nn.Module + + def __init__(self, **kwargs): + super().__init__() + + +class SingleMaskMultiSourceBandSplitBase(BandSplitWrapperBase, _SpectralComponent): + def __init__( + self, + band_specs_map: Union[str, Dict[str, List[Tuple[float, float]]]], + fs: int = 44100, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + ) -> None: + super().__init__( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + ) + + if isinstance(band_specs_map, str): + self.band_specs_map, self.freq_weights, self.overlapping_band = ( + get_band_specs_map(band_specs_map, n_fft, fs, n_bands=n_bands) + ) + + self.stems = list(self.band_specs_map.keys()) + + def forward(self, batch): + audio = batch["audio"] + + with torch.no_grad(): + batch["spectrogram"] = {stem: self.stft(audio[stem]) for stem in audio} + + X = batch["spectrogram"]["mixture"] + length = batch["audio"]["mixture"].shape[-1] + + output = {"spectrogram": {}, "audio": {}} + + for stem, bsrnn in self.bsrnn.items(): + S = bsrnn(X) + s = self.istft(S, length) + output["spectrogram"][stem] = S + output["audio"][stem] = s + + return batch, output + + +class MultiMaskMultiSourceBandSplitBase(BandSplitWrapperBase, _SpectralComponent): + def __init__( + self, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + fs: int = 44100, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + ) -> None: + super().__init__( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + 
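+            # remaining kwargs configure the STFT/iSTFT front end (presumably set up by _SpectralComponent)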
window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + ) + + if isinstance(band_specs, str): + self.band_specs, self.freq_weights, self.overlapping_band = get_band_specs( + band_specs, n_fft, fs, n_bands + ) + + self.stems = stems + + def forward(self, batch): + # with torch.no_grad(): + audio = batch["audio"] + cond = batch.get("condition", None) + with torch.no_grad(): + batch["spectrogram"] = {stem: self.stft(audio[stem]) for stem in audio} + + X = batch["spectrogram"]["mixture"] + length = batch["audio"]["mixture"].shape[-1] + + output = self.bsrnn(X, cond=cond) + output["audio"] = {} + + for stem, S in output["spectrogram"].items(): + s = self.istft(S, length) + output["audio"][stem] = s + + return batch, output + + +class MultiMaskMultiSourceBandSplitBaseSimple(BandSplitWrapperBase, _SpectralComponent): + def __init__( + self, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + fs: int = 44100, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + ) -> None: + super().__init__( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + ) + + if isinstance(band_specs, str): + self.band_specs, self.freq_weights, self.overlapping_band = get_band_specs( + band_specs, n_fft, fs, n_bands + ) + + self.stems = stems + + def forward(self, batch): + with torch.no_grad(): + X = self.stft(batch) + length = batch.shape[-1] + output = self.bsrnn(X, cond=None) + res = [] + for stem, S in output["spectrogram"].items(): + s = self.istft(S, length) + res.append(s) + res = torch.stack(res, dim=1) + return res + + +class SingleMaskMultiSourceBandSplitRNN(SingleMaskMultiSourceBandSplitBase): + def __init__( + self, + in_channel: int, + band_specs_map: Union[str, Dict[str, List[Tuple[float, float]]]], + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + ) -> None: + super().__init__( + band_specs_map=band_specs_map, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + ) + + self.bsrnn = nn.ModuleDict( + { + src: SingleMaskBandsplitCoreRNN( + band_specs=specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + 
n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + for src, specs in self.band_specs_map.items() + } + ) + + +class SingleMaskMultiSourceBandSplitTransformer(SingleMaskMultiSourceBandSplitBase): + def __init__( + self, + in_channel: int, + band_specs_map: Union[str, Dict[str, List[Tuple[float, float]]]], + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + tf_dropout: float = 0.0, + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + ) -> None: + super().__init__( + band_specs_map=band_specs_map, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + ) + + self.bsrnn = nn.ModuleDict( + { + src: SingleMaskBandsplitCoreTransformer( + band_specs=specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + tf_dropout=tf_dropout, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + for src, specs in self.band_specs_map.items() + } + ) + + +class MultiMaskMultiSourceBandSplitRNN(MultiMaskMultiSourceBandSplitBase): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + cond_dim: int = 0, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + use_freq_weights: bool = True, + normalize_input: bool = False, + mult_add_mask: bool = False, + freeze_encoder: bool = False, + ) -> None: + super().__init__( + stems=stems, + band_specs=band_specs, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + 
onesided=onesided, + n_bands=n_bands, + ) + + self.bsrnn = MultiSourceMultiMaskBandSplitCoreRNN( + stems=stems, + band_specs=self.band_specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=self.overlapping_band, + freq_weights=self.freq_weights, + n_freq=n_fft // 2 + 1, + use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + self.normalize_input = normalize_input + self.cond_dim = cond_dim + + if freeze_encoder: + for param in self.bsrnn.band_split.parameters(): + param.requires_grad = False + + for param in self.bsrnn.tf_model.parameters(): + param.requires_grad = False + + +class MultiMaskMultiSourceBandSplitRNNSimple(MultiMaskMultiSourceBandSplitBaseSimple): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + cond_dim: int = 0, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + use_freq_weights: bool = True, + normalize_input: bool = False, + mult_add_mask: bool = False, + freeze_encoder: bool = False, + ) -> None: + super().__init__( + stems=stems, + band_specs=band_specs, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + n_bands=n_bands, + ) + + self.bsrnn = MultiSourceMultiMaskBandSplitCoreRNN( + stems=stems, + band_specs=self.band_specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=self.overlapping_band, + freq_weights=self.freq_weights, + n_freq=n_fft // 2 + 1, + use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + self.normalize_input = normalize_input + self.cond_dim = cond_dim + + if freeze_encoder: + for param in self.bsrnn.band_split.parameters(): + param.requires_grad = False + + for param in self.bsrnn.tf_model.parameters(): + param.requires_grad = False + + +class 
MultiMaskMultiSourceBandSplitTransformer(MultiMaskMultiSourceBandSplitBase): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + cond_dim: int = 0, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + use_freq_weights: bool = True, + normalize_input: bool = False, + mult_add_mask: bool = False, + ) -> None: + super().__init__( + stems=stems, + band_specs=band_specs, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + n_bands=n_bands, + ) + + self.bsrnn = MultiSourceMultiMaskBandSplitCoreTransformer( + stems=stems, + band_specs=self.band_specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=self.overlapping_band, + freq_weights=self.freq_weights, + n_freq=n_fft // 2 + 1, + use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + +class MultiMaskMultiSourceBandSplitConv(MultiMaskMultiSourceBandSplitBase): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + cond_dim: int = 0, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + use_freq_weights: bool = True, + normalize_input: bool = False, + mult_add_mask: bool = False, + ) -> None: + super().__init__( + stems=stems, + band_specs=band_specs, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + n_bands=n_bands, + ) + + self.bsrnn = 
MultiSourceMultiMaskBandSplitCoreConv( + stems=stems, + band_specs=self.band_specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + mlp_dim=mlp_dim, + cond_dim=cond_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=self.overlapping_band, + freq_weights=self.freq_weights, + n_freq=n_fft // 2 + 1, + use_freq_weights=use_freq_weights, + mult_add_mask=mult_add_mask, + ) + + +class PatchingMaskMultiSourceBandSplitRNN(MultiMaskMultiSourceBandSplitBase): + def __init__( + self, + in_channel: int, + stems: List[str], + band_specs: Union[str, List[Tuple[float, float]]], + kernel_norm_mlp_version: int = 1, + mask_kernel_freq: int = 3, + mask_kernel_time: int = 3, + conv_kernel_freq: int = 1, + conv_kernel_time: int = 1, + fs: int = 44100, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + n_bands: int = None, + ) -> None: + super().__init__( + stems=stems, + band_specs=band_specs, + fs=fs, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + n_bands=n_bands, + ) + + self.bsrnn = MultiSourceMultiPatchingMaskBandSplitCoreRNN( + stems=stems, + band_specs=self.band_specs, + in_channel=in_channel, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + overlapping_band=self.overlapping_band, + freq_weights=self.freq_weights, + n_freq=n_fft // 2 + 1, + mask_kernel_freq=mask_kernel_freq, + mask_kernel_time=mask_kernel_time, + conv_kernel_freq=conv_kernel_freq, + conv_kernel_time=conv_kernel_time, + kernel_norm_mlp_version=kernel_norm_mlp_version, + ) diff --git a/programs/music_separation_code/models/bandit/core/utils/__init__.py b/programs/music_separation_code/models/bandit/core/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/programs/music_separation_code/models/bandit/core/utils/audio.py b/programs/music_separation_code/models/bandit/core/utils/audio.py new file mode 100644 index 
0000000000000000000000000000000000000000..80a9d420c3c3ac20125d2456d7a860c20085daed --- /dev/null +++ b/programs/music_separation_code/models/bandit/core/utils/audio.py @@ -0,0 +1,412 @@ +from collections import defaultdict + +from tqdm import tqdm +from typing import Callable, Dict, List, Optional, Tuple + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +@torch.jit.script +def merge( + combined: torch.Tensor, + original_batch_size: int, + n_channel: int, + n_chunks: int, + chunk_size: int, +): + combined = torch.reshape( + combined, (original_batch_size, n_chunks, n_channel, chunk_size) + ) + combined = torch.permute(combined, (0, 2, 3, 1)).reshape( + original_batch_size * n_channel, chunk_size, n_chunks + ) + + return combined + + +@torch.jit.script +def unfold( + padded_audio: torch.Tensor, + original_batch_size: int, + n_channel: int, + chunk_size: int, + hop_size: int, +) -> torch.Tensor: + + unfolded_input = F.unfold( + padded_audio[:, :, None, :], kernel_size=(1, chunk_size), stride=(1, hop_size) + ) + + _, _, n_chunks = unfolded_input.shape + unfolded_input = unfolded_input.view( + original_batch_size, n_channel, chunk_size, n_chunks + ) + unfolded_input = torch.permute(unfolded_input, (0, 3, 1, 2)).reshape( + original_batch_size * n_chunks, n_channel, chunk_size + ) + + return unfolded_input + + +@torch.jit.script +# @torch.compile +def merge_chunks_all( + combined: torch.Tensor, + original_batch_size: int, + n_channel: int, + n_samples: int, + n_padded_samples: int, + n_chunks: int, + chunk_size: int, + hop_size: int, + edge_frame_pad_sizes: Tuple[int, int], + standard_window: torch.Tensor, + first_window: torch.Tensor, + last_window: torch.Tensor, +): + combined = merge(combined, original_batch_size, n_channel, n_chunks, chunk_size) + + combined = combined * standard_window[:, None].to(combined.device) + + combined = F.fold( + combined.to(torch.float32), + output_size=(1, n_padded_samples), + kernel_size=(1, chunk_size), + stride=(1, hop_size), + ) + + combined = combined.view(original_batch_size, n_channel, n_padded_samples) + + pad_front, pad_back = edge_frame_pad_sizes + combined = combined[..., pad_front:-pad_back] + + combined = combined[..., :n_samples] + + return combined + + # @torch.jit.script + + +def merge_chunks_edge( + combined: torch.Tensor, + original_batch_size: int, + n_channel: int, + n_samples: int, + n_padded_samples: int, + n_chunks: int, + chunk_size: int, + hop_size: int, + edge_frame_pad_sizes: Tuple[int, int], + standard_window: torch.Tensor, + first_window: torch.Tensor, + last_window: torch.Tensor, +): + combined = merge(combined, original_batch_size, n_channel, n_chunks, chunk_size) + + combined[..., 0] = combined[..., 0] * first_window + combined[..., -1] = combined[..., -1] * last_window + combined[..., 1:-1] = combined[..., 1:-1] * standard_window[:, None] + + combined = F.fold( + combined, + output_size=(1, n_padded_samples), + kernel_size=(1, chunk_size), + stride=(1, hop_size), + ) + + combined = combined.view(original_batch_size, n_channel, n_padded_samples) + + combined = combined[..., :n_samples] + + return combined + + +class BaseFader(nn.Module): + def __init__( + self, + chunk_size_second: float, + hop_size_second: float, + fs: int, + fade_edge_frames: bool, + batch_size: int, + ) -> None: + super().__init__() + + self.chunk_size = int(chunk_size_second * fs) + self.hop_size = int(hop_size_second * fs) + self.overlap_size = self.chunk_size - self.hop_size + self.fade_edge_frames = 
fade_edge_frames + self.batch_size = batch_size + + # @torch.jit.script + def prepare(self, audio): + + if self.fade_edge_frames: + audio = F.pad(audio, self.edge_frame_pad_sizes, mode="reflect") + + n_samples = audio.shape[-1] + n_chunks = int(np.ceil((n_samples - self.chunk_size) / self.hop_size) + 1) + + padded_size = (n_chunks - 1) * self.hop_size + self.chunk_size + pad_size = padded_size - n_samples + + padded_audio = F.pad(audio, (0, pad_size)) + + return padded_audio, n_chunks + + def forward( + self, + audio: torch.Tensor, + model_fn: Callable[[torch.Tensor], Dict[str, torch.Tensor]], + ): + + original_dtype = audio.dtype + original_device = audio.device + + audio = audio.to("cpu") + + original_batch_size, n_channel, n_samples = audio.shape + padded_audio, n_chunks = self.prepare(audio) + del audio + n_padded_samples = padded_audio.shape[-1] + + if n_channel > 1: + padded_audio = padded_audio.view( + original_batch_size * n_channel, 1, n_padded_samples + ) + + unfolded_input = unfold( + padded_audio, original_batch_size, n_channel, self.chunk_size, self.hop_size + ) + + n_total_chunks, n_channel, chunk_size = unfolded_input.shape + + n_batch = np.ceil(n_total_chunks / self.batch_size).astype(int) + + chunks_in = [ + unfolded_input[b * self.batch_size : (b + 1) * self.batch_size, ...].clone() + for b in range(n_batch) + ] + + all_chunks_out = defaultdict( + lambda: torch.zeros_like(unfolded_input, device="cpu") + ) + + # for b, cin in enumerate(tqdm(chunks_in)): + for b, cin in enumerate(chunks_in): + if torch.allclose(cin, torch.tensor(0.0)): + del cin + continue + + chunks_out = model_fn(cin.to(original_device)) + del cin + for s, c in chunks_out.items(): + all_chunks_out[s][ + b * self.batch_size : (b + 1) * self.batch_size, ... + ] = c.cpu() + del chunks_out + + del unfolded_input + del padded_audio + + if self.fade_edge_frames: + fn = merge_chunks_all + else: + fn = merge_chunks_edge + outputs = {} + + torch.cuda.empty_cache() + + for s, c in all_chunks_out.items(): + combined: torch.Tensor = fn( + c, + original_batch_size, + n_channel, + n_samples, + n_padded_samples, + n_chunks, + self.chunk_size, + self.hop_size, + self.edge_frame_pad_sizes, + self.standard_window, + self.__dict__.get("first_window", self.standard_window), + self.__dict__.get("last_window", self.standard_window), + ) + + outputs[s] = combined.to(dtype=original_dtype, device=original_device) + + return {"audio": outputs} + + # + # def old_forward( + # self, + # audio: torch.Tensor, + # model_fn: Callable[[torch.Tensor], Dict[str, torch.Tensor]], + # ): + # + # n_samples = audio.shape[-1] + # original_batch_size = audio.shape[0] + # + # padded_audio, n_chunks = self.prepare(audio) + # + # ndim = padded_audio.ndim + # broadcaster = [1 for _ in range(ndim - 1)] + [self.chunk_size] + # + # outputs = defaultdict( + # lambda: torch.zeros_like( + # padded_audio, device=audio.device, dtype=torch.float64 + # ) + # ) + # + # all_chunks_out = [] + # len_chunks_in = [] + # + # batch_size_ = int(self.batch_size // original_batch_size) + # for b in range(int(np.ceil(n_chunks / batch_size_))): + # chunks_in = [] + # for j in range(batch_size_): + # i = b * batch_size_ + j + # if i == n_chunks: + # break + # + # start = i * hop_size + # end = start + self.chunk_size + # chunk_in = padded_audio[..., start:end] + # chunks_in.append(chunk_in) + # + # chunks_in = torch.concat(chunks_in, dim=0) + # chunks_out = model_fn(chunks_in) + # all_chunks_out.append(chunks_out) + # len_chunks_in.append(len(chunks_in)) + # + # for b, 
(chunks_out, lci) in enumerate( + # zip(all_chunks_out, len_chunks_in) + # ): + # for stem in chunks_out: + # for j in range(lci // original_batch_size): + # i = b * batch_size_ + j + # + # if self.fade_edge_frames: + # window = self.standard_window + # else: + # if i == 0: + # window = self.first_window + # elif i == n_chunks - 1: + # window = self.last_window + # else: + # window = self.standard_window + # + # start = i * hop_size + # end = start + self.chunk_size + # + # chunk_out = chunks_out[stem][j * original_batch_size: (j + 1) * original_batch_size, + # ...] + # contrib = window.view(*broadcaster) * chunk_out + # outputs[stem][..., start:end] = ( + # outputs[stem][..., start:end] + contrib + # ) + # + # if self.fade_edge_frames: + # pad_front, pad_back = self.edge_frame_pad_sizes + # outputs = {k: v[..., pad_front:-pad_back] for k, v in + # outputs.items()} + # + # outputs = {k: v[..., :n_samples].to(audio.dtype) for k, v in + # outputs.items()} + # + # return { + # "audio": outputs + # } + + +class LinearFader(BaseFader): + def __init__( + self, + chunk_size_second: float, + hop_size_second: float, + fs: int, + fade_edge_frames: bool = False, + batch_size: int = 1, + ) -> None: + + assert hop_size_second >= chunk_size_second / 2 + + super().__init__( + chunk_size_second=chunk_size_second, + hop_size_second=hop_size_second, + fs=fs, + fade_edge_frames=fade_edge_frames, + batch_size=batch_size, + ) + + in_fade = torch.linspace(0.0, 1.0, self.overlap_size + 1)[:-1] + out_fade = torch.linspace(1.0, 0.0, self.overlap_size + 1)[1:] + center_ones = torch.ones(self.chunk_size - 2 * self.overlap_size) + inout_ones = torch.ones(self.overlap_size) + + # using nn.Parameters allows lightning to take care of devices for us + self.register_buffer( + "standard_window", torch.concat([in_fade, center_ones, out_fade]) + ) + + self.fade_edge_frames = fade_edge_frames + self.edge_frame_pad_size = (self.overlap_size, self.overlap_size) + + if not self.fade_edge_frames: + self.first_window = nn.Parameter( + torch.concat([inout_ones, center_ones, out_fade]), requires_grad=False + ) + self.last_window = nn.Parameter( + torch.concat([in_fade, center_ones, inout_ones]), requires_grad=False + ) + + +class OverlapAddFader(BaseFader): + def __init__( + self, + window_type: str, + chunk_size_second: float, + hop_size_second: float, + fs: int, + batch_size: int = 1, + ) -> None: + assert (chunk_size_second / hop_size_second) % 2 == 0 + assert int(chunk_size_second * fs) % 2 == 0 + + super().__init__( + chunk_size_second=chunk_size_second, + hop_size_second=hop_size_second, + fs=fs, + fade_edge_frames=True, + batch_size=batch_size, + ) + + self.hop_multiplier = self.chunk_size / (2 * self.hop_size) + # print(f"hop multiplier: {self.hop_multiplier}") + + self.edge_frame_pad_sizes = (2 * self.overlap_size, 2 * self.overlap_size) + + self.register_buffer( + "standard_window", + torch.windows.__dict__[window_type]( + self.chunk_size, + sym=False, # dtype=torch.float64 + ) + / self.hop_multiplier, + ) + + +if __name__ == "__main__": + import torchaudio as ta + + fs = 44100 + ola = OverlapAddFader("hann", 6.0, 1.0, fs, batch_size=16) + audio_, _ = ta.load( + "$DATA_ROOT/MUSDB18/HQ/canonical/test/BKS - Too " "Much/vocals.wav" + ) + audio_ = audio_[None, ...] 
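+    # sanity check: with an identity model_fn the overlap-add fader should reconstruct the input, verified by the allclose print below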
+ out = ola(audio_, lambda x: {"stem": x})["audio"]["stem"] + print(torch.allclose(out, audio_)) diff --git a/programs/music_separation_code/models/bandit/model_from_config.py b/programs/music_separation_code/models/bandit/model_from_config.py new file mode 100644 index 0000000000000000000000000000000000000000..9735bda0592a4e9380e918022e8ffd141f15b383 --- /dev/null +++ b/programs/music_separation_code/models/bandit/model_from_config.py @@ -0,0 +1,29 @@ +import sys +import os.path +import torch + +code_path = os.path.dirname(os.path.abspath(__file__)) + "/" +sys.path.append(code_path) + +import yaml +from ml_collections import ConfigDict + +torch.set_float32_matmul_precision("medium") + + +def get_model( + config_path, + weights_path, + device, +): + from models.bandit.core.model import MultiMaskMultiSourceBandSplitRNNSimple + + f = open(config_path) + config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader)) + f.close() + + model = MultiMaskMultiSourceBandSplitRNNSimple(**config.model) + d = torch.load(code_path + "model_bandit_plus_dnr_sdr_11.47.chpt") + model.load_state_dict(d) + model.to(device) + return model, config diff --git a/programs/music_separation_code/models/bandit_v2/bandit.py b/programs/music_separation_code/models/bandit_v2/bandit.py new file mode 100644 index 0000000000000000000000000000000000000000..fba32962f242ce44f64ab7d8ca433290cbe6d7e2 --- /dev/null +++ b/programs/music_separation_code/models/bandit_v2/bandit.py @@ -0,0 +1,363 @@ +from typing import Dict, List, Optional + +import torch +import torchaudio as ta +from torch import nn +import pytorch_lightning as pl + +from .bandsplit import BandSplitModule +from .maskestim import OverlappingMaskEstimationModule +from .tfmodel import SeqBandModellingModule +from .utils import MusicalBandsplitSpecification + + +class BaseEndToEndModule(pl.LightningModule): + def __init__( + self, + ) -> None: + super().__init__() + + +class BaseBandit(BaseEndToEndModule): + def __init__( + self, + in_channels: int, + fs: int, + band_type: str = "musical", + n_bands: int = 64, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + ): + super().__init__() + + self.in_channels = in_channels + + self.instantitate_spectral( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + normalized=normalized, + center=center, + pad_mode=pad_mode, + onesided=onesided, + ) + + self.instantiate_bandsplit( + in_channels=in_channels, + band_type=band_type, + n_bands=n_bands, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + n_fft=n_fft, + fs=fs, + ) + + self.instantiate_tf_modelling( + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ) + + def instantitate_spectral( + self, + n_fft: int = 2048, + win_length: Optional[int] = 2048, + hop_length: int = 
512, + window_fn: str = "hann_window", + wkwargs: Optional[Dict] = None, + power: Optional[int] = None, + normalized: bool = True, + center: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + ): + assert power is None + + window_fn = torch.__dict__[window_fn] + + self.stft = ta.transforms.Spectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + pad_mode=pad_mode, + pad=0, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + normalized=normalized, + center=center, + onesided=onesided, + ) + + self.istft = ta.transforms.InverseSpectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + pad_mode=pad_mode, + pad=0, + window_fn=window_fn, + wkwargs=wkwargs, + normalized=normalized, + center=center, + onesided=onesided, + ) + + def instantiate_bandsplit( + self, + in_channels: int, + band_type: str = "musical", + n_bands: int = 64, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + emb_dim: int = 128, + n_fft: int = 2048, + fs: int = 44100, + ): + assert band_type == "musical" + + self.band_specs = MusicalBandsplitSpecification( + nfft=n_fft, fs=fs, n_bands=n_bands + ) + + self.band_split = BandSplitModule( + in_channels=in_channels, + band_specs=self.band_specs.get_band_specs(), + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + emb_dim=emb_dim, + ) + + def instantiate_tf_modelling( + self, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + ): + try: + self.tf_model = torch.compile( + SeqBandModellingModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ), + disable=True, + ) + except Exception as e: + self.tf_model = SeqBandModellingModule( + n_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ) + + def mask(self, x, m): + return x * m + + def forward(self, batch, mode="train"): + # Model takes mono as input we give stereo, so we do process of each channel independently + init_shape = batch.shape + if not isinstance(batch, dict): + mono = batch.view(-1, 1, batch.shape[-1]) + batch = {"mixture": {"audio": mono}} + + with torch.no_grad(): + mixture = batch["mixture"]["audio"] + + x = self.stft(mixture) + batch["mixture"]["spectrogram"] = x + + if "sources" in batch.keys(): + for stem in batch["sources"].keys(): + s = batch["sources"][stem]["audio"] + s = self.stft(s) + batch["sources"][stem]["spectrogram"] = s + + batch = self.separate(batch) + + if 1: + b = [] + for s in self.stems: + # We need to obtain stereo again + r = batch["estimates"][s]["audio"].view( + -1, init_shape[1], init_shape[2] + ) + b.append(r) + # And we need to return back tensor and not independent stems + batch = torch.stack(b, dim=1) + return batch + + def encode(self, batch): + x = batch["mixture"]["spectrogram"] + length = batch["mixture"]["audio"].shape[-1] + + z = self.band_split(x) # (batch, emb_dim, n_band, n_time) + q = self.tf_model(z) # (batch, emb_dim, n_band, n_time) + + return x, q, length + + def separate(self, batch): + raise NotImplementedError + + +class Bandit(BaseBandit): + def __init__( + self, + in_channels: int, + stems: List[str], + band_type: str = "musical", + n_bands: int 
= 64, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + n_sqm_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + mlp_dim: int = 512, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict | None = None, + complex_mask: bool = True, + use_freq_weights: bool = True, + n_fft: int = 2048, + win_length: int | None = 2048, + hop_length: int = 512, + window_fn: str = "hann_window", + wkwargs: Dict | None = None, + power: int | None = None, + center: bool = True, + normalized: bool = True, + pad_mode: str = "constant", + onesided: bool = True, + fs: int = 44100, + stft_precisions="32", + bandsplit_precisions="bf16", + tf_model_precisions="bf16", + mask_estim_precisions="bf16", + ): + super().__init__( + in_channels=in_channels, + band_type=band_type, + n_bands=n_bands, + require_no_overlap=require_no_overlap, + require_no_gap=require_no_gap, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + n_sqm_modules=n_sqm_modules, + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + window_fn=window_fn, + wkwargs=wkwargs, + power=power, + center=center, + normalized=normalized, + pad_mode=pad_mode, + onesided=onesided, + fs=fs, + ) + + self.stems = stems + + self.instantiate_mask_estim( + in_channels=in_channels, + stems=stems, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + n_freq=n_fft // 2 + 1, + use_freq_weights=use_freq_weights, + ) + + def instantiate_mask_estim( + self, + in_channels: int, + stems: List[str], + emb_dim: int, + mlp_dim: int, + hidden_activation: str, + hidden_activation_kwargs: Optional[Dict] = None, + complex_mask: bool = True, + n_freq: Optional[int] = None, + use_freq_weights: bool = False, + ): + if hidden_activation_kwargs is None: + hidden_activation_kwargs = {} + + assert n_freq is not None + + self.mask_estim = nn.ModuleDict( + { + stem: OverlappingMaskEstimationModule( + band_specs=self.band_specs.get_band_specs(), + freq_weights=self.band_specs.get_freq_weights(), + n_freq=n_freq, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + in_channels=in_channels, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + use_freq_weights=use_freq_weights, + ) + for stem in stems + } + ) + + def separate(self, batch): + batch["estimates"] = {} + + x, q, length = self.encode(batch) + + for stem, mem in self.mask_estim.items(): + m = mem(q) + + s = self.mask(x, m.to(x.dtype)) + s = torch.reshape(s, x.shape) + batch["estimates"][stem] = { + "audio": self.istft(s, length), + "spectrogram": s, + } + + return batch diff --git a/programs/music_separation_code/models/bandit_v2/bandsplit.py b/programs/music_separation_code/models/bandit_v2/bandsplit.py new file mode 100644 index 0000000000000000000000000000000000000000..a14ea52bfa318264d536c9f934d0e28db63e15dc --- /dev/null +++ b/programs/music_separation_code/models/bandit_v2/bandsplit.py @@ -0,0 +1,130 @@ +from typing import List, Tuple + +import torch +from torch import nn +from torch.utils.checkpoint import checkpoint_sequential + +from .utils import ( + band_widths_from_specs, + check_no_gap, + check_no_overlap, + 
check_nonzero_bandwidth, +) + + +class NormFC(nn.Module): + def __init__( + self, + emb_dim: int, + bandwidth: int, + in_channels: int, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + ) -> None: + super().__init__() + + if not treat_channel_as_feature: + raise NotImplementedError + + self.treat_channel_as_feature = treat_channel_as_feature + + if normalize_channel_independently: + raise NotImplementedError + + reim = 2 + + norm = nn.LayerNorm(in_channels * bandwidth * reim) + + fc_in = bandwidth * reim + + if treat_channel_as_feature: + fc_in *= in_channels + else: + assert emb_dim % in_channels == 0 + emb_dim = emb_dim // in_channels + + fc = nn.Linear(fc_in, emb_dim) + + self.combined = nn.Sequential(norm, fc) + + def forward(self, xb): + return checkpoint_sequential(self.combined, 1, xb, use_reentrant=False) + + +class BandSplitModule(nn.Module): + def __init__( + self, + band_specs: List[Tuple[float, float]], + emb_dim: int, + in_channels: int, + require_no_overlap: bool = False, + require_no_gap: bool = True, + normalize_channel_independently: bool = False, + treat_channel_as_feature: bool = True, + ) -> None: + super().__init__() + + check_nonzero_bandwidth(band_specs) + + if require_no_gap: + check_no_gap(band_specs) + + if require_no_overlap: + check_no_overlap(band_specs) + + self.band_specs = band_specs + # list of [fstart, fend) in index. + # Note that fend is exclusive. + self.band_widths = band_widths_from_specs(band_specs) + self.n_bands = len(band_specs) + self.emb_dim = emb_dim + + try: + self.norm_fc_modules = nn.ModuleList( + [ # type: ignore + torch.compile( + NormFC( + emb_dim=emb_dim, + bandwidth=bw, + in_channels=in_channels, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + ), + disable=True, + ) + for bw in self.band_widths + ] + ) + except Exception as e: + self.norm_fc_modules = nn.ModuleList( + [ # type: ignore + NormFC( + emb_dim=emb_dim, + bandwidth=bw, + in_channels=in_channels, + normalize_channel_independently=normalize_channel_independently, + treat_channel_as_feature=treat_channel_as_feature, + ) + for bw in self.band_widths + ] + ) + + def forward(self, x: torch.Tensor): + # x = complex spectrogram (batch, in_chan, n_freq, n_time) + + batch, in_chan, band_width, n_time = x.shape + + z = torch.zeros( + size=(batch, self.n_bands, n_time, self.emb_dim), device=x.device + ) + + x = torch.permute(x, (0, 3, 1, 2)).contiguous() + + for i, nfm in enumerate(self.norm_fc_modules): + fstart, fend = self.band_specs[i] + xb = x[:, :, :, fstart:fend] + xb = torch.view_as_real(xb) + xb = torch.reshape(xb, (batch, n_time, -1)) + z[:, i, :, :] = nfm(xb) + + return z diff --git a/programs/music_separation_code/models/bandit_v2/film.py b/programs/music_separation_code/models/bandit_v2/film.py new file mode 100644 index 0000000000000000000000000000000000000000..253594ad0154cee4ef7467036ed71f4d5f836db8 --- /dev/null +++ b/programs/music_separation_code/models/bandit_v2/film.py @@ -0,0 +1,23 @@ +from torch import nn +import torch + + +class FiLM(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, gamma, beta): + return gamma * x + beta + + +class BTFBroadcastedFiLM(nn.Module): + def __init__(self): + super().__init__() + self.film = FiLM() + + def forward(self, x, gamma, beta): + + gamma = gamma[None, None, None, :] + beta = beta[None, None, None, :] + + return self.film(x, gamma, beta) diff --git 
a/programs/music_separation_code/models/bandit_v2/maskestim.py b/programs/music_separation_code/models/bandit_v2/maskestim.py new file mode 100644 index 0000000000000000000000000000000000000000..65215d86a5e94dafdb71744aafadf7aaab93330d --- /dev/null +++ b/programs/music_separation_code/models/bandit_v2/maskestim.py @@ -0,0 +1,281 @@ +from typing import Dict, List, Optional, Tuple, Type + +import torch +from torch import nn +from torch.nn.modules import activation +from torch.utils.checkpoint import checkpoint_sequential + +from .utils import ( + band_widths_from_specs, + check_no_gap, + check_no_overlap, + check_nonzero_bandwidth, +) + + +class BaseNormMLP(nn.Module): + def __init__( + self, + emb_dim: int, + mlp_dim: int, + bandwidth: int, + in_channels: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs=None, + complex_mask: bool = True, + ): + super().__init__() + if hidden_activation_kwargs is None: + hidden_activation_kwargs = {} + self.hidden_activation_kwargs = hidden_activation_kwargs + self.norm = nn.LayerNorm(emb_dim) + self.hidden = nn.Sequential( + nn.Linear(in_features=emb_dim, out_features=mlp_dim), + activation.__dict__[hidden_activation](**self.hidden_activation_kwargs), + ) + + self.bandwidth = bandwidth + self.in_channels = in_channels + + self.complex_mask = complex_mask + self.reim = 2 if complex_mask else 1 + self.glu_mult = 2 + + +class NormMLP(BaseNormMLP): + def __init__( + self, + emb_dim: int, + mlp_dim: int, + bandwidth: int, + in_channels: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs=None, + complex_mask: bool = True, + ) -> None: + super().__init__( + emb_dim=emb_dim, + mlp_dim=mlp_dim, + bandwidth=bandwidth, + in_channels=in_channels, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + + self.output = nn.Sequential( + nn.Linear( + in_features=mlp_dim, + out_features=bandwidth * in_channels * self.reim * 2, + ), + nn.GLU(dim=-1), + ) + + try: + self.combined = torch.compile( + nn.Sequential(self.norm, self.hidden, self.output), disable=True + ) + except Exception as e: + self.combined = nn.Sequential(self.norm, self.hidden, self.output) + + def reshape_output(self, mb): + # print(mb.shape) + batch, n_time, _ = mb.shape + if self.complex_mask: + mb = mb.reshape( + batch, n_time, self.in_channels, self.bandwidth, self.reim + ).contiguous() + # print(mb.shape) + mb = torch.view_as_complex(mb) # (batch, n_time, in_channels, bandwidth) + else: + mb = mb.reshape(batch, n_time, self.in_channels, self.bandwidth) + + mb = torch.permute(mb, (0, 2, 3, 1)) # (batch, in_channels, bandwidth, n_time) + + return mb + + def forward(self, qb): + # qb = (batch, n_time, emb_dim) + # qb = self.norm(qb) # (batch, n_time, emb_dim) + # qb = self.hidden(qb) # (batch, n_time, mlp_dim) + # mb = self.output(qb) # (batch, n_time, bandwidth * in_channels * reim) + + mb = checkpoint_sequential(self.combined, 2, qb, use_reentrant=False) + mb = self.reshape_output(mb) # (batch, in_channels, bandwidth, n_time) + + return mb + + +class MaskEstimationModuleSuperBase(nn.Module): + pass + + +class MaskEstimationModuleBase(MaskEstimationModuleSuperBase): + def __init__( + self, + band_specs: List[Tuple[float, float]], + emb_dim: int, + mlp_dim: int, + in_channels: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict = None, + complex_mask: bool = True, + norm_mlp_cls: Type[nn.Module] = NormMLP, + norm_mlp_kwargs: Dict = None, + ) -> None: 
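+        # Shared base for the band-wise mask estimators: one NormMLP is built per
+        # band, taking that band's embedding (batch, n_time, emb_dim) and producing
+        # a (complex) mask of shape (batch, in_channels, bandwidth, n_time).
+        # compute_mask()/compute_masks() below simply route band b of the TF-model
+        # output through its corresponding NormMLP.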
+ super().__init__() + + self.band_widths = band_widths_from_specs(band_specs) + self.n_bands = len(band_specs) + + if hidden_activation_kwargs is None: + hidden_activation_kwargs = {} + + if norm_mlp_kwargs is None: + norm_mlp_kwargs = {} + + self.norm_mlp = nn.ModuleList( + [ + norm_mlp_cls( + bandwidth=self.band_widths[b], + emb_dim=emb_dim, + mlp_dim=mlp_dim, + in_channels=in_channels, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + **norm_mlp_kwargs, + ) + for b in range(self.n_bands) + ] + ) + + def compute_masks(self, q): + batch, n_bands, n_time, emb_dim = q.shape + + masks = [] + + for b, nmlp in enumerate(self.norm_mlp): + # print(f"maskestim/{b:02d}") + qb = q[:, b, :, :] + mb = nmlp(qb) + masks.append(mb) + + return masks + + def compute_mask(self, q, b): + batch, n_bands, n_time, emb_dim = q.shape + qb = q[:, b, :, :] + mb = self.norm_mlp[b](qb) + return mb + + +class OverlappingMaskEstimationModule(MaskEstimationModuleBase): + def __init__( + self, + in_channels: int, + band_specs: List[Tuple[float, float]], + freq_weights: List[torch.Tensor], + n_freq: int, + emb_dim: int, + mlp_dim: int, + cond_dim: int = 0, + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict = None, + complex_mask: bool = True, + norm_mlp_cls: Type[nn.Module] = NormMLP, + norm_mlp_kwargs: Dict = None, + use_freq_weights: bool = False, + ) -> None: + check_nonzero_bandwidth(band_specs) + check_no_gap(band_specs) + + if cond_dim > 0: + raise NotImplementedError + + super().__init__( + band_specs=band_specs, + emb_dim=emb_dim + cond_dim, + mlp_dim=mlp_dim, + in_channels=in_channels, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + norm_mlp_cls=norm_mlp_cls, + norm_mlp_kwargs=norm_mlp_kwargs, + ) + + self.n_freq = n_freq + self.band_specs = band_specs + self.in_channels = in_channels + + if freq_weights is not None and use_freq_weights: + for i, fw in enumerate(freq_weights): + self.register_buffer(f"freq_weights/{i}", fw) + + self.use_freq_weights = use_freq_weights + else: + self.use_freq_weights = False + + def forward(self, q): + # q = (batch, n_bands, n_time, emb_dim) + + batch, n_bands, n_time, emb_dim = q.shape + + masks = torch.zeros( + (batch, self.in_channels, self.n_freq, n_time), + device=q.device, + dtype=torch.complex64, + ) + + for im in range(n_bands): + fstart, fend = self.band_specs[im] + + mask = self.compute_mask(q, im) + + if self.use_freq_weights: + fw = self.get_buffer(f"freq_weights/{im}")[:, None] + mask = mask * fw + masks[:, :, fstart:fend, :] += mask + + return masks + + +class MaskEstimationModule(OverlappingMaskEstimationModule): + def __init__( + self, + band_specs: List[Tuple[float, float]], + emb_dim: int, + mlp_dim: int, + in_channels: Optional[int], + hidden_activation: str = "Tanh", + hidden_activation_kwargs: Dict = None, + complex_mask: bool = True, + **kwargs, + ) -> None: + check_nonzero_bandwidth(band_specs) + check_no_gap(band_specs) + check_no_overlap(band_specs) + super().__init__( + in_channels=in_channels, + band_specs=band_specs, + freq_weights=None, + n_freq=None, + emb_dim=emb_dim, + mlp_dim=mlp_dim, + hidden_activation=hidden_activation, + hidden_activation_kwargs=hidden_activation_kwargs, + complex_mask=complex_mask, + ) + + def forward(self, q, cond=None): + # q = (batch, n_bands, n_time, emb_dim) + + masks = self.compute_masks( + q + ) # [n_bands * (batch, in_channels, bandwidth, n_time)] + + # 
TODO: currently this requires band specs to have no gap and no overlap + masks = torch.concat(masks, dim=2) # (batch, in_channels, n_freq, n_time) + + return masks diff --git a/programs/music_separation_code/models/bandit_v2/tfmodel.py b/programs/music_separation_code/models/bandit_v2/tfmodel.py new file mode 100644 index 0000000000000000000000000000000000000000..21aef03d1f0e814c20db05fe7d14f8019f07713b --- /dev/null +++ b/programs/music_separation_code/models/bandit_v2/tfmodel.py @@ -0,0 +1,145 @@ +import warnings + +import torch +import torch.backends.cuda +from torch import nn +from torch.nn.modules import rnn +from torch.utils.checkpoint import checkpoint_sequential + + +class TimeFrequencyModellingModule(nn.Module): + def __init__(self) -> None: + super().__init__() + + +class ResidualRNN(nn.Module): + def __init__( + self, + emb_dim: int, + rnn_dim: int, + bidirectional: bool = True, + rnn_type: str = "LSTM", + use_batch_trick: bool = True, + use_layer_norm: bool = True, + ) -> None: + # n_group is the size of the 2nd dim + super().__init__() + + assert use_layer_norm + assert use_batch_trick + + self.use_layer_norm = use_layer_norm + self.norm = nn.LayerNorm(emb_dim) + self.rnn = rnn.__dict__[rnn_type]( + input_size=emb_dim, + hidden_size=rnn_dim, + num_layers=1, + batch_first=True, + bidirectional=bidirectional, + ) + + self.fc = nn.Linear( + in_features=rnn_dim * (2 if bidirectional else 1), out_features=emb_dim + ) + + self.use_batch_trick = use_batch_trick + if not self.use_batch_trick: + warnings.warn("NOT USING BATCH TRICK IS EXTREMELY SLOW!!") + + def forward(self, z): + # z = (batch, n_uncrossed, n_across, emb_dim) + + z0 = torch.clone(z) + z = self.norm(z) + + batch, n_uncrossed, n_across, emb_dim = z.shape + z = torch.reshape(z, (batch * n_uncrossed, n_across, emb_dim)) + z = self.rnn(z)[0] + z = torch.reshape(z, (batch, n_uncrossed, n_across, -1)) + + z = self.fc(z) # (batch, n_uncrossed, n_across, emb_dim) + + z = z + z0 + + return z + + +class Transpose(nn.Module): + def __init__(self, dim0: int, dim1: int) -> None: + super().__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, z): + return z.transpose(self.dim0, self.dim1) + + +class SeqBandModellingModule(TimeFrequencyModellingModule): + def __init__( + self, + n_modules: int = 12, + emb_dim: int = 128, + rnn_dim: int = 256, + bidirectional: bool = True, + rnn_type: str = "LSTM", + parallel_mode=False, + ) -> None: + super().__init__() + + self.n_modules = n_modules + + if parallel_mode: + self.seqband = nn.ModuleList([]) + for _ in range(n_modules): + self.seqband.append( + nn.ModuleList( + [ + ResidualRNN( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ), + ResidualRNN( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ), + ] + ) + ) + else: + seqband = [] + for _ in range(2 * n_modules): + seqband += [ + ResidualRNN( + emb_dim=emb_dim, + rnn_dim=rnn_dim, + bidirectional=bidirectional, + rnn_type=rnn_type, + ), + Transpose(1, 2), + ] + + self.seqband = nn.Sequential(*seqband) + + self.parallel_mode = parallel_mode + + def forward(self, z): + # z = (batch, n_bands, n_time, emb_dim) + + if self.parallel_mode: + for sbm_pair in self.seqband: + # z: (batch, n_bands, n_time, emb_dim) + sbm_t, sbm_f = sbm_pair[0], sbm_pair[1] + zt = sbm_t(z) # (batch, n_bands, n_time, emb_dim) + zf = sbm_f(z.transpose(1, 2)) # (batch, n_time, n_bands, emb_dim) + z = zt + zf.transpose(1, 2) + else: + z = checkpoint_sequential( + 
self.seqband, self.n_modules, z, use_reentrant=False + ) + + q = z + return q # (batch, n_bands, n_time, emb_dim) diff --git a/programs/music_separation_code/models/bandit_v2/utils.py b/programs/music_separation_code/models/bandit_v2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ad4eab5d8c5b5396ed717f5b9c365a6900eddd2f --- /dev/null +++ b/programs/music_separation_code/models/bandit_v2/utils.py @@ -0,0 +1,523 @@ +import os +from abc import abstractmethod +from typing import Callable + +import numpy as np +import torch +from librosa import hz_to_midi, midi_to_hz +from torchaudio import functional as taF + +# from spafe.fbanks import bark_fbanks +# from spafe.utils.converters import erb2hz, hz2bark, hz2erb + + +def band_widths_from_specs(band_specs): + return [e - i for i, e in band_specs] + + +def check_nonzero_bandwidth(band_specs): + # pprint(band_specs) + for fstart, fend in band_specs: + if fend - fstart <= 0: + raise ValueError("Bands cannot be zero-width") + + +def check_no_overlap(band_specs): + fend_prev = -1 + for fstart_curr, fend_curr in band_specs: + if fstart_curr <= fend_prev: + raise ValueError("Bands cannot overlap") + + +def check_no_gap(band_specs): + fstart, _ = band_specs[0] + assert fstart == 0 + + fend_prev = -1 + for fstart_curr, fend_curr in band_specs: + if fstart_curr - fend_prev > 1: + raise ValueError("Bands cannot leave gap") + fend_prev = fend_curr + + +class BandsplitSpecification: + def __init__(self, nfft: int, fs: int) -> None: + self.fs = fs + self.nfft = nfft + self.nyquist = fs / 2 + self.max_index = nfft // 2 + 1 + + self.split500 = self.hertz_to_index(500) + self.split1k = self.hertz_to_index(1000) + self.split2k = self.hertz_to_index(2000) + self.split4k = self.hertz_to_index(4000) + self.split8k = self.hertz_to_index(8000) + self.split16k = self.hertz_to_index(16000) + self.split20k = self.hertz_to_index(20000) + + self.above20k = [(self.split20k, self.max_index)] + self.above16k = [(self.split16k, self.split20k)] + self.above20k + + def index_to_hertz(self, index: int): + return index * self.fs / self.nfft + + def hertz_to_index(self, hz: float, round: bool = True): + index = hz * self.nfft / self.fs + + if round: + index = int(np.round(index)) + + return index + + def get_band_specs_with_bandwidth(self, start_index, end_index, bandwidth_hz): + band_specs = [] + lower = start_index + + while lower < end_index: + upper = int(np.floor(lower + self.hertz_to_index(bandwidth_hz))) + upper = min(upper, end_index) + + band_specs.append((lower, upper)) + lower = upper + + return band_specs + + @abstractmethod + def get_band_specs(self): + raise NotImplementedError + + +class VocalBandsplitSpecification(BandsplitSpecification): + def __init__(self, nfft: int, fs: int, version: str = "7") -> None: + super().__init__(nfft=nfft, fs=fs) + + self.version = version + + def get_band_specs(self): + return getattr(self, f"version{self.version}")() + + @property + def version1(self): + return self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.max_index, bandwidth_hz=1000 + ) + + def version2(self): + below16k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split16k, bandwidth_hz=1000 + ) + below20k = self.get_band_specs_with_bandwidth( + start_index=self.split16k, end_index=self.split20k, bandwidth_hz=2000 + ) + + return below16k + below20k + self.above20k + + def version3(self): + below8k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split8k, bandwidth_hz=1000 + ) + 
below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + + return below8k + below16k + self.above16k + + def version4(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + + return below1k + below8k + below16k + self.above16k + + def version5(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split16k, bandwidth_hz=1000 + ) + below20k = self.get_band_specs_with_bandwidth( + start_index=self.split16k, end_index=self.split20k, bandwidth_hz=2000 + ) + return below1k + below16k + below20k + self.above20k + + def version6(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split4k, bandwidth_hz=500 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + return below1k + below4k + below8k + below16k + self.above16k + + def version7(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split4k, bandwidth_hz=250 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=500 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=1000 + ) + below20k = self.get_band_specs_with_bandwidth( + start_index=self.split16k, end_index=self.split20k, bandwidth_hz=2000 + ) + return below1k + below4k + below8k + below16k + below20k + self.above20k + + +class OtherBandsplitSpecification(VocalBandsplitSpecification): + def __init__(self, nfft: int, fs: int) -> None: + super().__init__(nfft=nfft, fs=fs, version="7") + + +class BassBandsplitSpecification(BandsplitSpecification): + def __init__(self, nfft: int, fs: int, version: str = "7") -> None: + super().__init__(nfft=nfft, fs=fs) + + def get_band_specs(self): + below500 = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split500, bandwidth_hz=50 + ) + below1k = self.get_band_specs_with_bandwidth( + start_index=self.split500, end_index=self.split1k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split4k, bandwidth_hz=500 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=1000 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=2000 + ) + above16k = [(self.split16k, self.max_index)] + + return below500 + below1k + below4k + below8k + below16k + above16k + + +class DrumBandsplitSpecification(BandsplitSpecification): + def __init__(self, nfft: int, fs: int) -> None: + super().__init__(nfft=nfft, fs=fs) + + def 
get_band_specs(self): + below1k = self.get_band_specs_with_bandwidth( + start_index=0, end_index=self.split1k, bandwidth_hz=50 + ) + below2k = self.get_band_specs_with_bandwidth( + start_index=self.split1k, end_index=self.split2k, bandwidth_hz=100 + ) + below4k = self.get_band_specs_with_bandwidth( + start_index=self.split2k, end_index=self.split4k, bandwidth_hz=250 + ) + below8k = self.get_band_specs_with_bandwidth( + start_index=self.split4k, end_index=self.split8k, bandwidth_hz=500 + ) + below16k = self.get_band_specs_with_bandwidth( + start_index=self.split8k, end_index=self.split16k, bandwidth_hz=1000 + ) + above16k = [(self.split16k, self.max_index)] + + return below1k + below2k + below4k + below8k + below16k + above16k + + +class PerceptualBandsplitSpecification(BandsplitSpecification): + def __init__( + self, + nfft: int, + fs: int, + fbank_fn: Callable[[int, int, float, float, int], torch.Tensor], + n_bands: int, + f_min: float = 0.0, + f_max: float = None, + ) -> None: + super().__init__(nfft=nfft, fs=fs) + self.n_bands = n_bands + if f_max is None: + f_max = fs / 2 + + self.filterbank = fbank_fn(n_bands, fs, f_min, f_max, self.max_index) + + weight_per_bin = torch.sum(self.filterbank, dim=0, keepdim=True) # (1, n_freqs) + normalized_mel_fb = self.filterbank / weight_per_bin # (n_mels, n_freqs) + + freq_weights = [] + band_specs = [] + for i in range(self.n_bands): + active_bins = torch.nonzero(self.filterbank[i, :]).squeeze().tolist() + if isinstance(active_bins, int): + active_bins = (active_bins, active_bins) + if len(active_bins) == 0: + continue + start_index = active_bins[0] + end_index = active_bins[-1] + 1 + band_specs.append((start_index, end_index)) + freq_weights.append(normalized_mel_fb[i, start_index:end_index]) + + self.freq_weights = freq_weights + self.band_specs = band_specs + + def get_band_specs(self): + return self.band_specs + + def get_freq_weights(self): + return self.freq_weights + + def save_to_file(self, dir_path: str) -> None: + os.makedirs(dir_path, exist_ok=True) + + import pickle + + with open(os.path.join(dir_path, "mel_bandsplit_spec.pkl"), "wb") as f: + pickle.dump( + { + "band_specs": self.band_specs, + "freq_weights": self.freq_weights, + "filterbank": self.filterbank, + }, + f, + ) + + +def mel_filterbank(n_bands, fs, f_min, f_max, n_freqs): + fb = taF.melscale_fbanks( + n_mels=n_bands, + sample_rate=fs, + f_min=f_min, + f_max=f_max, + n_freqs=n_freqs, + ).T + + fb[0, 0] = 1.0 + + return fb + + +class MelBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=mel_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +def musical_filterbank(n_bands, fs, f_min, f_max, n_freqs, scale="constant"): + nfft = 2 * (n_freqs - 1) + df = fs / nfft + # init freqs + f_max = f_max or fs / 2 + f_min = f_min or 0 + f_min = fs / nfft + + n_octaves = np.log2(f_max / f_min) + n_octaves_per_band = n_octaves / n_bands + bandwidth_mult = np.power(2.0, n_octaves_per_band) + + low_midi = max(0, hz_to_midi(f_min)) + high_midi = hz_to_midi(f_max) + midi_points = np.linspace(low_midi, high_midi, n_bands) + hz_pts = midi_to_hz(midi_points) + + low_pts = hz_pts / bandwidth_mult + high_pts = hz_pts * bandwidth_mult + + low_bins = np.floor(low_pts / df).astype(int) + high_bins = np.ceil(high_pts / df).astype(int) + + fb = np.zeros((n_bands, n_freqs)) + + for i in range(n_bands): + fb[i, 
low_bins[i] : high_bins[i] + 1] = 1.0 + + fb[0, : low_bins[0]] = 1.0 + fb[-1, high_bins[-1] + 1 :] = 1.0 + + return torch.as_tensor(fb) + + +class MusicalBandsplitSpecification(PerceptualBandsplitSpecification): + def __init__( + self, nfft: int, fs: int, n_bands: int, f_min: float = 0.0, f_max: float = None + ) -> None: + super().__init__( + fbank_fn=musical_filterbank, + nfft=nfft, + fs=fs, + n_bands=n_bands, + f_min=f_min, + f_max=f_max, + ) + + +# def bark_filterbank( +# n_bands, fs, f_min, f_max, n_freqs +# ): +# nfft = 2 * (n_freqs -1) +# fb, _ = bark_fbanks.bark_filter_banks( +# nfilts=n_bands, +# nfft=nfft, +# fs=fs, +# low_freq=f_min, +# high_freq=f_max, +# scale="constant" +# ) + +# return torch.as_tensor(fb) + +# class BarkBandsplitSpecification(PerceptualBandsplitSpecification): +# def __init__( +# self, +# nfft: int, +# fs: int, +# n_bands: int, +# f_min: float = 0.0, +# f_max: float = None +# ) -> None: +# super().__init__(fbank_fn=bark_filterbank, nfft=nfft, fs=fs, n_bands=n_bands, f_min=f_min, f_max=f_max) + + +# def triangular_bark_filterbank( +# n_bands, fs, f_min, f_max, n_freqs +# ): + +# all_freqs = torch.linspace(0, fs // 2, n_freqs) + +# # calculate mel freq bins +# m_min = hz2bark(f_min) +# m_max = hz2bark(f_max) + +# m_pts = torch.linspace(m_min, m_max, n_bands + 2) +# f_pts = 600 * torch.sinh(m_pts / 6) + +# # create filterbank +# fb = _create_triangular_filterbank(all_freqs, f_pts) + +# fb = fb.T + +# first_active_band = torch.nonzero(torch.sum(fb, dim=-1))[0, 0] +# first_active_bin = torch.nonzero(fb[first_active_band, :])[0, 0] + +# fb[first_active_band, :first_active_bin] = 1.0 + +# return fb + +# class TriangularBarkBandsplitSpecification(PerceptualBandsplitSpecification): +# def __init__( +# self, +# nfft: int, +# fs: int, +# n_bands: int, +# f_min: float = 0.0, +# f_max: float = None +# ) -> None: +# super().__init__(fbank_fn=triangular_bark_filterbank, nfft=nfft, fs=fs, n_bands=n_bands, f_min=f_min, f_max=f_max) + + +# def minibark_filterbank( +# n_bands, fs, f_min, f_max, n_freqs +# ): +# fb = bark_filterbank( +# n_bands, +# fs, +# f_min, +# f_max, +# n_freqs +# ) + +# fb[fb < np.sqrt(0.5)] = 0.0 + +# return fb + +# class MiniBarkBandsplitSpecification(PerceptualBandsplitSpecification): +# def __init__( +# self, +# nfft: int, +# fs: int, +# n_bands: int, +# f_min: float = 0.0, +# f_max: float = None +# ) -> None: +# super().__init__(fbank_fn=minibark_filterbank, nfft=nfft, fs=fs, n_bands=n_bands, f_min=f_min, f_max=f_max) + + +# def erb_filterbank( +# n_bands: int, +# fs: int, +# f_min: float, +# f_max: float, +# n_freqs: int, +# ) -> Tensor: +# # freq bins +# A = (1000 * np.log(10)) / (24.7 * 4.37) +# all_freqs = torch.linspace(0, fs // 2, n_freqs) + +# # calculate mel freq bins +# m_min = hz2erb(f_min) +# m_max = hz2erb(f_max) + +# m_pts = torch.linspace(m_min, m_max, n_bands + 2) +# f_pts = (torch.pow(10, (m_pts / A)) - 1)/ 0.00437 + +# # create filterbank +# fb = _create_triangular_filterbank(all_freqs, f_pts) + +# fb = fb.T + + +# first_active_band = torch.nonzero(torch.sum(fb, dim=-1))[0, 0] +# first_active_bin = torch.nonzero(fb[first_active_band, :])[0, 0] + +# fb[first_active_band, :first_active_bin] = 1.0 + +# return fb + + +# class EquivalentRectangularBandsplitSpecification(PerceptualBandsplitSpecification): +# def __init__( +# self, +# nfft: int, +# fs: int, +# n_bands: int, +# f_min: float = 0.0, +# f_max: float = None +# ) -> None: +# super().__init__(fbank_fn=erb_filterbank, nfft=nfft, fs=fs, n_bands=n_bands, f_min=f_min, f_max=f_max) + 
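+
+# Minimal usage sketch (illustrative only, not part of the training code): build the
+# musical band split used by Bandit above and inspect the resulting band indices and
+# per-band frequency weights. Assumes the default 2048-point FFT at 44.1 kHz.
+#
+# spec = MusicalBandsplitSpecification(nfft=2048, fs=44100, n_bands=64)
+# band_specs = spec.get_band_specs()      # list of (fstart, fend) bin indices, fend exclusive
+# freq_weights = spec.get_freq_weights()  # one weight tensor per band, len == fend - fstart
+# print(len(band_specs), band_specs[:3])
+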
+if __name__ == "__main__": + import pandas as pd + + band_defs = [] + + for bands in [VocalBandsplitSpecification]: + band_name = bands.__name__.replace("BandsplitSpecification", "") + + mbs = bands(nfft=2048, fs=44100).get_band_specs() + + for i, (f_min, f_max) in enumerate(mbs): + band_defs.append( + {"band": band_name, "band_index": i, "f_min": f_min, "f_max": f_max} + ) + + df = pd.DataFrame(band_defs) + df.to_csv("vox7bands.csv", index=False) diff --git a/programs/music_separation_code/models/bs_roformer/__init__.py b/programs/music_separation_code/models/bs_roformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..980e0afa5b7b4fd66168bce6905a94e7c91c380e --- /dev/null +++ b/programs/music_separation_code/models/bs_roformer/__init__.py @@ -0,0 +1,2 @@ +from models.bs_roformer.bs_roformer import BSRoformer +from models.bs_roformer.mel_band_roformer import MelBandRoformer diff --git a/programs/music_separation_code/models/bs_roformer/attend.py b/programs/music_separation_code/models/bs_roformer/attend.py new file mode 100644 index 0000000000000000000000000000000000000000..9ebb7c937268ff4568ef6dbdcbc90abbfc8f0887 --- /dev/null +++ b/programs/music_separation_code/models/bs_roformer/attend.py @@ -0,0 +1,144 @@ +from functools import wraps +from packaging import version +from collections import namedtuple + +import os +import torch +from torch import nn, einsum +import torch.nn.functional as F + +from einops import rearrange, reduce + +# constants + +FlashAttentionConfig = namedtuple( + "FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"] +) + +# helpers + + +def exists(val): + return val is not None + + +def default(v, d): + return v if exists(v) else d + + +def once(fn): + called = False + + @wraps(fn) + def inner(x): + nonlocal called + if called: + return + called = True + return fn(x) + + return inner + + +print_once = once(print) + +# main class + + +class Attend(nn.Module): + def __init__(self, dropout=0.0, flash=False, scale=None): + super().__init__() + self.scale = scale + self.dropout = dropout + self.attn_dropout = nn.Dropout(dropout) + + self.flash = flash + assert not ( + flash and version.parse(torch.__version__) < version.parse("2.0.0") + ), "in order to use flash attention, you must be using pytorch 2.0 or above" + + # determine efficient attention configs for cuda and cpu + + self.cpu_config = FlashAttentionConfig(True, True, True) + self.cuda_config = None + + if not torch.cuda.is_available() or not flash: + return + + device_properties = torch.cuda.get_device_properties(torch.device("cuda")) + device_version = version.parse( + f"{device_properties.major}.{device_properties.minor}" + ) + + if device_version >= version.parse("8.0"): + if os.name == "nt": + print_once( + "Windows OS detected, using math or mem efficient attention if input tensor is on cuda" + ) + self.cuda_config = FlashAttentionConfig(False, True, True) + else: + print_once( + "GPU Compute Capability equal or above 8.0, using flash attention if input tensor is on cuda" + ) + self.cuda_config = FlashAttentionConfig(True, False, False) + else: + print_once( + "GPU Compute Capability below 8.0, using math or mem efficient attention if input tensor is on cuda" + ) + self.cuda_config = FlashAttentionConfig(False, True, True) + + def flash_attn(self, q, k, v): + _, heads, q_len, _, k_len, is_cuda, device = ( + *q.shape, + k.shape[-2], + q.is_cuda, + q.device, + ) + + if exists(self.scale): + default_scale = q.shape[-1] ** -0.5 + q = q * (self.scale / 
default_scale) + + # Check if there is a compatible device for flash attention + + config = self.cuda_config if is_cuda else self.cpu_config + + # pytorch 2.0 flash attn: q, k, v, mask, dropout, softmax_scale + + with torch.backends.cuda.sdp_kernel(**config._asdict()): + out = F.scaled_dot_product_attention( + q, k, v, dropout_p=self.dropout if self.training else 0.0 + ) + + return out + + def forward(self, q, k, v): + """ + einstein notation + b - batch + h - heads + n, i, j - sequence length (base sequence length, source, target) + d - feature dimension + """ + + q_len, k_len, device = q.shape[-2], k.shape[-2], q.device + + scale = default(self.scale, q.shape[-1] ** -0.5) + + if self.flash: + return self.flash_attn(q, k, v) + + # similarity + + sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale + + # attention + + attn = sim.softmax(dim=-1) + attn = self.attn_dropout(attn) + + # aggregate values + + out = einsum(f"b h i j, b h j d -> b h i d", attn, v) + + return out diff --git a/programs/music_separation_code/models/bs_roformer/bs_roformer.py b/programs/music_separation_code/models/bs_roformer/bs_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..3ed154450d8b7366635159027c02f81f2d3fef44 --- /dev/null +++ b/programs/music_separation_code/models/bs_roformer/bs_roformer.py @@ -0,0 +1,669 @@ +from functools import partial + +import torch +from torch import nn, einsum, Tensor +from torch.nn import Module, ModuleList +import torch.nn.functional as F + +from models.bs_roformer.attend import Attend + +from beartype.typing import Tuple, Optional, List, Callable +from beartype import beartype + +from rotary_embedding_torch import RotaryEmbedding + +from einops import rearrange, pack, unpack +from einops.layers.torch import Rearrange + +# helper functions + + +def exists(val): + return val is not None + + +def default(v, d): + return v if exists(v) else d + + +def pack_one(t, pattern): + return pack([t], pattern) + + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + + +# norm + + +def l2norm(t): + return F.normalize(t, dim=-1, p=2) + + +class RMSNorm(Module): + def __init__(self, dim): + super().__init__() + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + return F.normalize(x, dim=-1) * self.scale * self.gamma + + +# attention + + +class FeedForward(Module): + def __init__(self, dim, mult=4, dropout=0.0): + super().__init__() + dim_inner = int(dim * mult) + self.net = nn.Sequential( + RMSNorm(dim), + nn.Linear(dim, dim_inner), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(dim_inner, dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + return self.net(x) + + +class Attention(Module): + def __init__( + self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True + ): + super().__init__() + self.heads = heads + self.scale = dim_head**-0.5 + dim_inner = heads * dim_head + + self.rotary_embed = rotary_embed + + self.attend = Attend(flash=flash, dropout=dropout) + + self.norm = RMSNorm(dim) + self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False) + + self.to_gates = nn.Linear(dim, heads) + + self.to_out = nn.Sequential( + nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout) + ) + + def forward(self, x): + x = self.norm(x) + + q, k, v = rearrange( + self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads + ) + + if exists(self.rotary_embed): + q = self.rotary_embed.rotate_queries_or_keys(q) + k = self.rotary_embed.rotate_queries_or_keys(k) + + out = 
self.attend(q, k, v) + + gates = self.to_gates(x) + out = out * rearrange(gates, "b n h -> b h n 1").sigmoid() + + out = rearrange(out, "b h n d -> b n (h d)") + return self.to_out(out) + + +class LinearAttention(Module): + """ + this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al. + """ + + @beartype + def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0): + super().__init__() + dim_inner = dim_head * heads + self.norm = RMSNorm(dim) + + self.to_qkv = nn.Sequential( + nn.Linear(dim, dim_inner * 3, bias=False), + Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads), + ) + + self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) + + self.attend = Attend(scale=scale, dropout=dropout, flash=flash) + + self.to_out = nn.Sequential( + Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False) + ) + + def forward(self, x): + x = self.norm(x) + + q, k, v = self.to_qkv(x) + + q, k = map(l2norm, (q, k)) + q = q * self.temperature.exp() + + out = self.attend(q, k, v) + + return self.to_out(out) + + +class Transformer(Module): + def __init__( + self, + *, + dim, + depth, + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + ff_mult=4, + norm_output=True, + rotary_embed=None, + flash_attn=True, + linear_attn=False, + ): + super().__init__() + self.layers = ModuleList([]) + + for _ in range(depth): + if linear_attn: + attn = LinearAttention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + flash=flash_attn, + ) + else: + attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + rotary_embed=rotary_embed, + flash=flash_attn, + ) + + self.layers.append( + ModuleList( + [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)] + ) + ) + + self.norm = RMSNorm(dim) if norm_output else nn.Identity() + + def forward(self, x): + + for attn, ff in self.layers: + x = attn(x) + x + x = ff(x) + x + + return self.norm(x) + + +# bandsplit module + + +class BandSplit(Module): + @beartype + def __init__(self, dim, dim_inputs: Tuple[int, ...]): + super().__init__() + self.dim_inputs = dim_inputs + self.to_features = ModuleList([]) + + for dim_in in dim_inputs: + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) + + self.to_features.append(net) + + def forward(self, x): + x = x.split(self.dim_inputs, dim=-1) + + outs = [] + for split_input, to_feature in zip(x, self.to_features): + split_output = to_feature(split_input) + outs.append(split_output) + + return torch.stack(outs, dim=-2) + + +def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh): + dim_hidden = default(dim_hidden, dim_in) + + net = [] + dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out) + + for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])): + is_last = ind == (len(dims) - 2) + + net.append(nn.Linear(layer_dim_in, layer_dim_out)) + + if is_last: + continue + + net.append(activation()) + + return nn.Sequential(*net) + + +class MaskEstimator(Module): + @beartype + def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4): + super().__init__() + self.dim_inputs = dim_inputs + self.to_freqs = ModuleList([]) + dim_hidden = dim * mlp_expansion_factor + + for dim_in in dim_inputs: + net = [] + + mlp = nn.Sequential( + MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1) + ) + + self.to_freqs.append(mlp) + + def forward(self, x): + x = x.unbind(dim=-2) + + outs = [] + + for band_features, mlp in 
zip(x, self.to_freqs): + freq_out = mlp(band_features) + outs.append(freq_out) + + return torch.cat(outs, dim=-1) + + +# main class + +DEFAULT_FREQS_PER_BANDS = ( + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 2, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 4, + 12, + 12, + 12, + 12, + 12, + 12, + 12, + 12, + 24, + 24, + 24, + 24, + 24, + 24, + 24, + 24, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 128, + 129, +) + + +class BSRoformer(Module): + + @beartype + def __init__( + self, + dim, + *, + depth, + stereo=False, + num_stems=1, + time_transformer_depth=2, + freq_transformer_depth=2, + linear_transformer_depth=0, + freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS, + # in the paper, they divide into ~60 bands, test with 1 for starters + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + flash_attn=True, + dim_freqs_in=1025, + stft_n_fft=2048, + stft_hop_length=512, + # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction + stft_win_length=2048, + stft_normalized=False, + stft_window_fn: Optional[Callable] = None, + mask_estimator_depth=2, + multi_stft_resolution_loss_weight=1.0, + multi_stft_resolutions_window_sizes: Tuple[int, ...] = ( + 4096, + 2048, + 1024, + 512, + 256, + ), + multi_stft_hop_size=147, + multi_stft_normalized=False, + multi_stft_window_fn: Callable = torch.hann_window, + ): + super().__init__() + + self.stereo = stereo + self.audio_channels = 2 if stereo else 1 + self.num_stems = num_stems + + self.layers = ModuleList([]) + + transformer_kwargs = dict( + dim=dim, + heads=heads, + dim_head=dim_head, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, + flash_attn=flash_attn, + norm_output=False, + ) + + time_rotary_embed = RotaryEmbedding(dim=dim_head) + freq_rotary_embed = RotaryEmbedding(dim=dim_head) + + for _ in range(depth): + tran_modules = [] + if linear_transformer_depth > 0: + tran_modules.append( + Transformer( + depth=linear_transformer_depth, + linear_attn=True, + **transformer_kwargs, + ) + ) + tran_modules.append( + Transformer( + depth=time_transformer_depth, + rotary_embed=time_rotary_embed, + **transformer_kwargs, + ) + ) + tran_modules.append( + Transformer( + depth=freq_transformer_depth, + rotary_embed=freq_rotary_embed, + **transformer_kwargs, + ) + ) + self.layers.append(nn.ModuleList(tran_modules)) + + self.final_norm = RMSNorm(dim) + + self.stft_kwargs = dict( + n_fft=stft_n_fft, + hop_length=stft_hop_length, + win_length=stft_win_length, + normalized=stft_normalized, + ) + + self.stft_window_fn = partial( + default(stft_window_fn, torch.hann_window), stft_win_length + ) + + freqs = torch.stft( + torch.randn(1, 4096), + **self.stft_kwargs, + window=torch.ones(stft_n_fft), + return_complex=True, + ).shape[1] + + assert len(freqs_per_bands) > 1 + assert ( + sum(freqs_per_bands) == freqs + ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}" + + freqs_per_bands_with_complex = tuple( + 2 * f * self.audio_channels for f in freqs_per_bands + ) + + self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex) + + self.mask_estimators = nn.ModuleList([]) + + for _ in range(num_stems): + mask_estimator = MaskEstimator( + dim=dim, + dim_inputs=freqs_per_bands_with_complex, + depth=mask_estimator_depth, + ) + + self.mask_estimators.append(mask_estimator) + + # for the multi-resolution stft loss + + 
self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight + self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes + self.multi_stft_n_fft = stft_n_fft + self.multi_stft_window_fn = multi_stft_window_fn + + self.multi_stft_kwargs = dict( + hop_length=multi_stft_hop_size, normalized=multi_stft_normalized + ) + + def forward(self, raw_audio, target=None, return_loss_breakdown=False): + """ + einops + + b - batch + f - freq + t - time + s - audio channel (1 for mono, 2 for stereo) + n - number of 'stems' + c - complex (2) + d - feature dimension + """ + + device = raw_audio.device + + # defining whether model is loaded on MPS (MacOS GPU accelerator) + x_is_mps = True if device.type == "mps" else False + + if raw_audio.ndim == 2: + raw_audio = rearrange(raw_audio, "b t -> b 1 t") + + channels = raw_audio.shape[1] + assert (not self.stereo and channels == 1) or ( + self.stereo and channels == 2 + ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)" + + # to stft + + raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t") + + stft_window = self.stft_window_fn(device=device) + + # RuntimeError: FFT operations are only supported on MacOS 14+ + # Since it's tedious to define whether we're on correct MacOS version - simple try-catch is used + try: + stft_repr = torch.stft( + raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True + ) + except: + stft_repr = torch.stft( + raw_audio.cpu() if x_is_mps else raw_audio, + **self.stft_kwargs, + window=stft_window.cpu() if x_is_mps else stft_window, + return_complex=True, + ).to(device) + + stft_repr = torch.view_as_real(stft_repr) + + stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c") + stft_repr = rearrange( + stft_repr, "b s f t c -> b (f s) t c" + ) # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting + + x = rearrange(stft_repr, "b f t c -> b t (f c)") + + x = self.band_split(x) + + # axial / hierarchical attention + + for transformer_block in self.layers: + + if len(transformer_block) == 3: + linear_transformer, time_transformer, freq_transformer = ( + transformer_block + ) + + x, ft_ps = pack([x], "b * d") + x = linear_transformer(x) + (x,) = unpack(x, ft_ps, "b * d") + else: + time_transformer, freq_transformer = transformer_block + + x = rearrange(x, "b t f d -> b f t d") + x, ps = pack([x], "* t d") + + x = time_transformer(x) + + (x,) = unpack(x, ps, "* t d") + x = rearrange(x, "b f t d -> b t f d") + x, ps = pack([x], "* f d") + + x = freq_transformer(x) + + (x,) = unpack(x, ps, "* f d") + + x = self.final_norm(x) + + num_stems = len(self.mask_estimators) + + mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) + mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2) + + # modulate frequency representation + + stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c") + + # complex number multiplication + + stft_repr = torch.view_as_complex(stft_repr) + mask = torch.view_as_complex(mask) + + stft_repr = stft_repr * mask + + # istft + + stft_repr = rearrange( + stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels + ) + + # same as torch.stft() fix for MacOS MPS above + try: + recon_audio = torch.istft( + stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False + ) + except: + recon_audio = torch.istft( + stft_repr.cpu() if x_is_mps else stft_repr, + 
**self.stft_kwargs, + window=stft_window.cpu() if x_is_mps else stft_window, + return_complex=False, + ).to(device) + + recon_audio = rearrange( + recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems + ) + + if num_stems == 1: + recon_audio = rearrange(recon_audio, "b 1 s t -> b s t") + + # if a target is passed in, calculate loss for learning + + if not exists(target): + return recon_audio + + if self.num_stems > 1: + assert target.ndim == 4 and target.shape[1] == self.num_stems + + if target.ndim == 2: + target = rearrange(target, "... t -> ... 1 t") + + target = target[ + ..., : recon_audio.shape[-1] + ] # protect against lost length on istft + + loss = F.l1_loss(recon_audio, target) + + multi_stft_resolution_loss = 0.0 + + for window_size in self.multi_stft_resolutions_window_sizes: + res_stft_kwargs = dict( + n_fft=max( + window_size, self.multi_stft_n_fft + ), # not sure what n_fft is across multi resolution stft + win_length=window_size, + return_complex=True, + window=self.multi_stft_window_fn(window_size, device=device), + **self.multi_stft_kwargs, + ) + + recon_Y = torch.stft( + rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs + ) + target_Y = torch.stft( + rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs + ) + + multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss( + recon_Y, target_Y + ) + + weighted_multi_resolution_loss = ( + multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight + ) + + total_loss = loss + weighted_multi_resolution_loss + + if not return_loss_breakdown: + return total_loss + + return total_loss, (loss, multi_stft_resolution_loss) diff --git a/programs/music_separation_code/models/bs_roformer/mel_band_roformer.py b/programs/music_separation_code/models/bs_roformer/mel_band_roformer.py new file mode 100644 index 0000000000000000000000000000000000000000..105ced1548fda83ab939de5699d7d79ee7c762fe --- /dev/null +++ b/programs/music_separation_code/models/bs_roformer/mel_band_roformer.py @@ -0,0 +1,671 @@ +from functools import partial + +import torch +from torch import nn, einsum, Tensor +from torch.nn import Module, ModuleList +import torch.nn.functional as F + +from models.bs_roformer.attend import Attend + +from beartype.typing import Tuple, Optional, List, Callable +from beartype import beartype + +from rotary_embedding_torch import RotaryEmbedding + +from einops import rearrange, pack, unpack, reduce, repeat +from einops.layers.torch import Rearrange + +from librosa import filters + + +# helper functions + + +def exists(val): + return val is not None + + +def default(v, d): + return v if exists(v) else d + + +def pack_one(t, pattern): + return pack([t], pattern) + + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + + +def pad_at_dim(t, pad, dim=-1, value=0.0): + dims_from_right = (-dim - 1) if dim < 0 else (t.ndim - dim - 1) + zeros = (0, 0) * dims_from_right + return F.pad(t, (*zeros, *pad), value=value) + + +def l2norm(t): + return F.normalize(t, dim=-1, p=2) + + +# norm + + +class RMSNorm(Module): + def __init__(self, dim): + super().__init__() + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + return F.normalize(x, dim=-1) * self.scale * self.gamma + + +# attention + + +class FeedForward(Module): + def __init__(self, dim, mult=4, dropout=0.0): + super().__init__() + dim_inner = int(dim * mult) + self.net = nn.Sequential( + RMSNorm(dim), + nn.Linear(dim, dim_inner), + nn.GELU(), + nn.Dropout(dropout), + 
nn.Linear(dim_inner, dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + return self.net(x) + + +class Attention(Module): + def __init__( + self, dim, heads=8, dim_head=64, dropout=0.0, rotary_embed=None, flash=True + ): + super().__init__() + self.heads = heads + self.scale = dim_head**-0.5 + dim_inner = heads * dim_head + + self.rotary_embed = rotary_embed + + self.attend = Attend(flash=flash, dropout=dropout) + + self.norm = RMSNorm(dim) + self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False) + + self.to_gates = nn.Linear(dim, heads) + + self.to_out = nn.Sequential( + nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout) + ) + + def forward(self, x): + x = self.norm(x) + + q, k, v = rearrange( + self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads + ) + + if exists(self.rotary_embed): + q = self.rotary_embed.rotate_queries_or_keys(q) + k = self.rotary_embed.rotate_queries_or_keys(k) + + out = self.attend(q, k, v) + + gates = self.to_gates(x) + out = out * rearrange(gates, "b n h -> b h n 1").sigmoid() + + out = rearrange(out, "b h n d -> b n (h d)") + return self.to_out(out) + + +class LinearAttention(Module): + """ + this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al. + """ + + @beartype + def __init__(self, *, dim, dim_head=32, heads=8, scale=8, flash=False, dropout=0.0): + super().__init__() + dim_inner = dim_head * heads + self.norm = RMSNorm(dim) + + self.to_qkv = nn.Sequential( + nn.Linear(dim, dim_inner * 3, bias=False), + Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads), + ) + + self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) + + self.attend = Attend(scale=scale, dropout=dropout, flash=flash) + + self.to_out = nn.Sequential( + Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False) + ) + + def forward(self, x): + x = self.norm(x) + + q, k, v = self.to_qkv(x) + + q, k = map(l2norm, (q, k)) + q = q * self.temperature.exp() + + out = self.attend(q, k, v) + + return self.to_out(out) + + +class Transformer(Module): + def __init__( + self, + *, + dim, + depth, + dim_head=64, + heads=8, + attn_dropout=0.0, + ff_dropout=0.0, + ff_mult=4, + norm_output=True, + rotary_embed=None, + flash_attn=True, + linear_attn=False, + ): + super().__init__() + self.layers = ModuleList([]) + + for _ in range(depth): + if linear_attn: + attn = LinearAttention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + flash=flash_attn, + ) + else: + attn = Attention( + dim=dim, + dim_head=dim_head, + heads=heads, + dropout=attn_dropout, + rotary_embed=rotary_embed, + flash=flash_attn, + ) + + self.layers.append( + ModuleList( + [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)] + ) + ) + + self.norm = RMSNorm(dim) if norm_output else nn.Identity() + + def forward(self, x): + + for attn, ff in self.layers: + x = attn(x) + x + x = ff(x) + x + + return self.norm(x) + + +# bandsplit module + + +class BandSplit(Module): + @beartype + def __init__(self, dim, dim_inputs: Tuple[int, ...]): + super().__init__() + self.dim_inputs = dim_inputs + self.to_features = ModuleList([]) + + for dim_in in dim_inputs: + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) + + self.to_features.append(net) + + def forward(self, x): + x = x.split(self.dim_inputs, dim=-1) + + outs = [] + for split_input, to_feature in zip(x, self.to_features): + split_output = to_feature(split_input) + outs.append(split_output) + + return torch.stack(outs, dim=-2) + + +def MLP(dim_in, dim_out, 
dim_hidden=None, depth=1, activation=nn.Tanh): + dim_hidden = default(dim_hidden, dim_in) + + net = [] + dims = (dim_in, *((dim_hidden,) * depth), dim_out) + + for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])): + is_last = ind == (len(dims) - 2) + + net.append(nn.Linear(layer_dim_in, layer_dim_out)) + + if is_last: + continue + + net.append(activation()) + + return nn.Sequential(*net) + + +class MaskEstimator(Module): + @beartype + def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4): + super().__init__() + self.dim_inputs = dim_inputs + self.to_freqs = ModuleList([]) + dim_hidden = dim * mlp_expansion_factor + + for dim_in in dim_inputs: + net = [] + + mlp = nn.Sequential( + MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1) + ) + + self.to_freqs.append(mlp) + + def forward(self, x): + x = x.unbind(dim=-2) + + outs = [] + + for band_features, mlp in zip(x, self.to_freqs): + freq_out = mlp(band_features) + outs.append(freq_out) + + return torch.cat(outs, dim=-1) + + +# main class + + +class MelBandRoformer(Module): + + @beartype + def __init__( + self, + dim, + *, + depth, + stereo=False, + num_stems=1, + time_transformer_depth=2, + freq_transformer_depth=2, + linear_transformer_depth=0, + num_bands=60, + dim_head=64, + heads=8, + attn_dropout=0.1, + ff_dropout=0.1, + flash_attn=True, + dim_freqs_in=1025, + sample_rate=44100, # needed for mel filter bank from librosa + stft_n_fft=2048, + stft_hop_length=512, + # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction + stft_win_length=2048, + stft_normalized=False, + stft_window_fn: Optional[Callable] = None, + mask_estimator_depth=1, + multi_stft_resolution_loss_weight=1.0, + multi_stft_resolutions_window_sizes: Tuple[int, ...] 
= ( + 4096, + 2048, + 1024, + 512, + 256, + ), + multi_stft_hop_size=147, + multi_stft_normalized=False, + multi_stft_window_fn: Callable = torch.hann_window, + match_input_audio_length=False, # if True, pad output tensor to match length of input tensor + ): + super().__init__() + + self.stereo = stereo + self.audio_channels = 2 if stereo else 1 + self.num_stems = num_stems + + self.layers = ModuleList([]) + + transformer_kwargs = dict( + dim=dim, + heads=heads, + dim_head=dim_head, + attn_dropout=attn_dropout, + ff_dropout=ff_dropout, + flash_attn=flash_attn, + ) + + time_rotary_embed = RotaryEmbedding(dim=dim_head) + freq_rotary_embed = RotaryEmbedding(dim=dim_head) + + for _ in range(depth): + tran_modules = [] + if linear_transformer_depth > 0: + tran_modules.append( + Transformer( + depth=linear_transformer_depth, + linear_attn=True, + **transformer_kwargs, + ) + ) + tran_modules.append( + Transformer( + depth=time_transformer_depth, + rotary_embed=time_rotary_embed, + **transformer_kwargs, + ) + ) + tran_modules.append( + Transformer( + depth=freq_transformer_depth, + rotary_embed=freq_rotary_embed, + **transformer_kwargs, + ) + ) + self.layers.append(nn.ModuleList(tran_modules)) + + self.stft_window_fn = partial( + default(stft_window_fn, torch.hann_window), stft_win_length + ) + + self.stft_kwargs = dict( + n_fft=stft_n_fft, + hop_length=stft_hop_length, + win_length=stft_win_length, + normalized=stft_normalized, + ) + + freqs = torch.stft( + torch.randn(1, 4096), + **self.stft_kwargs, + window=torch.ones(stft_n_fft), + return_complex=True, + ).shape[1] + + # create mel filter bank + # with librosa.filters.mel as in section 2 of paper + + mel_filter_bank_numpy = filters.mel( + sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands + ) + + mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy) + + # for some reason, it doesn't include the first freq? 
just force a value for now + + mel_filter_bank[0][0] = 1.0 + + # In some systems/envs we get 0.0 instead of ~1.9e-18 in the last position, + # so let's force a positive value + + mel_filter_bank[-1, -1] = 1.0 + + # binary as in paper (then estimated masks are averaged for overlapping regions) + + freqs_per_band = mel_filter_bank > 0 + assert freqs_per_band.any( + dim=0 + ).all(), "all frequencies need to be covered by all bands for now" + + repeated_freq_indices = repeat(torch.arange(freqs), "f -> b f", b=num_bands) + freq_indices = repeated_freq_indices[freqs_per_band] + + if stereo: + freq_indices = repeat(freq_indices, "f -> f s", s=2) + freq_indices = freq_indices * 2 + torch.arange(2) + freq_indices = rearrange(freq_indices, "f s -> (f s)") + + self.register_buffer("freq_indices", freq_indices, persistent=False) + self.register_buffer("freqs_per_band", freqs_per_band, persistent=False) + + num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum") + num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum") + + self.register_buffer("num_freqs_per_band", num_freqs_per_band, persistent=False) + self.register_buffer("num_bands_per_freq", num_bands_per_freq, persistent=False) + + # band split and mask estimator + + freqs_per_bands_with_complex = tuple( + 2 * f * self.audio_channels for f in num_freqs_per_band.tolist() + ) + + self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex) + + self.mask_estimators = nn.ModuleList([]) + + for _ in range(num_stems): + mask_estimator = MaskEstimator( + dim=dim, + dim_inputs=freqs_per_bands_with_complex, + depth=mask_estimator_depth, + ) + + self.mask_estimators.append(mask_estimator) + + # for the multi-resolution stft loss + + self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight + self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes + self.multi_stft_n_fft = stft_n_fft + self.multi_stft_window_fn = multi_stft_window_fn + + self.multi_stft_kwargs = dict( + hop_length=multi_stft_hop_size, normalized=multi_stft_normalized + ) + + self.match_input_audio_length = match_input_audio_length + + def forward(self, raw_audio, target=None, return_loss_breakdown=False): + """ + einops + + b - batch + f - freq + t - time + s - audio channel (1 for mono, 2 for stereo) + n - number of 'stems' + c - complex (2) + d - feature dimension + """ + + device = raw_audio.device + + if raw_audio.ndim == 2: + raw_audio = rearrange(raw_audio, "b t -> b 1 t") + + batch, channels, raw_audio_length = raw_audio.shape + + istft_length = raw_audio_length if self.match_input_audio_length else None + + assert (not self.stereo and channels == 1) or ( + self.stereo and channels == 2 + ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). 
also need to be False if mono (channel dimension of 1)" + + # to stft + + raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t") + + stft_window = self.stft_window_fn(device=device) + + stft_repr = torch.stft( + raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True + ) + stft_repr = torch.view_as_real(stft_repr) + + stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c") + stft_repr = rearrange( + stft_repr, "b s f t c -> b (f s) t c" + ) # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting + + # index out all frequencies for all frequency ranges across bands ascending in one go + + batch_arange = torch.arange(batch, device=device)[..., None] + + # account for stereo + + x = stft_repr[batch_arange, self.freq_indices] + + # fold the complex (real and imag) into the frequencies dimension + + x = rearrange(x, "b f t c -> b t (f c)") + + x = self.band_split(x) + + # axial / hierarchical attention + + for transformer_block in self.layers: + + if len(transformer_block) == 3: + linear_transformer, time_transformer, freq_transformer = ( + transformer_block + ) + + x, ft_ps = pack([x], "b * d") + x = linear_transformer(x) + (x,) = unpack(x, ft_ps, "b * d") + else: + time_transformer, freq_transformer = transformer_block + + x = rearrange(x, "b t f d -> b f t d") + x, ps = pack([x], "* t d") + + x = time_transformer(x) + + (x,) = unpack(x, ps, "* t d") + x = rearrange(x, "b f t d -> b t f d") + x, ps = pack([x], "* f d") + + x = freq_transformer(x) + + (x,) = unpack(x, ps, "* f d") + + num_stems = len(self.mask_estimators) + + masks = torch.stack([fn(x) for fn in self.mask_estimators], dim=1) + masks = rearrange(masks, "b n t (f c) -> b n f t c", c=2) + + # modulate frequency representation + + stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c") + + # complex number multiplication + + stft_repr = torch.view_as_complex(stft_repr) + masks = torch.view_as_complex(masks) + + masks = masks.type(stft_repr.dtype) + + # need to average the estimated mask for the overlapped frequencies + + scatter_indices = repeat( + self.freq_indices, + "f -> b n f t", + b=batch, + n=num_stems, + t=stft_repr.shape[-1], + ) + + stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=num_stems) + masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_( + 2, scatter_indices, masks + ) + + denom = repeat(self.num_bands_per_freq, "f -> (f r) 1", r=channels) + + masks_averaged = masks_summed / denom.clamp(min=1e-8) + + # modulate stft repr with estimated mask + + stft_repr = stft_repr * masks_averaged + + # istft + + stft_repr = rearrange( + stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels + ) + + recon_audio = torch.istft( + stft_repr, + **self.stft_kwargs, + window=stft_window, + return_complex=False, + length=istft_length, + ) + + recon_audio = rearrange( + recon_audio, + "(b n s) t -> b n s t", + b=batch, + s=self.audio_channels, + n=num_stems, + ) + + if num_stems == 1: + recon_audio = rearrange(recon_audio, "b 1 s t -> b s t") + + # if a target is passed in, calculate loss for learning + + if not exists(target): + return recon_audio + + if self.num_stems > 1: + assert target.ndim == 4 and target.shape[1] == self.num_stems + + if target.ndim == 2: + target = rearrange(target, "... t -> ... 
1 t") + + target = target[ + ..., : recon_audio.shape[-1] + ] # protect against lost length on istft + + loss = F.l1_loss(recon_audio, target) + + multi_stft_resolution_loss = 0.0 + + for window_size in self.multi_stft_resolutions_window_sizes: + res_stft_kwargs = dict( + n_fft=max( + window_size, self.multi_stft_n_fft + ), # not sure what n_fft is across multi resolution stft + win_length=window_size, + return_complex=True, + window=self.multi_stft_window_fn(window_size, device=device), + **self.multi_stft_kwargs, + ) + + recon_Y = torch.stft( + rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs + ) + target_Y = torch.stft( + rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs + ) + + multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss( + recon_Y, target_Y + ) + + weighted_multi_resolution_loss = ( + multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight + ) + + total_loss = loss + weighted_multi_resolution_loss + + if not return_loss_breakdown: + return total_loss + + return total_loss, (loss, multi_stft_resolution_loss) diff --git a/programs/music_separation_code/models/demucs4ht.py b/programs/music_separation_code/models/demucs4ht.py new file mode 100644 index 0000000000000000000000000000000000000000..888ee82a19cf612b62504dfd8c0d9204f7a906f6 --- /dev/null +++ b/programs/music_separation_code/models/demucs4ht.py @@ -0,0 +1,712 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +import numpy as np +import torch +import json +from omegaconf import OmegaConf +from demucs.demucs import Demucs +from demucs.hdemucs import HDemucs + +import math +from openunmix.filtering import wiener +from torch import nn +from torch.nn import functional as F +from fractions import Fraction +from einops import rearrange + +from demucs.transformer import CrossTransformerEncoder + +from demucs.demucs import rescale_module +from demucs.states import capture_init +from demucs.spec import spectro, ispectro +from demucs.hdemucs import pad1d, ScaledEmbedding, HEncLayer, MultiWrap, HDecLayer + + +class HTDemucs(nn.Module): + """ + Spectrogram and hybrid Demucs model. + The spectrogram model has the same structure as Demucs, except the first few layers are over the + frequency axis, until there is only 1 frequency, and then it moves to time convolutions. + Frequency layers can still access information across time steps thanks to the DConv residual. + + Hybrid model have a parallel time branch. At some layer, the time branch has the same stride + as the frequency branch and then the two are combined. The opposite happens in the decoder. + + Models can either use naive iSTFT from masking, Wiener filtering ([Ulhih et al. 2017]), + or complex as channels (CaC) [Choi et al. 2020]. Wiener filtering is based on + Open Unmix implementation [Stoter et al. 2019]. + + The loss is always on the temporal domain, by backpropagating through the above + output methods and iSTFT. This allows to define hybrid models nicely. However, this breaks + a bit Wiener filtering, as doing more iteration at test time will change the spectrogram + contribution, without changing the one from the waveform, which will lead to worse performance. + I tried using the residual option in OpenUnmix Wiener implementation, but it didn't improve. + CaC on the other hand provides similar performance for hybrid, and works naturally with + hybrid models. 
+ + This model also uses frequency embeddings are used to improve efficiency on convolutions + over the freq. axis, following [Isik et al. 2020] (https://arxiv.org/pdf/2008.04470.pdf). + + Unlike classic Demucs, there is no resampling here, and normalization is always applied. + """ + + @capture_init + def __init__( + self, + sources, + # Channels + audio_channels=2, + channels=48, + channels_time=None, + growth=2, + # STFT + nfft=4096, + num_subbands=1, + wiener_iters=0, + end_iters=0, + wiener_residual=False, + cac=True, + # Main structure + depth=4, + rewrite=True, + # Frequency branch + multi_freqs=None, + multi_freqs_depth=3, + freq_emb=0.2, + emb_scale=10, + emb_smooth=True, + # Convolutions + kernel_size=8, + time_stride=2, + stride=4, + context=1, + context_enc=0, + # Normalization + norm_starts=4, + norm_groups=4, + # DConv residual branch + dconv_mode=1, + dconv_depth=2, + dconv_comp=8, + dconv_init=1e-3, + # Before the Transformer + bottom_channels=0, + # Transformer + t_layers=5, + t_emb="sin", + t_hidden_scale=4.0, + t_heads=8, + t_dropout=0.0, + t_max_positions=10000, + t_norm_in=True, + t_norm_in_group=False, + t_group_norm=False, + t_norm_first=True, + t_norm_out=True, + t_max_period=10000.0, + t_weight_decay=0.0, + t_lr=None, + t_layer_scale=True, + t_gelu=True, + t_weight_pos_embed=1.0, + t_sin_random_shift=0, + t_cape_mean_normalize=True, + t_cape_augment=True, + t_cape_glob_loc_scale=[5000.0, 1.0, 1.4], + t_sparse_self_attn=False, + t_sparse_cross_attn=False, + t_mask_type="diag", + t_mask_random_seed=42, + t_sparse_attn_window=500, + t_global_window=100, + t_sparsity=0.95, + t_auto_sparsity=False, + # ------ Particuliar parameters + t_cross_first=False, + # Weight init + rescale=0.1, + # Metadata + samplerate=44100, + segment=10, + use_train_segment=False, + ): + """ + Args: + sources (list[str]): list of source names. + audio_channels (int): input/output audio channels. + channels (int): initial number of hidden channels. + channels_time: if not None, use a different `channels` value for the time branch. + growth: increase the number of hidden channels by this factor at each layer. + nfft: number of fft bins. Note that changing this require careful computation of + various shape parameters and will not work out of the box for hybrid models. + wiener_iters: when using Wiener filtering, number of iterations at test time. + end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`. + wiener_residual: add residual source before wiener filtering. + cac: uses complex as channels, i.e. complex numbers are 2 channels each + in input and output. no further processing is done before ISTFT. + depth (int): number of layers in the encoder and in the decoder. + rewrite (bool): add 1x1 convolution to each layer. + multi_freqs: list of frequency ratios for splitting frequency bands with `MultiWrap`. + multi_freqs_depth: how many layers to wrap with `MultiWrap`. Only the outermost + layers will be wrapped. + freq_emb: add frequency embedding after the first frequency layer if > 0, + the actual value controls the weight of the embedding. + emb_scale: equivalent to scaling the embedding learning rate + emb_smooth: initialize the embedding with a smooth one (with respect to frequencies). + kernel_size: kernel_size for encoder and decoder layers. + stride: stride for encoder and decoder layers. + time_stride: stride for the final time layer, after the merge. + context: context for 1x1 conv in the decoder. + context_enc: context for 1x1 conv in the encoder. 
+ norm_starts: layer at which group norm starts being used. + decoder layers are numbered in reverse order. + norm_groups: number of groups for group norm. + dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both. + dconv_depth: depth of residual DConv branch. + dconv_comp: compression of DConv branch. + dconv_attn: adds attention layers in DConv branch starting at this layer. + dconv_lstm: adds a LSTM layer in DConv branch starting at this layer. + dconv_init: initial scale for the DConv branch LayerScale. + bottom_channels: if >0 it adds a linear layer (1x1 Conv) before and after the + transformer in order to change the number of channels + t_layers: number of layers in each branch (waveform and spec) of the transformer + t_emb: "sin", "cape" or "scaled" + t_hidden_scale: the hidden scale of the Feedforward parts of the transformer + for instance if C = 384 (the number of channels in the transformer) and + t_hidden_scale = 4.0 then the intermediate layer of the FFN has dimension + 384 * 4 = 1536 + t_heads: number of heads for the transformer + t_dropout: dropout in the transformer + t_max_positions: max_positions for the "scaled" positional embedding, only + useful if t_emb="scaled" + t_norm_in: (bool) norm before addinf positional embedding and getting into the + transformer layers + t_norm_in_group: (bool) if True while t_norm_in=True, the norm is on all the + timesteps (GroupNorm with group=1) + t_group_norm: (bool) if True, the norms of the Encoder Layers are on all the + timesteps (GroupNorm with group=1) + t_norm_first: (bool) if True the norm is before the attention and before the FFN + t_norm_out: (bool) if True, there is a GroupNorm (group=1) at the end of each layer + t_max_period: (float) denominator in the sinusoidal embedding expression + t_weight_decay: (float) weight decay for the transformer + t_lr: (float) specific learning rate for the transformer + t_layer_scale: (bool) Layer Scale for the transformer + t_gelu: (bool) activations of the transformer are GeLU if True, ReLU else + t_weight_pos_embed: (float) weighting of the positional embedding + t_cape_mean_normalize: (bool) if t_emb="cape", normalisation of positional embeddings + see: https://arxiv.org/abs/2106.03143 + t_cape_augment: (bool) if t_emb="cape", must be True during training and False + during the inference, see: https://arxiv.org/abs/2106.03143 + t_cape_glob_loc_scale: (list of 3 floats) if t_emb="cape", CAPE parameters + see: https://arxiv.org/abs/2106.03143 + t_sparse_self_attn: (bool) if True, the self attentions are sparse + t_sparse_cross_attn: (bool) if True, the cross-attentions are sparse (don't use it + unless you designed really specific masks) + t_mask_type: (str) can be "diag", "jmask", "random", "global" or any combination + with '_' between: i.e. "diag_jmask_random" (note that this is permutation + invariant i.e. "diag_jmask_random" is equivalent to "jmask_random_diag") + t_mask_random_seed: (int) if "random" is in t_mask_type, controls the seed + that generated the random part of the mask + t_sparse_attn_window: (int) if "diag" is in t_mask_type, for a query (i), and + a key (j), the mask is True id |i-j|<=t_sparse_attn_window + t_global_window: (int) if "global" is in t_mask_type, mask[:t_global_window, :] + and mask[:, :t_global_window] will be True + t_sparsity: (float) if "random" is in t_mask_type, t_sparsity is the sparsity + level of the random part of the mask. 
+ t_cross_first: (bool) if True cross attention is the first layer of the + transformer (False seems to be better) + rescale: weight rescaling trick + use_train_segment: (bool) if True, the actual size that is used during the + training is used during inference. + """ + super().__init__() + self.num_subbands = num_subbands + self.cac = cac + self.wiener_residual = wiener_residual + self.audio_channels = audio_channels + self.sources = sources + self.kernel_size = kernel_size + self.context = context + self.stride = stride + self.depth = depth + self.bottom_channels = bottom_channels + self.channels = channels + self.samplerate = samplerate + self.segment = segment + self.use_train_segment = use_train_segment + self.nfft = nfft + self.hop_length = nfft // 4 + self.wiener_iters = wiener_iters + self.end_iters = end_iters + self.freq_emb = None + assert wiener_iters == end_iters + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + self.tencoder = nn.ModuleList() + self.tdecoder = nn.ModuleList() + + chin = audio_channels + chin_z = chin # number of channels for the freq branch + if self.cac: + chin_z *= 2 + if self.num_subbands > 1: + chin_z *= self.num_subbands + chout = channels_time or channels + chout_z = channels + freqs = nfft // 2 + + for index in range(depth): + norm = index >= norm_starts + freq = freqs > 1 + stri = stride + ker = kernel_size + if not freq: + assert freqs == 1 + ker = time_stride * 2 + stri = time_stride + + pad = True + last_freq = False + if freq and freqs <= kernel_size: + ker = freqs + pad = False + last_freq = True + + kw = { + "kernel_size": ker, + "stride": stri, + "freq": freq, + "pad": pad, + "norm": norm, + "rewrite": rewrite, + "norm_groups": norm_groups, + "dconv_kw": { + "depth": dconv_depth, + "compress": dconv_comp, + "init": dconv_init, + "gelu": True, + }, + } + kwt = dict(kw) + kwt["freq"] = 0 + kwt["kernel_size"] = kernel_size + kwt["stride"] = stride + kwt["pad"] = True + kw_dec = dict(kw) + multi = False + if multi_freqs and index < multi_freqs_depth: + multi = True + kw_dec["context_freq"] = False + + if last_freq: + chout_z = max(chout, chout_z) + chout = chout_z + + enc = HEncLayer( + chin_z, chout_z, dconv=dconv_mode & 1, context=context_enc, **kw + ) + if freq: + tenc = HEncLayer( + chin, + chout, + dconv=dconv_mode & 1, + context=context_enc, + empty=last_freq, + **kwt, + ) + self.tencoder.append(tenc) + + if multi: + enc = MultiWrap(enc, multi_freqs) + self.encoder.append(enc) + if index == 0: + chin = self.audio_channels * len(self.sources) + chin_z = chin + if self.cac: + chin_z *= 2 + if self.num_subbands > 1: + chin_z *= self.num_subbands + dec = HDecLayer( + chout_z, + chin_z, + dconv=dconv_mode & 2, + last=index == 0, + context=context, + **kw_dec, + ) + if multi: + dec = MultiWrap(dec, multi_freqs) + if freq: + tdec = HDecLayer( + chout, + chin, + dconv=dconv_mode & 2, + empty=last_freq, + last=index == 0, + context=context, + **kwt, + ) + self.tdecoder.insert(0, tdec) + self.decoder.insert(0, dec) + + chin = chout + chin_z = chout_z + chout = int(growth * chout) + chout_z = int(growth * chout_z) + if freq: + if freqs <= kernel_size: + freqs = 1 + else: + freqs //= stride + if index == 0 and freq_emb: + self.freq_emb = ScaledEmbedding( + freqs, chin_z, smooth=emb_smooth, scale=emb_scale + ) + self.freq_emb_scale = freq_emb + + if rescale: + rescale_module(self, reference=rescale) + + transformer_channels = channels * growth ** (depth - 1) + if bottom_channels: + self.channel_upsampler = 
nn.Conv1d(transformer_channels, bottom_channels, 1) + self.channel_downsampler = nn.Conv1d( + bottom_channels, transformer_channels, 1 + ) + self.channel_upsampler_t = nn.Conv1d( + transformer_channels, bottom_channels, 1 + ) + self.channel_downsampler_t = nn.Conv1d( + bottom_channels, transformer_channels, 1 + ) + + transformer_channels = bottom_channels + + if t_layers > 0: + self.crosstransformer = CrossTransformerEncoder( + dim=transformer_channels, + emb=t_emb, + hidden_scale=t_hidden_scale, + num_heads=t_heads, + num_layers=t_layers, + cross_first=t_cross_first, + dropout=t_dropout, + max_positions=t_max_positions, + norm_in=t_norm_in, + norm_in_group=t_norm_in_group, + group_norm=t_group_norm, + norm_first=t_norm_first, + norm_out=t_norm_out, + max_period=t_max_period, + weight_decay=t_weight_decay, + lr=t_lr, + layer_scale=t_layer_scale, + gelu=t_gelu, + sin_random_shift=t_sin_random_shift, + weight_pos_embed=t_weight_pos_embed, + cape_mean_normalize=t_cape_mean_normalize, + cape_augment=t_cape_augment, + cape_glob_loc_scale=t_cape_glob_loc_scale, + sparse_self_attn=t_sparse_self_attn, + sparse_cross_attn=t_sparse_cross_attn, + mask_type=t_mask_type, + mask_random_seed=t_mask_random_seed, + sparse_attn_window=t_sparse_attn_window, + global_window=t_global_window, + sparsity=t_sparsity, + auto_sparsity=t_auto_sparsity, + ) + else: + self.crosstransformer = None + + def _spec(self, x): + hl = self.hop_length + nfft = self.nfft + x0 = x # noqa + + # We re-pad the signal in order to keep the property + # that the size of the output is exactly the size of the input + # divided by the stride (here hop_length), when divisible. + # This is achieved by padding by 1/4th of the kernel size (here nfft). + # which is not supported by torch.stft. + # Having all convolution operations follow this convention allow to easily + # align the time and frequency branches later on. + assert hl == nfft // 4 + le = int(math.ceil(x.shape[-1] / hl)) + pad = hl // 2 * 3 + x = pad1d(x, (pad, pad + le * hl - x.shape[-1]), mode="reflect") + + z = spectro(x, nfft, hl)[..., :-1, :] + assert z.shape[-1] == le + 4, (z.shape, x.shape, le) + z = z[..., 2 : 2 + le] + return z + + def _ispec(self, z, length=None, scale=0): + hl = self.hop_length // (4**scale) + z = F.pad(z, (0, 0, 0, 1)) + z = F.pad(z, (2, 2)) + pad = hl // 2 * 3 + le = hl * int(math.ceil(length / hl)) + 2 * pad + x = ispectro(z, hl, length=le) + x = x[..., pad : pad + length] + return x + + def _magnitude(self, z): + # return the magnitude of the spectrogram, except when cac is True, + # in which case we just move the complex dimension to the channel one. + if self.cac: + B, C, Fr, T = z.shape + m = torch.view_as_real(z).permute(0, 1, 4, 2, 3) + m = m.reshape(B, C * 2, Fr, T) + else: + m = z.abs() + return m + + def _mask(self, z, m): + # Apply masking given the mixture spectrogram `z` and the estimated mask `m`. + # If `cac` is True, `m` is actually a full spectrogram and `z` is ignored. + niters = self.wiener_iters + if self.cac: + B, S, C, Fr, T = m.shape + out = m.view(B, S, -1, 2, Fr, T).permute(0, 1, 2, 4, 5, 3) + out = torch.view_as_complex(out.contiguous()) + return out + if self.training: + niters = self.end_iters + if niters < 0: + z = z[:, None] + return z / (1e-8 + z.abs()) * m + else: + return self._wiener(m, z, niters) + + def _wiener(self, mag_out, mix_stft, niters): + # apply wiener filtering from OpenUnmix. 
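+        # `wiener` from openunmix.filtering refines per-source complex STFTs from the
+        # magnitude estimates and the mixture STFT, running `niters` refinement steps.
+        # The time axis is processed in chunks of `wiener_win_len` frames to bound
+        # memory; with `wiener_residual`, an extra residual source is appended and
+        # stripped again after filtering.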
+ init = mix_stft.dtype + wiener_win_len = 300 + residual = self.wiener_residual + + B, S, C, Fq, T = mag_out.shape + mag_out = mag_out.permute(0, 4, 3, 2, 1) + mix_stft = torch.view_as_real(mix_stft.permute(0, 3, 2, 1)) + + outs = [] + for sample in range(B): + pos = 0 + out = [] + for pos in range(0, T, wiener_win_len): + frame = slice(pos, pos + wiener_win_len) + z_out = wiener( + mag_out[sample, frame], + mix_stft[sample, frame], + niters, + residual=residual, + ) + out.append(z_out.transpose(-1, -2)) + outs.append(torch.cat(out, dim=0)) + out = torch.view_as_complex(torch.stack(outs, 0)) + out = out.permute(0, 4, 3, 2, 1).contiguous() + if residual: + out = out[:, :-1] + assert list(out.shape) == [B, S, C, Fq, T] + return out.to(init) + + def valid_length(self, length: int): + """ + Return a length that is appropriate for evaluation. + In our case, always return the training length, unless + it is smaller than the given length, in which case this + raises an error. + """ + if not self.use_train_segment: + return length + training_length = int(self.segment * self.samplerate) + if training_length < length: + raise ValueError( + f"Given length {length} is longer than " + f"training length {training_length}" + ) + return training_length + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, mix): + length = mix.shape[-1] + length_pre_pad = None + if self.use_train_segment: + if self.training: + self.segment = Fraction(mix.shape[-1], self.samplerate) + else: + training_length = int(self.segment * self.samplerate) + # print('Training length: {} Segment: {} Sample rate: {}'.format(training_length, self.segment, self.samplerate)) + if mix.shape[-1] < training_length: + length_pre_pad = mix.shape[-1] + mix = F.pad(mix, (0, training_length - length_pre_pad)) + # print("Mix: {}".format(mix.shape)) + # print("Length: {}".format(length)) + z = self._spec(mix) + # print("Z: {} Type: {}".format(z.shape, z.dtype)) + mag = self._magnitude(z) + x = mag + # print("MAG: {} Type: {}".format(x.shape, x.dtype)) + + if self.num_subbands > 1: + x = self.cac2cws(x) + # print("After SUBBANDS: {} Type: {}".format(x.shape, x.dtype)) + + B, C, Fq, T = x.shape + + # unlike previous Demucs, we always normalize because it is easier. + mean = x.mean(dim=(1, 2, 3), keepdim=True) + std = x.std(dim=(1, 2, 3), keepdim=True) + x = (x - mean) / (1e-5 + std) + # x will be the freq. branch input. + + # Prepare the time branch input. + xt = mix + meant = xt.mean(dim=(1, 2), keepdim=True) + stdt = xt.std(dim=(1, 2), keepdim=True) + xt = (xt - meant) / (1e-5 + stdt) + + # print("XT: {}".format(xt.shape)) + + # okay, this is a giant mess I know... + saved = [] # skip connections, freq. + saved_t = [] # skip connections, time. + lengths = [] # saved lengths to properly remove padding, freq branch. + lengths_t = [] # saved lengths for time branch. + for idx, encode in enumerate(self.encoder): + lengths.append(x.shape[-1]) + inject = None + if idx < len(self.tencoder): + # we have not yet merged branches. 
+ lengths_t.append(xt.shape[-1]) + tenc = self.tencoder[idx] + xt = tenc(xt) + # print("Encode XT {}: {}".format(idx, xt.shape)) + if not tenc.empty: + # save for skip connection + saved_t.append(xt) + else: + # tenc contains just the first conv., so that now time and freq. + # branches have the same shape and can be merged. + inject = xt + x = encode(x, inject) + # print("Encode X {}: {}".format(idx, x.shape)) + if idx == 0 and self.freq_emb is not None: + # add frequency embedding to allow for non equivariant convolutions + # over the frequency axis. + frs = torch.arange(x.shape[-2], device=x.device) + emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x) + x = x + self.freq_emb_scale * emb + + saved.append(x) + if self.crosstransformer: + if self.bottom_channels: + b, c, f, t = x.shape + x = rearrange(x, "b c f t-> b c (f t)") + x = self.channel_upsampler(x) + x = rearrange(x, "b c (f t)-> b c f t", f=f) + xt = self.channel_upsampler_t(xt) + + x, xt = self.crosstransformer(x, xt) + # print("Cross Tran X {}, XT: {}".format(x.shape, xt.shape)) + + if self.bottom_channels: + x = rearrange(x, "b c f t-> b c (f t)") + x = self.channel_downsampler(x) + x = rearrange(x, "b c (f t)-> b c f t", f=f) + xt = self.channel_downsampler_t(xt) + + for idx, decode in enumerate(self.decoder): + skip = saved.pop(-1) + x, pre = decode(x, skip, lengths.pop(-1)) + # print('Decode {} X: {}'.format(idx, x.shape)) + # `pre` contains the output just before final transposed convolution, + # which is used when the freq. and time branch separate. + + offset = self.depth - len(self.tdecoder) + if idx >= offset: + tdec = self.tdecoder[idx - offset] + length_t = lengths_t.pop(-1) + if tdec.empty: + assert pre.shape[2] == 1, pre.shape + pre = pre[:, :, 0] + xt, _ = tdec(pre, None, length_t) + else: + skip = saved_t.pop(-1) + xt, _ = tdec(xt, skip, length_t) + # print('Decode {} XT: {}'.format(idx, xt.shape)) + + # Let's make sure we used all stored skip connections. 
+ assert len(saved) == 0 + assert len(lengths_t) == 0 + assert len(saved_t) == 0 + + S = len(self.sources) + + if self.num_subbands > 1: + x = x.view(B, -1, Fq, T) + # print("X view 1: {}".format(x.shape)) + x = self.cws2cac(x) + # print("X view 2: {}".format(x.shape)) + + x = x.view(B, S, -1, Fq * self.num_subbands, T) + x = x * std[:, None] + mean[:, None] + # print("X returned: {}".format(x.shape)) + + zout = self._mask(z, x) + if self.use_train_segment: + if self.training: + x = self._ispec(zout, length) + else: + x = self._ispec(zout, training_length) + else: + x = self._ispec(zout, length) + + if self.use_train_segment: + if self.training: + xt = xt.view(B, S, -1, length) + else: + xt = xt.view(B, S, -1, training_length) + else: + xt = xt.view(B, S, -1, length) + xt = xt * stdt[:, None] + meant[:, None] + x = xt + x + if length_pre_pad: + x = x[..., :length_pre_pad] + return x + + +def get_model(args): + extra = { + "sources": list(args.training.instruments), + "audio_channels": args.training.channels, + "samplerate": args.training.samplerate, + # 'segment': args.model_segment or 4 * args.dset.segment, + "segment": args.training.segment, + } + klass = { + "demucs": Demucs, + "hdemucs": HDemucs, + "htdemucs": HTDemucs, + }[args.model] + kw = OmegaConf.to_container(getattr(args, args.model), resolve=True) + model = klass(**extra, **kw) + return model diff --git a/programs/music_separation_code/models/mdx23c_tfc_tdf_v3.py b/programs/music_separation_code/models/mdx23c_tfc_tdf_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..ad89c85b0c643a464dbaf1a8202ee539cef3f9be --- /dev/null +++ b/programs/music_separation_code/models/mdx23c_tfc_tdf_v3.py @@ -0,0 +1,260 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + + +class STFT: + def __init__(self, config): + self.n_fft = config.n_fft + self.hop_length = config.hop_length + self.window = torch.hann_window(window_length=self.n_fft, periodic=True) + self.dim_f = config.dim_f + + def __call__(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-2] + c, t = x.shape[-2:] + x = x.reshape([-1, t]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + window=window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape( + [*batch_dims, c * 2, -1, x.shape[-1]] + ) + return x[..., : self.dim_f, :] + + def inverse(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-3] + c, f, t = x.shape[-3:] + n = self.n_fft // 2 + 1 + f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) + x = torch.cat([x, f_pad], -2) + x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) + x = x.permute([0, 2, 3, 1]) + x = x[..., 0] + x[..., 1] * 1.0j + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True + ) + x = x.reshape([*batch_dims, 2, -1]) + return x + + +def get_norm(norm_type): + def norm(c, norm_type): + if norm_type == "BatchNorm": + return nn.BatchNorm2d(c) + elif norm_type == "InstanceNorm": + return nn.InstanceNorm2d(c, affine=True) + elif "GroupNorm" in norm_type: + g = int(norm_type.replace("GroupNorm", "")) + return nn.GroupNorm(num_groups=g, num_channels=c) + else: + return nn.Identity() + + return partial(norm, norm_type=norm_type) + + +def get_act(act_type): + if act_type == "gelu": + return nn.GELU() + elif act_type == "relu": + return nn.ReLU() + elif 
act_type[:3] == "elu": + alpha = float(act_type.replace("elu", "")) + return nn.ELU(alpha) + else: + raise Exception + + +class Upscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.ConvTranspose2d( + in_channels=in_c, + out_channels=out_c, + kernel_size=scale, + stride=scale, + bias=False, + ), + ) + + def forward(self, x): + return self.conv(x) + + +class Downscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.Conv2d( + in_channels=in_c, + out_channels=out_c, + kernel_size=scale, + stride=scale, + bias=False, + ), + ) + + def forward(self, x): + return self.conv(x) + + +class TFC_TDF(nn.Module): + def __init__(self, in_c, c, l, f, bn, norm, act): + super().__init__() + + self.blocks = nn.ModuleList() + for i in range(l): + block = nn.Module() + + block.tfc1 = nn.Sequential( + norm(in_c), + act, + nn.Conv2d(in_c, c, 3, 1, 1, bias=False), + ) + block.tdf = nn.Sequential( + norm(c), + act, + nn.Linear(f, f // bn, bias=False), + norm(c), + act, + nn.Linear(f // bn, f, bias=False), + ) + block.tfc2 = nn.Sequential( + norm(c), + act, + nn.Conv2d(c, c, 3, 1, 1, bias=False), + ) + block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False) + + self.blocks.append(block) + in_c = c + + def forward(self, x): + for block in self.blocks: + s = block.shortcut(x) + x = block.tfc1(x) + x = x + block.tdf(x) + x = block.tfc2(x) + x = x + s + return x + + +class TFC_TDF_net(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + norm = get_norm(norm_type=config.model.norm) + act = get_act(act_type=config.model.act) + + self.num_target_instruments = ( + 1 if config.training.target_instrument else len(config.training.instruments) + ) + self.num_subbands = config.model.num_subbands + + dim_c = self.num_subbands * config.audio.num_channels * 2 + n = config.model.num_scales + scale = config.model.scale + l = config.model.num_blocks_per_scale + c = config.model.num_channels + g = config.model.growth + bn = config.model.bottleneck_factor + f = config.audio.dim_f // self.num_subbands + + self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) + + self.encoder_blocks = nn.ModuleList() + for i in range(n): + block = nn.Module() + block.tfc_tdf = TFC_TDF(c, c, l, f, bn, norm, act) + block.downscale = Downscale(c, c + g, scale, norm, act) + f = f // scale[1] + c += g + self.encoder_blocks.append(block) + + self.bottleneck_block = TFC_TDF(c, c, l, f, bn, norm, act) + + self.decoder_blocks = nn.ModuleList() + for i in range(n): + block = nn.Module() + block.upscale = Upscale(c, c - g, scale, norm, act) + f = f * scale[1] + c -= g + block.tfc_tdf = TFC_TDF(2 * c, c, l, f, bn, norm, act) + self.decoder_blocks.append(block) + + self.final_conv = nn.Sequential( + nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), + act, + nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False), + ) + + self.stft = STFT(config.audio) + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, x): + + x = self.stft(x) + + mix = x = self.cac2cws(x) + + first_conv_out = x = self.first_conv(x) + + x = x.transpose(-1, -2) + + encoder_outputs = [] 
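+        # U-Net style pass: each encoder block applies a TFC-TDF stack and then
+        # downscales the time-frequency grid while growing channels by `g`; the
+        # pre-downscale outputs are kept here so the decoder can concatenate them
+        # back as skip connections after each upscale.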
+ for block in self.encoder_blocks: + x = block.tfc_tdf(x) + encoder_outputs.append(x) + x = block.downscale(x) + + x = self.bottleneck_block(x) + + for block in self.decoder_blocks: + x = block.upscale(x) + x = torch.cat([x, encoder_outputs.pop()], 1) + x = block.tfc_tdf(x) + + x = x.transpose(-1, -2) + + x = x * first_conv_out # reduce artifacts + + x = self.final_conv(torch.cat([mix, x], 1)) + + x = self.cws2cac(x) + + if self.num_target_instruments > 1: + b, c, f, t = x.shape + x = x.reshape(b, self.num_target_instruments, -1, f, t) + + x = self.stft.inverse(x) + + return x diff --git a/programs/music_separation_code/models/scnet/__init__.py b/programs/music_separation_code/models/scnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5f6ecefede9345237623066dd21ebd8253af1c60 --- /dev/null +++ b/programs/music_separation_code/models/scnet/__init__.py @@ -0,0 +1 @@ +from .scnet import SCNet diff --git a/programs/music_separation_code/models/scnet/scnet.py b/programs/music_separation_code/models/scnet/scnet.py new file mode 100644 index 0000000000000000000000000000000000000000..71807479eb10bc803f2fb1f7d7f193bd30416167 --- /dev/null +++ b/programs/music_separation_code/models/scnet/scnet.py @@ -0,0 +1,419 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from collections import deque +from .separation import SeparationNet +import typing as tp +import math + + +class Swish(nn.Module): + def forward(self, x): + return x * x.sigmoid() + + +class ConvolutionModule(nn.Module): + """ + Convolution Module in SD block. + + Args: + channels (int): input/output channels. + depth (int): number of layers in the residual branch. Each layer has its own + compress (float): amount of channel compression. + kernel (int): kernel size for the convolutions. + """ + + def __init__(self, channels, depth=2, compress=4, kernel=3): + super().__init__() + assert kernel % 2 == 1 + self.depth = abs(depth) + hidden_size = int(channels / compress) + norm = lambda d: nn.GroupNorm(1, d) + self.layers = nn.ModuleList([]) + for _ in range(self.depth): + padding = kernel // 2 + mods = [ + norm(channels), + nn.Conv1d(channels, hidden_size * 2, kernel, padding=padding), + nn.GLU(1), + nn.Conv1d( + hidden_size, + hidden_size, + kernel, + padding=padding, + groups=hidden_size, + ), + norm(hidden_size), + Swish(), + nn.Conv1d(hidden_size, channels, 1), + ] + layer = nn.Sequential(*mods) + self.layers.append(layer) + + def forward(self, x): + for layer in self.layers: + x = x + layer(x) + return x + + +class FusionLayer(nn.Module): + """ + A FusionLayer within the decoder. + + Args: + - channels (int): Number of input channels. + - kernel_size (int, optional): Kernel size for the convolutional layer, defaults to 3. + - stride (int, optional): Stride for the convolutional layer, defaults to 1. + - padding (int, optional): Padding for the convolutional layer, defaults to 1. + """ + + def __init__(self, channels, kernel_size=3, stride=1, padding=1): + super(FusionLayer, self).__init__() + self.conv = nn.Conv2d( + channels * 2, channels * 2, kernel_size, stride=stride, padding=padding + ) + + def forward(self, x, skip=None): + if skip is not None: + x += skip + x = x.repeat(1, 2, 1, 1) + x = self.conv(x) + x = F.glu(x, dim=1) + return x + + +class SDlayer(nn.Module): + """ + Implements a Sparse Down-sample Layer for processing different frequency bands separately. + + Args: + - channels_in (int): Input channel count. + - channels_out (int): Output channel count. 
+ - band_configs (dict): A dictionary containing configuration for each frequency band. + Keys are 'low', 'mid', 'high' for each band, and values are + dictionaries with keys 'SR', 'stride', and 'kernel' for proportion, + stride, and kernel size, respectively. + """ + + def __init__(self, channels_in, channels_out, band_configs): + super(SDlayer, self).__init__() + + # Initializing convolutional layers for each band + self.convs = nn.ModuleList() + self.strides = [] + self.kernels = [] + for config in band_configs.values(): + self.convs.append( + nn.Conv2d( + channels_in, + channels_out, + (config["kernel"], 1), + (config["stride"], 1), + (0, 0), + ) + ) + self.strides.append(config["stride"]) + self.kernels.append(config["kernel"]) + + # Saving rate proportions for determining splits + self.SR_low = band_configs["low"]["SR"] + self.SR_mid = band_configs["mid"]["SR"] + + def forward(self, x): + B, C, Fr, T = x.shape + # Define splitting points based on sampling rates + splits = [ + (0, math.ceil(Fr * self.SR_low)), + (math.ceil(Fr * self.SR_low), math.ceil(Fr * (self.SR_low + self.SR_mid))), + (math.ceil(Fr * (self.SR_low + self.SR_mid)), Fr), + ] + + # Processing each band with the corresponding convolution + outputs = [] + original_lengths = [] + for conv, stride, kernel, (start, end) in zip( + self.convs, self.strides, self.kernels, splits + ): + extracted = x[:, :, start:end, :] + original_lengths.append(end - start) + current_length = extracted.shape[2] + + # padding + if stride == 1: + total_padding = kernel - stride + else: + total_padding = (stride - current_length % stride) % stride + pad_left = total_padding // 2 + pad_right = total_padding - pad_left + + padded = F.pad(extracted, (0, 0, pad_left, pad_right)) + + output = conv(padded) + outputs.append(output) + + return outputs, original_lengths + + +class SUlayer(nn.Module): + """ + Implements a Sparse Up-sample Layer in decoder. + + Args: + - channels_in: The number of input channels. + - channels_out: The number of output channels. + - convtr_configs: Dictionary containing the configurations for transposed convolutions. + """ + + def __init__(self, channels_in, channels_out, band_configs): + super(SUlayer, self).__init__() + + # Initializing convolutional layers for each band + self.convtrs = nn.ModuleList( + [ + nn.ConvTranspose2d( + channels_in, + channels_out, + [config["kernel"], 1], + [config["stride"], 1], + ) + for _, config in band_configs.items() + ] + ) + + def forward(self, x, lengths, origin_lengths): + B, C, Fr, T = x.shape + # Define splitting points based on input lengths + splits = [ + (0, lengths[0]), + (lengths[0], lengths[0] + lengths[1]), + (lengths[0] + lengths[1], None), + ] + # Processing each band with the corresponding convolution + outputs = [] + for idx, (convtr, (start, end)) in enumerate(zip(self.convtrs, splits)): + out = convtr(x[:, :, start:end, :]) + # Calculate the distance to trim the output symmetrically to original length + current_Fr_length = out.shape[2] + dist = abs(origin_lengths[idx] - current_Fr_length) // 2 + + # Trim the output to the original length symmetrically + trimmed_out = out[:, :, dist : dist + origin_lengths[idx], :] + + outputs.append(trimmed_out) + + # Concatenate trimmed outputs along the frequency dimension to return the final tensor + x = torch.cat(outputs, dim=2) + + return x + + +class SDblock(nn.Module): + """ + Implements a simplified Sparse Down-sample block in encoder. + + Args: + - channels_in (int): Number of input channels. 
+ - channels_out (int): Number of output channels. + - band_config (dict): Configuration for the SDlayer specifying band splits and convolutions. + - conv_config (dict): Configuration for convolution modules applied to each band. + - depths (list of int): List specifying the convolution depths for low, mid, and high frequency bands. + """ + + def __init__( + self, + channels_in, + channels_out, + band_configs={}, + conv_config={}, + depths=[3, 2, 1], + kernel_size=3, + ): + super(SDblock, self).__init__() + self.SDlayer = SDlayer(channels_in, channels_out, band_configs) + + # Dynamically create convolution modules for each band based on depths + self.conv_modules = nn.ModuleList( + [ConvolutionModule(channels_out, depth, **conv_config) for depth in depths] + ) + # Set the kernel_size to an odd number. + self.globalconv = nn.Conv2d( + channels_out, channels_out, kernel_size, 1, (kernel_size - 1) // 2 + ) + + def forward(self, x): + bands, original_lengths = self.SDlayer(x) + # B, C, f, T = band.shape + bands = [ + F.gelu( + conv(band.permute(0, 2, 1, 3).reshape(-1, band.shape[1], band.shape[3])) + .view(band.shape[0], band.shape[2], band.shape[1], band.shape[3]) + .permute(0, 2, 1, 3) + ) + for conv, band in zip(self.conv_modules, bands) + ] + lengths = [band.size(-2) for band in bands] + full_band = torch.cat(bands, dim=2) + skip = full_band + + output = self.globalconv(full_band) + + return output, skip, lengths, original_lengths + + +class SCNet(nn.Module): + """ + The implementation of SCNet: Sparse Compression Network for Music Source Separation. Paper: https://arxiv.org/abs/2401.13276.pdf + + Args: + - sources (List[str]): List of sources to be separated. + - audio_channels (int): Number of audio channels. + - nfft (int): Number of FFTs to determine the frequency dimension of the input. + - hop_size (int): Hop size for the STFT. + - win_size (int): Window size for STFT. + - normalized (bool): Whether to normalize the STFT. + - dims (List[int]): List of channel dimensions for each block. + - band_SR (List[float]): The proportion of each frequency band. + - band_stride (List[int]): The down-sampling ratio of each frequency band. + - band_kernel (List[int]): The kernel sizes for down-sampling convolution in each frequency band + - conv_depths (List[int]): List specifying the number of convolution modules in each SD block. + - compress (int): Compression factor for convolution module. + - conv_kernel (int): Kernel size for convolution layer in convolution module. + - num_dplayer (int): Number of dual-path layers. + - expand (int): Expansion factor in the dual-path RNN, default is 1. 
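+    Shapes (stereo example): a (batch, 2, samples) mix is padded to an odd multiple of
+    hop_size, STFT'd and packed into a (batch, 4, F, T) real/imag tensor, encoded by the
+    SD blocks, processed by the dual-path separation network, decoded, and finally
+    reshaped to (batch, len(sources), 2, samples).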
+ + """ + + def __init__( + self, + sources=["drums", "bass", "other", "vocals"], + audio_channels=2, + # Main structure + dims=[4, 32, 64, 128], # dims = [4, 64, 128, 256] in SCNet-large + # STFT + nfft=4096, + hop_size=1024, + win_size=4096, + normalized=True, + # SD/SU layer + band_SR=[0.175, 0.392, 0.433], + band_stride=[1, 4, 16], + band_kernel=[3, 4, 16], + # Convolution Module + conv_depths=[3, 2, 1], + compress=4, + conv_kernel=3, + # Dual-path RNN + num_dplayer=6, + expand=1, + ): + super().__init__() + self.sources = sources + self.audio_channels = audio_channels + self.dims = dims + band_keys = ["low", "mid", "high"] + self.band_configs = { + band_keys[i]: { + "SR": band_SR[i], + "stride": band_stride[i], + "kernel": band_kernel[i], + } + for i in range(len(band_keys)) + } + self.hop_length = hop_size + self.conv_config = { + "compress": compress, + "kernel": conv_kernel, + } + + self.stft_config = { + "n_fft": nfft, + "hop_length": hop_size, + "win_length": win_size, + "center": True, + "normalized": normalized, + } + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + for index in range(len(dims) - 1): + enc = SDblock( + channels_in=dims[index], + channels_out=dims[index + 1], + band_configs=self.band_configs, + conv_config=self.conv_config, + depths=conv_depths, + ) + self.encoder.append(enc) + + dec = nn.Sequential( + FusionLayer(channels=dims[index + 1]), + SUlayer( + channels_in=dims[index + 1], + channels_out=( + dims[index] if index != 0 else dims[index] * len(sources) + ), + band_configs=self.band_configs, + ), + ) + self.decoder.insert(0, dec) + + self.separation_net = SeparationNet( + channels=dims[-1], + expand=expand, + num_layers=num_dplayer, + ) + + def forward(self, x): + # B, C, L = x.shape + B = x.shape[0] + # In the initial padding, ensure that the number of frames after the STFT (the length of the T dimension) is even, + # so that the RFFT operation can be used in the separation network. 
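+        # For example, with the default hop_size=1024 and a 44100-sample input:
+        # 44100 % 1024 = 68, so padding = 956 and the padded length is 45056 = 44 * 1024;
+        # 44 is even, so one more hop is added (46080 = 45 * 1024), which yields
+        # 45 + 1 = 46 STFT frames (center=True), i.e. an even T.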
+ padding = self.hop_length - x.shape[-1] % self.hop_length + if (x.shape[-1] + padding) // self.hop_length % 2 == 0: + padding += self.hop_length + x = F.pad(x, (0, padding)) + + # STFT + L = x.shape[-1] + x = x.reshape(-1, L) + x = torch.stft(x, **self.stft_config, return_complex=True) + x = torch.view_as_real(x) + x = x.permute(0, 3, 1, 2).reshape( + x.shape[0] // self.audio_channels, + x.shape[3] * self.audio_channels, + x.shape[1], + x.shape[2], + ) + + B, C, Fr, T = x.shape + + save_skip = deque() + save_lengths = deque() + save_original_lengths = deque() + # encoder + for sd_layer in self.encoder: + x, skip, lengths, original_lengths = sd_layer(x) + save_skip.append(skip) + save_lengths.append(lengths) + save_original_lengths.append(original_lengths) + + # separation + x = self.separation_net(x) + + # decoder + for fusion_layer, su_layer in self.decoder: + x = fusion_layer(x, save_skip.pop()) + x = su_layer(x, save_lengths.pop(), save_original_lengths.pop()) + + # output + n = self.dims[0] + x = x.view(B, n, -1, Fr, T) + x = x.reshape(-1, 2, Fr, T).permute(0, 2, 3, 1) + x = torch.view_as_complex(x.contiguous()) + x = torch.istft(x, **self.stft_config) + x = x.reshape(B, len(self.sources), self.audio_channels, -1) + + x = x[:, :, :, :-padding] + + return x diff --git a/programs/music_separation_code/models/scnet/separation.py b/programs/music_separation_code/models/scnet/separation.py new file mode 100644 index 0000000000000000000000000000000000000000..8965e2c8b14fa2c1fb6a2766e840c45128e1303d --- /dev/null +++ b/programs/music_separation_code/models/scnet/separation.py @@ -0,0 +1,129 @@ +import torch +import torch.nn as nn +from torch.nn.modules.rnn import LSTM + + +class FeatureConversion(nn.Module): + """ + Integrates into the adjacent Dual-Path layer. + + Args: + channels (int): Number of input channels. + inverse (bool): If True, uses ifft; otherwise, uses rfft. + """ + + def __init__(self, channels, inverse): + super().__init__() + self.inverse = inverse + self.channels = channels + + def forward(self, x): + # B, C, F, T = x.shape + if self.inverse: + x = x.float() + x_r = x[:, : self.channels // 2, :, :] + x_i = x[:, self.channels // 2 :, :, :] + x = torch.complex(x_r, x_i) + x = torch.fft.irfft(x, dim=3, norm="ortho") + else: + x = x.float() + x = torch.fft.rfft(x, dim=3, norm="ortho") + x_real = x.real + x_imag = x.imag + x = torch.cat([x_real, x_imag], dim=1) + return x + + +class DualPathRNN(nn.Module): + """ + Dual-Path RNN in Separation Network. + + Args: + d_model (int): The number of expected features in the input (input_size). + expand (int): Expansion factor used to calculate the hidden_size of LSTM. + bidirectional (bool): If True, becomes a bidirectional LSTM. 
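+    The forward pass runs a frequency-path LSTM over (B*T, F, C) and a time-path LSTM
+    over (B*F, T, C), each preceded by GroupNorm and followed by a linear projection
+    and a residual connection.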
+ """ + + def __init__(self, d_model, expand, bidirectional=True): + super(DualPathRNN, self).__init__() + + self.d_model = d_model + self.hidden_size = d_model * expand + self.bidirectional = bidirectional + # Initialize LSTM layers and normalization layers + self.lstm_layers = nn.ModuleList( + [self._init_lstm_layer(self.d_model, self.hidden_size) for _ in range(2)] + ) + self.linear_layers = nn.ModuleList( + [nn.Linear(self.hidden_size * 2, self.d_model) for _ in range(2)] + ) + self.norm_layers = nn.ModuleList([nn.GroupNorm(1, d_model) for _ in range(2)]) + + def _init_lstm_layer(self, d_model, hidden_size): + return LSTM( + d_model, + hidden_size, + num_layers=1, + bidirectional=self.bidirectional, + batch_first=True, + ) + + def forward(self, x): + B, C, F, T = x.shape + + # Process dual-path rnn + original_x = x + # Frequency-path + x = self.norm_layers[0](x) + x = x.transpose(1, 3).contiguous().view(B * T, F, C) + x, _ = self.lstm_layers[0](x) + x = self.linear_layers[0](x) + x = x.view(B, T, F, C).transpose(1, 3) + x = x + original_x + + original_x = x + # Time-path + x = self.norm_layers[1](x) + x = x.transpose(1, 2).contiguous().view(B * F, C, T).transpose(1, 2) + x, _ = self.lstm_layers[1](x) + x = self.linear_layers[1](x) + x = x.transpose(1, 2).contiguous().view(B, F, C, T).transpose(1, 2) + x = x + original_x + + return x + + +class SeparationNet(nn.Module): + """ + Implements a simplified Sparse Down-sample block in an encoder architecture. + + Args: + - channels (int): Number input channels. + - expand (int): Expansion factor used to calculate the hidden_size of LSTM. + - num_layers (int): Number of dual-path layers. + """ + + def __init__(self, channels, expand=1, num_layers=6): + super(SeparationNet, self).__init__() + + self.num_layers = num_layers + + self.dp_modules = nn.ModuleList( + [ + DualPathRNN(channels * (2 if i % 2 == 1 else 1), expand) + for i in range(num_layers) + ] + ) + + self.feature_conversion = nn.ModuleList( + [ + FeatureConversion(channels * 2, inverse=False if i % 2 == 0 else True) + for i in range(num_layers) + ] + ) + + def forward(self, x): + for i in range(self.num_layers): + x = self.dp_modules[i](x) + x = self.feature_conversion[i](x) + return x diff --git a/programs/music_separation_code/models/scnet_unofficial/__init__.py b/programs/music_separation_code/models/scnet_unofficial/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..298d9939f5177c6b24cca743c83a351a84a6ffce --- /dev/null +++ b/programs/music_separation_code/models/scnet_unofficial/__init__.py @@ -0,0 +1 @@ +from models.scnet_unofficial.scnet import SCNet diff --git a/programs/music_separation_code/models/scnet_unofficial/modules/__init__.py b/programs/music_separation_code/models/scnet_unofficial/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..69617bb15044d9bbfd0211fcdfa0fa605b01c048 --- /dev/null +++ b/programs/music_separation_code/models/scnet_unofficial/modules/__init__.py @@ -0,0 +1,3 @@ +from models.scnet_unofficial.modules.dualpath_rnn import DualPathRNN +from models.scnet_unofficial.modules.sd_encoder import SDBlock +from models.scnet_unofficial.modules.su_decoder import SUBlock diff --git a/programs/music_separation_code/models/scnet_unofficial/modules/dualpath_rnn.py b/programs/music_separation_code/models/scnet_unofficial/modules/dualpath_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..644d05a19cc83798402ee08f269b08a52eaeac09 --- /dev/null +++ 
b/programs/music_separation_code/models/scnet_unofficial/modules/dualpath_rnn.py @@ -0,0 +1,238 @@ +import torch +import torch.nn as nn +import torch.nn.functional as Func + + +class RMSNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + return Func.normalize(x, dim=-1) * self.scale * self.gamma + + +class MambaModule(nn.Module): + def __init__(self, d_model, d_state, d_conv, d_expand): + super().__init__() + self.norm = RMSNorm(dim=d_model) + self.mamba = Mamba( + d_model=d_model, d_state=d_state, d_conv=d_conv, d_expand=d_expand + ) + + def forward(self, x): + x = x + self.mamba(self.norm(x)) + return x + + +class RNNModule(nn.Module): + """ + RNNModule class implements a recurrent neural network module with LSTM cells. + + Args: + - input_dim (int): Dimensionality of the input features. + - hidden_dim (int): Dimensionality of the hidden state of the LSTM. + - bidirectional (bool, optional): If True, uses bidirectional LSTM. Defaults to True. + + Shapes: + - Input: (B, T, D) where + B is batch size, + T is sequence length, + D is input dimensionality. + - Output: (B, T, D) where + B is batch size, + T is sequence length, + D is input dimensionality. + """ + + def __init__(self, input_dim: int, hidden_dim: int, bidirectional: bool = True): + """ + Initializes RNNModule with input dimension, hidden dimension, and bidirectional flag. + """ + super().__init__() + self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=input_dim) + self.rnn = nn.LSTM( + input_dim, hidden_dim, batch_first=True, bidirectional=bidirectional + ) + self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, input_dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the RNNModule. + + Args: + - x (torch.Tensor): Input tensor of shape (B, T, D). + + Returns: + - torch.Tensor: Output tensor of shape (B, T, D). + """ + x = x.transpose(1, 2) + x = self.groupnorm(x) + x = x.transpose(1, 2) + + x, (hidden, _) = self.rnn(x) + x = self.fc(x) + return x + + +class RFFTModule(nn.Module): + """ + RFFTModule class implements a module for performing real-valued Fast Fourier Transform (FFT) + or its inverse on input tensors. + + Args: + - inverse (bool, optional): If False, performs forward FFT. If True, performs inverse FFT. Defaults to False. + + Shapes: + - Input: (B, F, T, D) where + B is batch size, + F is the number of features, + T is sequence length, + D is input dimensionality. + - Output: (B, F, T // 2 + 1, D * 2) if performing forward FFT. + (B, F, T, D // 2, 2) if performing inverse FFT. + """ + + def __init__(self, inverse: bool = False): + """ + Initializes RFFTModule with inverse flag. + """ + super().__init__() + self.inverse = inverse + + def forward(self, x: torch.Tensor, time_dim: int) -> torch.Tensor: + """ + Performs forward or inverse FFT on the input tensor x. + + Args: + - x (torch.Tensor): Input tensor of shape (B, F, T, D). + - time_dim (int): Input size of time dimension. + + Returns: + - torch.Tensor: Output tensor after FFT or its inverse operation. 
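The inverse branch of RFFTModule receives the original time length because the rfft output alone cannot distinguish an even-length signal from an odd one. A small illustration of that ambiguity (toy lengths assumed):

```python
import torch

# Both lengths below produce 6 rfft bins, so irfft needs n to restore the right length.
for T in (10, 11):
    spec = torch.fft.rfft(torch.randn(T))
    print(T, spec.shape[-1], torch.fft.irfft(spec, n=T).shape[-1])
# 10 6 10
# 11 6 11
```

This is why `time_dim` is threaded from the DualPathRNN forward pass into every RFFTModule call.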
+ """ + dtype = x.dtype + B, F, T, D = x.shape + + # RuntimeError: cuFFT only supports dimensions whose sizes are powers of two when computing in half precision + x = x.float() + + if not self.inverse: + x = torch.fft.rfft(x, dim=2) + x = torch.view_as_real(x) + x = x.reshape(B, F, T // 2 + 1, D * 2) + else: + x = x.reshape(B, F, T, D // 2, 2) + x = torch.view_as_complex(x) + x = torch.fft.irfft(x, n=time_dim, dim=2) + + x = x.to(dtype) + return x + + def extra_repr(self) -> str: + """ + Returns extra representation string with module's configuration. + """ + return f"inverse={self.inverse}" + + +class DualPathRNN(nn.Module): + """ + DualPathRNN class implements a neural network with alternating layers of RNNModule and RFFTModule. + + Args: + - n_layers (int): Number of layers in the network. + - input_dim (int): Dimensionality of the input features. + - hidden_dim (int): Dimensionality of the hidden state of the RNNModule. + + Shapes: + - Input: (B, F, T, D) where + B is batch size, + F is the number of features (frequency dimension), + T is sequence length (time dimension), + D is input dimensionality (channel dimension). + - Output: (B, F, T, D) where + B is batch size, + F is the number of features (frequency dimension), + T is sequence length (time dimension), + D is input dimensionality (channel dimension). + """ + + def __init__( + self, + n_layers: int, + input_dim: int, + hidden_dim: int, + use_mamba: bool = False, + d_state: int = 16, + d_conv: int = 4, + d_expand: int = 2, + ): + """ + Initializes DualPathRNN with the specified number of layers, input dimension, and hidden dimension. + """ + super().__init__() + + if use_mamba: + from mamba_ssm.modules.mamba_simple import Mamba + + net = MambaModule + dkwargs = { + "d_model": input_dim, + "d_state": d_state, + "d_conv": d_conv, + "d_expand": d_expand, + } + ukwargs = { + "d_model": input_dim * 2, + "d_state": d_state, + "d_conv": d_conv, + "d_expand": d_expand * 2, + } + else: + net = RNNModule + dkwargs = {"input_dim": input_dim, "hidden_dim": hidden_dim} + ukwargs = {"input_dim": input_dim * 2, "hidden_dim": hidden_dim * 2} + + self.layers = nn.ModuleList() + for i in range(1, n_layers + 1): + kwargs = dkwargs if i % 2 == 1 else ukwargs + layer = nn.ModuleList( + [ + net(**kwargs), + net(**kwargs), + RFFTModule(inverse=(i % 2 == 0)), + ] + ) + self.layers.append(layer) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the DualPathRNN. + + Args: + - x (torch.Tensor): Input tensor of shape (B, F, T, D). + + Returns: + - torch.Tensor: Output tensor of shape (B, F, T, D). 
+ """ + + time_dim = x.shape[2] + + for time_layer, freq_layer, rfft_layer in self.layers: + B, F, T, D = x.shape + + x = x.reshape((B * F), T, D) + x = time_layer(x) + x = x.reshape(B, F, T, D) + x = x.permute(0, 2, 1, 3) + + x = x.reshape((B * T), F, D) + x = freq_layer(x) + x = x.reshape(B, T, F, D) + x = x.permute(0, 2, 1, 3) + + x = rfft_layer(x, time_dim) + + return x diff --git a/programs/music_separation_code/models/scnet_unofficial/modules/sd_encoder.py b/programs/music_separation_code/models/scnet_unofficial/modules/sd_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..742577f480693671437dc50358a1a65d251b6e9b --- /dev/null +++ b/programs/music_separation_code/models/scnet_unofficial/modules/sd_encoder.py @@ -0,0 +1,285 @@ +from typing import List, Tuple + +import torch +import torch.nn as nn + +from models.scnet_unofficial.utils import create_intervals + + +class Downsample(nn.Module): + """ + Downsample class implements a module for downsampling input tensors using 2D convolution. + + Args: + - input_dim (int): Dimensionality of the input channels. + - output_dim (int): Dimensionality of the output channels. + - stride (int): Stride value for the convolution operation. + + Shapes: + - Input: (B, C_in, F, T) where + B is batch size, + C_in is the number of input channels, + F is the frequency dimension, + T is the time dimension. + - Output: (B, C_out, F // stride, T) where + B is batch size, + C_out is the number of output channels, + F // stride is the downsampled frequency dimension. + + """ + + def __init__( + self, + input_dim: int, + output_dim: int, + stride: int, + ): + """ + Initializes Downsample with input dimension, output dimension, and stride. + """ + super().__init__() + self.conv = nn.Conv2d(input_dim, output_dim, 1, (stride, 1)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the Downsample module. + + Args: + - x (torch.Tensor): Input tensor of shape (B, C_in, F, T). + + Returns: + - torch.Tensor: Downsampled tensor of shape (B, C_out, F // stride, T). + """ + return self.conv(x) + + +class ConvolutionModule(nn.Module): + """ + ConvolutionModule class implements a module with a sequence of convolutional layers similar to Conformer. + + Args: + - input_dim (int): Dimensionality of the input features. + - hidden_dim (int): Dimensionality of the hidden features. + - kernel_sizes (List[int]): List of kernel sizes for the convolutional layers. + - bias (bool, optional): If True, adds a learnable bias to the output. Default is False. + + Shapes: + - Input: (B, T, D) where + B is batch size, + T is sequence length, + D is input dimensionality. + - Output: (B, T, D) where + B is batch size, + T is sequence length, + D is input dimensionality. + """ + + def __init__( + self, + input_dim: int, + hidden_dim: int, + kernel_sizes: List[int], + bias: bool = False, + ) -> None: + """ + Initializes ConvolutionModule with input dimension, hidden dimension, kernel sizes, and bias. 
+ """ + super().__init__() + self.sequential = nn.Sequential( + nn.GroupNorm(num_groups=1, num_channels=input_dim), + nn.Conv1d( + input_dim, + 2 * hidden_dim, + kernel_sizes[0], + stride=1, + padding=(kernel_sizes[0] - 1) // 2, + bias=bias, + ), + nn.GLU(dim=1), + nn.Conv1d( + hidden_dim, + hidden_dim, + kernel_sizes[1], + stride=1, + padding=(kernel_sizes[1] - 1) // 2, + groups=hidden_dim, + bias=bias, + ), + nn.GroupNorm(num_groups=1, num_channels=hidden_dim), + nn.SiLU(), + nn.Conv1d( + hidden_dim, + input_dim, + kernel_sizes[2], + stride=1, + padding=(kernel_sizes[2] - 1) // 2, + bias=bias, + ), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the ConvolutionModule. + + Args: + - x (torch.Tensor): Input tensor of shape (B, T, D). + + Returns: + - torch.Tensor: Output tensor of shape (B, T, D). + """ + x = x.transpose(1, 2) + x = x + self.sequential(x) + x = x.transpose(1, 2) + return x + + +class SDLayer(nn.Module): + """ + SDLayer class implements a subband decomposition layer with downsampling and convolutional modules. + + Args: + - subband_interval (Tuple[float, float]): Tuple representing the frequency interval for subband decomposition. + - input_dim (int): Dimensionality of the input channels. + - output_dim (int): Dimensionality of the output channels after downsampling. + - downsample_stride (int): Stride value for the downsampling operation. + - n_conv_modules (int): Number of convolutional modules. + - kernel_sizes (List[int]): List of kernel sizes for the convolutional layers. + - bias (bool, optional): If True, adds a learnable bias to the convolutional layers. Default is True. + + Shapes: + - Input: (B, Fi, T, Ci) where + B is batch size, + Fi is the number of input subbands, + T is sequence length, and + Ci is the number of input channels. + - Output: (B, Fi+1, T, Ci+1) where + B is batch size, + Fi+1 is the number of output subbands, + T is sequence length, + Ci+1 is the number of output channels. + """ + + def __init__( + self, + subband_interval: Tuple[float, float], + input_dim: int, + output_dim: int, + downsample_stride: int, + n_conv_modules: int, + kernel_sizes: List[int], + bias: bool = True, + ): + """ + Initializes SDLayer with subband interval, input dimension, + output dimension, downsample stride, number of convolutional modules, kernel sizes, and bias. + """ + super().__init__() + self.subband_interval = subband_interval + self.downsample = Downsample(input_dim, output_dim, downsample_stride) + self.activation = nn.GELU() + conv_modules = [ + ConvolutionModule( + input_dim=output_dim, + hidden_dim=output_dim // 4, + kernel_sizes=kernel_sizes, + bias=bias, + ) + for _ in range(n_conv_modules) + ] + self.conv_modules = nn.Sequential(*conv_modules) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the SDLayer. + + Args: + - x (torch.Tensor): Input tensor of shape (B, Fi, T, Ci). + + Returns: + - torch.Tensor: Output tensor of shape (B, Fi+1, T, Ci+1). + """ + B, F, T, C = x.shape + x = x[:, int(self.subband_interval[0] * F) : int(self.subband_interval[1] * F)] + x = x.permute(0, 3, 1, 2) + x = self.downsample(x) + x = self.activation(x) + x = x.permute(0, 2, 3, 1) + + B, F, T, C = x.shape + x = x.reshape((B * F), T, C) + x = self.conv_modules(x) + x = x.reshape(B, F, T, C) + + return x + + +class SDBlock(nn.Module): + """ + SDBlock class implements a block with subband decomposition layers and global convolution. 
+ + Args: + - input_dim (int): Dimensionality of the input channels. + - output_dim (int): Dimensionality of the output channels. + - bandsplit_ratios (List[float]): List of ratios for splitting the frequency bands. + - downsample_strides (List[int]): List of stride values for downsampling in each subband layer. + - n_conv_modules (List[int]): List specifying the number of convolutional modules in each subband layer. + - kernel_sizes (List[int], optional): List of kernel sizes for the convolutional layers. Default is None. + + Shapes: + - Input: (B, Fi, T, Ci) where + B is batch size, + Fi is the number of input subbands, + T is sequence length, + Ci is the number of input channels. + - Output: (B, Fi+1, T, Ci+1) where + B is batch size, + Fi+1 is the number of output subbands, + T is sequence length, + Ci+1 is the number of output channels. + """ + + def __init__( + self, + input_dim: int, + output_dim: int, + bandsplit_ratios: List[float], + downsample_strides: List[int], + n_conv_modules: List[int], + kernel_sizes: List[int] = None, + ): + """ + Initializes SDBlock with input dimension, output dimension, band split ratios, downsample strides, number of convolutional modules, and kernel sizes. + """ + super().__init__() + if kernel_sizes is None: + kernel_sizes = [3, 3, 1] + assert sum(bandsplit_ratios) == 1, "The split ratios must sum up to 1." + subband_intervals = create_intervals(bandsplit_ratios) + self.sd_layers = nn.ModuleList( + SDLayer( + input_dim=input_dim, + output_dim=output_dim, + subband_interval=sbi, + downsample_stride=dss, + n_conv_modules=ncm, + kernel_sizes=kernel_sizes, + ) + for sbi, dss, ncm in zip( + subband_intervals, downsample_strides, n_conv_modules + ) + ) + self.global_conv2d = nn.Conv2d(output_dim, output_dim, 1, 1) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Performs forward pass through the SDBlock. + + Args: + - x (torch.Tensor): Input tensor of shape (B, Fi, T, Ci). + + Returns: + - Tuple[torch.Tensor, torch.Tensor]: Output tensor and skip connection tensor. + """ + x_skip = torch.concat([layer(x) for layer in self.sd_layers], dim=1) + x = self.global_conv2d(x_skip.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + return x, x_skip diff --git a/programs/music_separation_code/models/scnet_unofficial/modules/su_decoder.py b/programs/music_separation_code/models/scnet_unofficial/modules/su_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..660c1fa6cbfd9b43bed73204a0bb6593524de272 --- /dev/null +++ b/programs/music_separation_code/models/scnet_unofficial/modules/su_decoder.py @@ -0,0 +1,241 @@ +from typing import List, Tuple + +import torch +import torch.nn as nn + +from models.scnet_unofficial.utils import get_convtranspose_output_padding + + +class FusionLayer(nn.Module): + """ + FusionLayer class implements a module for fusing two input tensors using convolutional operations. + + Args: + - input_dim (int): Dimensionality of the input channels. + - kernel_size (int, optional): Kernel size for the convolutional layer. Default is 3. + - stride (int, optional): Stride value for the convolutional layer. Default is 1. + - padding (int, optional): Padding value for the convolutional layer. Default is 1. + + Shapes: + - Input: (B, F, T, C) and (B, F, T, C) where + B is batch size, + F is the number of features, + T is sequence length, + C is input dimensionality. + - Output: (B, F, T, C) where + B is batch size, + F is the number of features, + T is sequence length, + C is input dimensionality. 
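SDBlock above splits the frequency axis according to `bandsplit_ratios`, and each SDLayer slices its interval before downsampling. A sketch of how ratios map to bin ranges, using hypothetical values and the same integer truncation as the `create_intervals` helper:

```python
n_freq_bins = 2049              # n_fft // 2 + 1 for n_fft = 4096 (assumed)
ratios = [0.175, 0.392, 0.433]  # example ratios; they must sum to 1

start, slices = 0.0, []
for r in ratios:
    end = start + r
    slices.append((int(start * n_freq_bins), int(end * n_freq_bins)))
    start = end
print(slices)   # three contiguous (low_bin, high_bin) ranges covering 0..2049
```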
+ """ + + def __init__( + self, input_dim: int, kernel_size: int = 3, stride: int = 1, padding: int = 1 + ): + """ + Initializes FusionLayer with input dimension, kernel size, stride, and padding. + """ + super().__init__() + self.conv = nn.Conv2d( + input_dim * 2, + input_dim * 2, + kernel_size=(kernel_size, 1), + stride=(stride, 1), + padding=(padding, 0), + ) + self.activation = nn.GLU() + + def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the FusionLayer. + + Args: + - x1 (torch.Tensor): First input tensor of shape (B, F, T, C). + - x2 (torch.Tensor): Second input tensor of shape (B, F, T, C). + + Returns: + - torch.Tensor: Output tensor of shape (B, F, T, C). + """ + x = x1 + x2 + x = x.repeat(1, 1, 1, 2) + x = self.conv(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + x = self.activation(x) + return x + + +class Upsample(nn.Module): + """ + Upsample class implements a module for upsampling input tensors using transposed 2D convolution. + + Args: + - input_dim (int): Dimensionality of the input channels. + - output_dim (int): Dimensionality of the output channels. + - stride (int): Stride value for the transposed convolution operation. + - output_padding (int): Output padding value for the transposed convolution operation. + + Shapes: + - Input: (B, C_in, F, T) where + B is batch size, + C_in is the number of input channels, + F is the frequency dimension, + T is the time dimension. + - Output: (B, C_out, F * stride + output_padding, T) where + B is batch size, + C_out is the number of output channels, + F * stride + output_padding is the upsampled frequency dimension. + """ + + def __init__( + self, input_dim: int, output_dim: int, stride: int, output_padding: int + ): + """ + Initializes Upsample with input dimension, output dimension, stride, and output padding. + """ + super().__init__() + self.conv = nn.ConvTranspose2d( + input_dim, output_dim, 1, (stride, 1), output_padding=(output_padding, 0) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the Upsample module. + + Args: + - x (torch.Tensor): Input tensor of shape (B, C_in, F, T). + + Returns: + - torch.Tensor: Output tensor of shape (B, C_out, F * stride + output_padding, T). + """ + return self.conv(x) + + +class SULayer(nn.Module): + """ + SULayer class implements a subband upsampling layer using transposed convolution. + + Args: + - input_dim (int): Dimensionality of the input channels. + - output_dim (int): Dimensionality of the output channels. + - upsample_stride (int): Stride value for the upsampling operation. + - subband_shape (int): Shape of the subband. + - sd_interval (Tuple[int, int]): Start and end indices of the subband interval. + + Shapes: + - Input: (B, F, T, C) where + B is batch size, + F is the number of features, + T is sequence length, + C is input dimensionality. + - Output: (B, F, T, C) where + B is batch size, + F is the number of features, + T is sequence length, + C is input dimensionality. + """ + + def __init__( + self, + input_dim: int, + output_dim: int, + upsample_stride: int, + subband_shape: int, + sd_interval: Tuple[int, int], + ): + """ + Initializes SULayer with input dimension, output dimension, upsample stride, subband shape, and subband interval. 
+ """ + super().__init__() + sd_shape = sd_interval[1] - sd_interval[0] + upsample_output_padding = get_convtranspose_output_padding( + input_shape=sd_shape, output_shape=subband_shape, stride=upsample_stride + ) + self.upsample = Upsample( + input_dim=input_dim, + output_dim=output_dim, + stride=upsample_stride, + output_padding=upsample_output_padding, + ) + self.sd_interval = sd_interval + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the SULayer. + + Args: + - x (torch.Tensor): Input tensor of shape (B, F, T, C). + + Returns: + - torch.Tensor: Output tensor of shape (B, F, T, C). + """ + x = x[:, self.sd_interval[0] : self.sd_interval[1]] + x = x.permute(0, 3, 1, 2) + x = self.upsample(x) + x = x.permute(0, 2, 3, 1) + return x + + +class SUBlock(nn.Module): + """ + SUBlock class implements a block with fusion layer and subband upsampling layers. + + Args: + - input_dim (int): Dimensionality of the input channels. + - output_dim (int): Dimensionality of the output channels. + - upsample_strides (List[int]): List of stride values for the upsampling operations. + - subband_shapes (List[int]): List of shapes for the subbands. + - sd_intervals (List[Tuple[int, int]]): List of intervals for subband decomposition. + + Shapes: + - Input: (B, Fi-1, T, Ci-1) and (B, Fi-1, T, Ci-1) where + B is batch size, + Fi-1 is the number of input subbands, + T is sequence length, + Ci-1 is the number of input channels. + - Output: (B, Fi, T, Ci) where + B is batch size, + Fi is the number of output subbands, + T is sequence length, + Ci is the number of output channels. + """ + + def __init__( + self, + input_dim: int, + output_dim: int, + upsample_strides: List[int], + subband_shapes: List[int], + sd_intervals: List[Tuple[int, int]], + ): + """ + Initializes SUBlock with input dimension, output dimension, + upsample strides, subband shapes, and subband intervals. + """ + super().__init__() + self.fusion_layer = FusionLayer(input_dim=input_dim) + self.su_layers = nn.ModuleList( + SULayer( + input_dim=input_dim, + output_dim=output_dim, + upsample_stride=uss, + subband_shape=sbs, + sd_interval=sdi, + ) + for i, (uss, sbs, sdi) in enumerate( + zip(upsample_strides, subband_shapes, sd_intervals) + ) + ) + + def forward(self, x: torch.Tensor, x_skip: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the SUBlock. + + Args: + - x (torch.Tensor): Input tensor of shape (B, Fi-1, T, Ci-1). + - x_skip (torch.Tensor): Input skip connection tensor of shape (B, Fi-1, T, Ci-1). + + Returns: + - torch.Tensor: Output tensor of shape (B, Fi, T, Ci). 
+ """ + x = self.fusion_layer(x, x_skip) + x = torch.concat([layer(x) for layer in self.su_layers], dim=1) + return x diff --git a/programs/music_separation_code/models/scnet_unofficial/scnet.py b/programs/music_separation_code/models/scnet_unofficial/scnet.py new file mode 100644 index 0000000000000000000000000000000000000000..d6dcf7285da2358b6bf2dcd2b9a177fed6022a0d --- /dev/null +++ b/programs/music_separation_code/models/scnet_unofficial/scnet.py @@ -0,0 +1,246 @@ +""" +SCNet - great paper, great implementation +https://arxiv.org/pdf/2401.13276.pdf +https://github.com/amanteur/SCNet-PyTorch +""" + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio + +from models.scnet_unofficial.modules import DualPathRNN, SDBlock, SUBlock +from models.scnet_unofficial.utils import compute_sd_layer_shapes, compute_gcr + +from einops import rearrange, pack, unpack +from functools import partial + +from beartype.typing import Tuple, Optional, List, Callable +from beartype import beartype + + +def exists(val): + return val is not None + + +def default(v, d): + return v if exists(v) else d + + +def pack_one(t, pattern): + return pack([t], pattern) + + +def unpack_one(t, ps, pattern): + return unpack(t, ps, pattern)[0] + + +class RMSNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + return F.normalize(x, dim=-1) * self.scale * self.gamma + + +class BandSplit(nn.Module): + @beartype + def __init__(self, dim, dim_inputs: Tuple[int, ...]): + super().__init__() + self.dim_inputs = dim_inputs + self.to_features = ModuleList([]) + + for dim_in in dim_inputs: + net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim)) + + self.to_features.append(net) + + def forward(self, x): + x = x.split(self.dim_inputs, dim=-1) + + outs = [] + for split_input, to_feature in zip(x, self.to_features): + split_output = to_feature(split_input) + outs.append(split_output) + + return torch.stack(outs, dim=-2) + + +class SCNet(nn.Module): + """ + SCNet class implements a source separation network, + which explicitly split the spectrogram of the mixture into several subbands + and introduce a sparsity-based encoder to model different frequency bands. + + Paper: "SCNET: SPARSE COMPRESSION NETWORK FOR MUSIC SOURCE SEPARATION" + Authors: Weinan Tong, Jiaxu Zhu et al. + Link: https://arxiv.org/abs/2401.13276.pdf + + Args: + - n_fft (int): Number of FFTs to determine the frequency dimension of the input. + - dims (List[int]): List of channel dimensions for each block. + - bandsplit_ratios (List[float]): List of ratios for splitting the frequency bands. + - downsample_strides (List[int]): List of stride values for downsampling in each block. + - n_conv_modules (List[int]): List specifying the number of convolutional modules in each block. + - n_rnn_layers (int): Number of recurrent layers in the dual path RNN. + - rnn_hidden_dim (int): Dimensionality of the hidden state in the dual path RNN. + - n_sources (int, optional): Number of sources to be separated. Default is 4. + + Shapes: + - Input: (B, C, T) where + B is batch size, + C is channel dim (mono / stereo), + T is time dim + - Output: (B, N, C, T) where + B is batch size, + N is the number of sources. 
+ C is channel dim (mono / stereo), + T is sequence length, + """ + + @beartype + def __init__( + self, + n_fft: int, + dims: List[int], + bandsplit_ratios: List[float], + downsample_strides: List[int], + n_conv_modules: List[int], + n_rnn_layers: int, + rnn_hidden_dim: int, + n_sources: int = 4, + hop_length: int = 1024, + win_length: int = 4096, + stft_window_fn: Optional[Callable] = None, + stft_normalized: bool = False, + **kwargs, + ): + """ + Initializes SCNet with input parameters. + """ + super().__init__() + self.assert_input_data( + bandsplit_ratios, + downsample_strides, + n_conv_modules, + ) + + n_blocks = len(dims) - 1 + n_freq_bins = n_fft // 2 + 1 + subband_shapes, sd_intervals = compute_sd_layer_shapes( + input_shape=n_freq_bins, + bandsplit_ratios=bandsplit_ratios, + downsample_strides=downsample_strides, + n_layers=n_blocks, + ) + self.sd_blocks = nn.ModuleList( + SDBlock( + input_dim=dims[i], + output_dim=dims[i + 1], + bandsplit_ratios=bandsplit_ratios, + downsample_strides=downsample_strides, + n_conv_modules=n_conv_modules, + ) + for i in range(n_blocks) + ) + self.dualpath_blocks = DualPathRNN( + n_layers=n_rnn_layers, + input_dim=dims[-1], + hidden_dim=rnn_hidden_dim, + **kwargs, + ) + self.su_blocks = nn.ModuleList( + SUBlock( + input_dim=dims[i + 1], + output_dim=dims[i] if i != 0 else dims[i] * n_sources, + subband_shapes=subband_shapes[i], + sd_intervals=sd_intervals[i], + upsample_strides=downsample_strides, + ) + for i in reversed(range(n_blocks)) + ) + self.gcr = compute_gcr(subband_shapes) + + self.stft_kwargs = dict( + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + normalized=stft_normalized, + ) + + self.stft_window_fn = partial( + default(stft_window_fn, torch.hann_window), win_length + ) + self.n_sources = n_sources + self.hop_length = hop_length + + @staticmethod + def assert_input_data(*args): + """ + Asserts that the shapes of input features are equal. + """ + for arg1 in args: + for arg2 in args: + if len(arg1) != len(arg2): + raise ValueError( + f"Shapes of input features {arg1} and {arg2} are not equal." + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Performs forward pass through the SCNet. + + Args: + - x (torch.Tensor): Input tensor of shape (B, C, T). + + Returns: + - torch.Tensor: Output tensor of shape (B, N, C, T). 
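The forward pass that follows first pads the waveform up to the next multiple of `hop_length`, so every sample falls into a complete STFT frame, and trims the same number of samples after the inverse STFT. A quick length check with assumed sizes:

```python
import torch
import torch.nn.functional as F

hop_length, T = 1024, 44100
stft_pad = hop_length - T % hop_length     # 956 samples for these assumed values
x = F.pad(torch.randn(1, 2, T), (0, stft_pad))
print(x.shape[-1] % hop_length)            # 0
print(x[..., :-stft_pad].shape[-1])        # 44100 again after trimming
```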
+ """ + + device = x.device + stft_window = self.stft_window_fn(device=device) + + if x.ndim == 2: + x = rearrange(x, "b t -> b 1 t") + + c = x.shape[1] + + stft_pad = self.hop_length - x.shape[-1] % self.hop_length + x = F.pad(x, (0, stft_pad)) + + # stft + x, ps = pack_one(x, "* t") + x = torch.stft(x, **self.stft_kwargs, window=stft_window, return_complex=True) + x = torch.view_as_real(x) + x = unpack_one(x, ps, "* c f t") + x = rearrange(x, "b c f t r -> b f t (c r)") + + # encoder part + x_skips = [] + for sd_block in self.sd_blocks: + x, x_skip = sd_block(x) + x_skips.append(x_skip) + + # separation part + x = self.dualpath_blocks(x) + + # decoder part + for su_block, x_skip in zip(self.su_blocks, reversed(x_skips)): + x = su_block(x, x_skip) + + # istft + x = rearrange(x, "b f t (c r n) -> b n c f t r", c=c, n=self.n_sources, r=2) + x = x.contiguous() + + x = torch.view_as_complex(x) + x = rearrange(x, "b n c f t -> (b n c) f t") + x = torch.istft(x, **self.stft_kwargs, window=stft_window, return_complex=False) + x = rearrange(x, "(b n c) t -> b n c t", c=c, n=self.n_sources) + + x = x[..., :-stft_pad] + + return x diff --git a/programs/music_separation_code/models/scnet_unofficial/utils.py b/programs/music_separation_code/models/scnet_unofficial/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..64043812fbd3e168d33b2cf6c775e97eee580a85 --- /dev/null +++ b/programs/music_separation_code/models/scnet_unofficial/utils.py @@ -0,0 +1,135 @@ +""" +SCNet - great paper, great implementation +https://arxiv.org/pdf/2401.13276.pdf +https://github.com/amanteur/SCNet-PyTorch +""" + +from typing import List, Tuple, Union + +import torch + + +def create_intervals( + splits: List[Union[float, int]] +) -> List[Union[Tuple[float, float], Tuple[int, int]]]: + """ + Create intervals based on splits provided. + + Args: + - splits (List[Union[float, int]]): List of floats or integers representing splits. + + Returns: + - List[Union[Tuple[float, float], Tuple[int, int]]]: List of tuples representing intervals. + """ + start = 0 + return [(start, start := start + split) for split in splits] + + +def get_conv_output_shape( + input_shape: int, + kernel_size: int = 1, + padding: int = 0, + dilation: int = 1, + stride: int = 1, +) -> int: + """ + Compute the output shape of a convolutional layer. + + Args: + - input_shape (int): Input shape. + - kernel_size (int, optional): Kernel size of the convolution. Default is 1. + - padding (int, optional): Padding size. Default is 0. + - dilation (int, optional): Dilation factor. Default is 1. + - stride (int, optional): Stride value. Default is 1. + + Returns: + - int: Output shape. + """ + return int( + (input_shape + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1 + ) + + +def get_convtranspose_output_padding( + input_shape: int, + output_shape: int, + kernel_size: int = 1, + padding: int = 0, + dilation: int = 1, + stride: int = 1, +) -> int: + """ + Compute the output padding for a convolution transpose operation. + + Args: + - input_shape (int): Input shape. + - output_shape (int): Desired output shape. + - kernel_size (int, optional): Kernel size of the convolution. Default is 1. + - padding (int, optional): Padding size. Default is 0. + - dilation (int, optional): Dilation factor. Default is 1. + - stride (int, optional): Stride value. Default is 1. + + Returns: + - int: Output padding. 
+ """ + return ( + output_shape + - (input_shape - 1) * stride + + 2 * padding + - dilation * (kernel_size - 1) + - 1 + ) + + +def compute_sd_layer_shapes( + input_shape: int, + bandsplit_ratios: List[float], + downsample_strides: List[int], + n_layers: int, +) -> Tuple[List[List[int]], List[List[Tuple[int, int]]]]: + """ + Compute the shapes for the subband layers. + + Args: + - input_shape (int): Input shape. + - bandsplit_ratios (List[float]): Ratios for splitting the frequency bands. + - downsample_strides (List[int]): Strides for downsampling in each layer. + - n_layers (int): Number of layers. + + Returns: + - Tuple[List[List[int]], List[List[Tuple[int, int]]]]: Tuple containing subband shapes and convolution shapes. + """ + bandsplit_shapes_list = [] + conv2d_shapes_list = [] + for _ in range(n_layers): + bandsplit_intervals = create_intervals(bandsplit_ratios) + bandsplit_shapes = [ + int(right * input_shape) - int(left * input_shape) + for left, right in bandsplit_intervals + ] + conv2d_shapes = [ + get_conv_output_shape(bs, stride=ds) + for bs, ds in zip(bandsplit_shapes, downsample_strides) + ] + input_shape = sum(conv2d_shapes) + bandsplit_shapes_list.append(bandsplit_shapes) + conv2d_shapes_list.append(create_intervals(conv2d_shapes)) + + return bandsplit_shapes_list, conv2d_shapes_list + + +def compute_gcr(subband_shapes: List[List[int]]) -> float: + """ + Compute the global compression ratio. + + Args: + - subband_shapes (List[List[int]]): List of subband shapes. + + Returns: + - float: Global compression ratio. + """ + t = torch.Tensor(subband_shapes) + gcr = torch.stack( + [(1 - t[i + 1] / t[i]).mean() for i in range(0, len(t) - 1)] + ).mean() + return float(gcr) diff --git a/programs/music_separation_code/models/segm_models.py b/programs/music_separation_code/models/segm_models.py new file mode 100644 index 0000000000000000000000000000000000000000..537d94afdff0f8247acf01a22613828750412603 --- /dev/null +++ b/programs/music_separation_code/models/segm_models.py @@ -0,0 +1,255 @@ +import torch +import torch.nn as nn +import segmentation_models_pytorch as smp + + +class STFT: + def __init__(self, config): + self.n_fft = config.n_fft + self.hop_length = config.hop_length + self.window = torch.hann_window(window_length=self.n_fft, periodic=True) + self.dim_f = config.dim_f + + def __call__(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-2] + c, t = x.shape[-2:] + x = x.reshape([-1, t]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + window=window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape( + [*batch_dims, c * 2, -1, x.shape[-1]] + ) + return x[..., : self.dim_f, :] + + def inverse(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-3] + c, f, t = x.shape[-3:] + n = self.n_fft // 2 + 1 + f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) + x = torch.cat([x, f_pad], -2) + x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) + x = x.permute([0, 2, 3, 1]) + x = x[..., 0] + x[..., 1] * 1.0j + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True + ) + x = x.reshape([*batch_dims, 2, -1]) + return x + + +def get_act(act_type): + if act_type == "gelu": + return nn.GELU() + elif act_type == "relu": + return nn.ReLU() + elif act_type[:3] == "elu": + alpha = float(act_type.replace("elu", "")) + return nn.ELU(alpha) + else: + raise 
Exception + + +def get_decoder(config, c): + decoder = None + decoder_options = dict() + if config.model.decoder_type == "unet": + try: + decoder_options = dict(config.decoder_unet) + except: + pass + decoder = smp.Unet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "fpn": + try: + decoder_options = dict(config.decoder_fpn) + except: + pass + decoder = smp.FPN( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "unet++": + try: + decoder_options = dict(config.decoder_unet_plus_plus) + except: + pass + decoder = smp.UnetPlusPlus( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "manet": + try: + decoder_options = dict(config.decoder_manet) + except: + pass + decoder = smp.MAnet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "linknet": + try: + decoder_options = dict(config.decoder_linknet) + except: + pass + decoder = smp.Linknet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "pspnet": + try: + decoder_options = dict(config.decoder_pspnet) + except: + pass + decoder = smp.PSPNet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "pspnet": + try: + decoder_options = dict(config.decoder_pspnet) + except: + pass + decoder = smp.PSPNet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "pan": + try: + decoder_options = dict(config.decoder_pan) + except: + pass + decoder = smp.PAN( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "deeplabv3": + try: + decoder_options = dict(config.decoder_deeplabv3) + except: + pass + decoder = smp.DeepLabV3( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "deeplabv3plus": + try: + decoder_options = dict(config.decoder_deeplabv3plus) + except: + pass + decoder = smp.DeepLabV3Plus( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + return decoder + + +class Segm_Models_Net(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + act = get_act(act_type=config.model.act) + + self.num_target_instruments = ( + 1 if config.training.target_instrument else len(config.training.instruments) + ) + self.num_subbands = config.model.num_subbands + + dim_c = self.num_subbands * config.audio.num_channels * 2 + c = config.model.num_channels + f = config.audio.dim_f // self.num_subbands + + self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) + + self.unet_model = get_decoder(config, c) + + self.final_conv = nn.Sequential( + nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), + act, + nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False), + ) 
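The STFT helper at the top of this file stores complex spectra as extra channels ("complex as channels"): a stereo (B, 2, T) waveform becomes a real (B, 4, dim_f, frames) tensor with one real and one imaginary plane per audio channel. A shape check reproducing that packing with hypothetical config values:

```python
import torch

n_fft, hop_length, dim_f = 2048, 512, 1024   # hypothetical config.audio values
x = torch.randn(3, 2, 16384)                 # (B, C, T)
spec = torch.stft(
    x.reshape(-1, 16384), n_fft=n_fft, hop_length=hop_length,
    window=torch.hann_window(n_fft), center=True, return_complex=True,
)
spec = torch.view_as_real(spec).permute(0, 3, 1, 2)   # (B*C, 2, F, frames)
spec = spec.reshape(3, 2, 2, n_fft // 2 + 1, -1).reshape(3, 4, n_fft // 2 + 1, -1)
print(spec[..., :dim_f, :].shape)            # torch.Size([3, 4, 1024, 33])
```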
+ + self.stft = STFT(config.audio) + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, x): + + x = self.stft(x) + + mix = x = self.cac2cws(x) + + first_conv_out = x = self.first_conv(x) + + x = x.transpose(-1, -2) + + x = self.unet_model(x) + + x = x.transpose(-1, -2) + + x = x * first_conv_out # reduce artifacts + + x = self.final_conv(torch.cat([mix, x], 1)) + + x = self.cws2cac(x) + + if self.num_target_instruments > 1: + b, c, f, t = x.shape + x = x.reshape(b, self.num_target_instruments, -1, f, t) + + x = self.stft.inverse(x) + return x diff --git a/programs/music_separation_code/models/torchseg_models.py b/programs/music_separation_code/models/torchseg_models.py new file mode 100644 index 0000000000000000000000000000000000000000..92fec692666dd4caa91d999e96288d7dbbc85946 --- /dev/null +++ b/programs/music_separation_code/models/torchseg_models.py @@ -0,0 +1,255 @@ +import torch +import torch.nn as nn +import torchseg as smp + + +class STFT: + def __init__(self, config): + self.n_fft = config.n_fft + self.hop_length = config.hop_length + self.window = torch.hann_window(window_length=self.n_fft, periodic=True) + self.dim_f = config.dim_f + + def __call__(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-2] + c, t = x.shape[-2:] + x = x.reshape([-1, t]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + window=window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape( + [*batch_dims, c * 2, -1, x.shape[-1]] + ) + return x[..., : self.dim_f, :] + + def inverse(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-3] + c, f, t = x.shape[-3:] + n = self.n_fft // 2 + 1 + f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) + x = torch.cat([x, f_pad], -2) + x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) + x = x.permute([0, 2, 3, 1]) + x = x[..., 0] + x[..., 1] * 1.0j + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True + ) + x = x.reshape([*batch_dims, 2, -1]) + return x + + +def get_act(act_type): + if act_type == "gelu": + return nn.GELU() + elif act_type == "relu": + return nn.ReLU() + elif act_type[:3] == "elu": + alpha = float(act_type.replace("elu", "")) + return nn.ELU(alpha) + else: + raise Exception + + +def get_decoder(config, c): + decoder = None + decoder_options = dict() + if config.model.decoder_type == "unet": + try: + decoder_options = dict(config.decoder_unet) + except: + pass + decoder = smp.Unet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "fpn": + try: + decoder_options = dict(config.decoder_fpn) + except: + pass + decoder = smp.FPN( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "unet++": + try: + decoder_options = dict(config.decoder_unet_plus_plus) + except: + pass + decoder = smp.UnetPlusPlus( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + 
) + elif config.model.decoder_type == "manet": + try: + decoder_options = dict(config.decoder_manet) + except: + pass + decoder = smp.MAnet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "linknet": + try: + decoder_options = dict(config.decoder_linknet) + except: + pass + decoder = smp.Linknet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "pspnet": + try: + decoder_options = dict(config.decoder_pspnet) + except: + pass + decoder = smp.PSPNet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "pspnet": + try: + decoder_options = dict(config.decoder_pspnet) + except: + pass + decoder = smp.PSPNet( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "pan": + try: + decoder_options = dict(config.decoder_pan) + except: + pass + decoder = smp.PAN( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "deeplabv3": + try: + decoder_options = dict(config.decoder_deeplabv3) + except: + pass + decoder = smp.DeepLabV3( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + elif config.model.decoder_type == "deeplabv3plus": + try: + decoder_options = dict(config.decoder_deeplabv3plus) + except: + pass + decoder = smp.DeepLabV3Plus( + encoder_name=config.model.encoder_name, + encoder_weights="imagenet", + in_channels=c, + classes=c, + **decoder_options, + ) + return decoder + + +class Torchseg_Net(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + act = get_act(act_type=config.model.act) + + self.num_target_instruments = ( + 1 if config.training.target_instrument else len(config.training.instruments) + ) + self.num_subbands = config.model.num_subbands + + dim_c = self.num_subbands * config.audio.num_channels * 2 + c = config.model.num_channels + f = config.audio.dim_f // self.num_subbands + + self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) + + self.unet_model = get_decoder(config, c) + + self.final_conv = nn.Sequential( + nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), + act, + nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False), + ) + + self.stft = STFT(config.audio) + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, x): + + x = self.stft(x) + + mix = x = self.cac2cws(x) + + first_conv_out = x = self.first_conv(x) + + x = x.transpose(-1, -2) + + x = self.unet_model(x) + + x = x.transpose(-1, -2) + + x = x * first_conv_out # reduce artifacts + + x = self.final_conv(torch.cat([mix, x], 1)) + + x = self.cws2cac(x) + + if self.num_target_instruments > 1: + b, c, f, t = x.shape + x = x.reshape(b, self.num_target_instruments, -1, f, t) + + x = self.stft.inverse(x) + return x diff --git 
a/programs/music_separation_code/models/upernet_swin_transformers.py b/programs/music_separation_code/models/upernet_swin_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..27f32f41bb37c1b2ed9db332bb2ddaf7b8ef43df --- /dev/null +++ b/programs/music_separation_code/models/upernet_swin_transformers.py @@ -0,0 +1,250 @@ +from functools import partial +import torch +import torch.nn as nn +from transformers import UperNetForSemanticSegmentation + + +class STFT: + def __init__(self, config): + self.n_fft = config.n_fft + self.hop_length = config.hop_length + self.window = torch.hann_window(window_length=self.n_fft, periodic=True) + self.dim_f = config.dim_f + + def __call__(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-2] + c, t = x.shape[-2:] + x = x.reshape([-1, t]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop_length, + window=window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([*batch_dims, c, 2, -1, x.shape[-1]]).reshape( + [*batch_dims, c * 2, -1, x.shape[-1]] + ) + return x[..., : self.dim_f, :] + + def inverse(self, x): + window = self.window.to(x.device) + batch_dims = x.shape[:-3] + c, f, t = x.shape[-3:] + n = self.n_fft // 2 + 1 + f_pad = torch.zeros([*batch_dims, c, n - f, t]).to(x.device) + x = torch.cat([x, f_pad], -2) + x = x.reshape([*batch_dims, c // 2, 2, n, t]).reshape([-1, 2, n, t]) + x = x.permute([0, 2, 3, 1]) + x = x[..., 0] + x[..., 1] * 1.0j + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop_length, window=window, center=True + ) + x = x.reshape([*batch_dims, 2, -1]) + return x + + +def get_norm(norm_type): + def norm(c, norm_type): + if norm_type == "BatchNorm": + return nn.BatchNorm2d(c) + elif norm_type == "InstanceNorm": + return nn.InstanceNorm2d(c, affine=True) + elif "GroupNorm" in norm_type: + g = int(norm_type.replace("GroupNorm", "")) + return nn.GroupNorm(num_groups=g, num_channels=c) + else: + return nn.Identity() + + return partial(norm, norm_type=norm_type) + + +def get_act(act_type): + if act_type == "gelu": + return nn.GELU() + elif act_type == "relu": + return nn.ReLU() + elif act_type[:3] == "elu": + alpha = float(act_type.replace("elu", "")) + return nn.ELU(alpha) + else: + raise Exception + + +class Upscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.ConvTranspose2d( + in_channels=in_c, + out_channels=out_c, + kernel_size=scale, + stride=scale, + bias=False, + ), + ) + + def forward(self, x): + return self.conv(x) + + +class Downscale(nn.Module): + def __init__(self, in_c, out_c, scale, norm, act): + super().__init__() + self.conv = nn.Sequential( + norm(in_c), + act, + nn.Conv2d( + in_channels=in_c, + out_channels=out_c, + kernel_size=scale, + stride=scale, + bias=False, + ), + ) + + def forward(self, x): + return self.conv(x) + + +class TFC_TDF(nn.Module): + def __init__(self, in_c, c, l, f, bn, norm, act): + super().__init__() + + self.blocks = nn.ModuleList() + for i in range(l): + block = nn.Module() + + block.tfc1 = nn.Sequential( + norm(in_c), + act, + nn.Conv2d(in_c, c, 3, 1, 1, bias=False), + ) + block.tdf = nn.Sequential( + norm(c), + act, + nn.Linear(f, f // bn, bias=False), + norm(c), + act, + nn.Linear(f // bn, f, bias=False), + ) + block.tfc2 = nn.Sequential( + norm(c), + act, + nn.Conv2d(c, c, 3, 1, 1, bias=False), + ) + block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False) + + 
self.blocks.append(block) + in_c = c + + def forward(self, x): + for block in self.blocks: + s = block.shortcut(x) + x = block.tfc1(x) + x = x + block.tdf(x) + x = block.tfc2(x) + x = x + s + return x + + +class Swin_UperNet_Model(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + act = get_act(act_type=config.model.act) + + self.num_target_instruments = ( + 1 if config.training.target_instrument else len(config.training.instruments) + ) + self.num_subbands = config.model.num_subbands + + dim_c = self.num_subbands * config.audio.num_channels * 2 + c = config.model.num_channels + f = config.audio.dim_f // self.num_subbands + + self.first_conv = nn.Conv2d(dim_c, c, 1, 1, 0, bias=False) + + self.swin_upernet_model = UperNetForSemanticSegmentation.from_pretrained( + "openmmlab/upernet-swin-large" + ) + + self.swin_upernet_model.auxiliary_head.classifier = nn.Conv2d( + 256, c, kernel_size=(1, 1), stride=(1, 1) + ) + self.swin_upernet_model.decode_head.classifier = nn.Conv2d( + 512, c, kernel_size=(1, 1), stride=(1, 1) + ) + self.swin_upernet_model.backbone.embeddings.patch_embeddings.projection = ( + nn.Conv2d(c, 192, kernel_size=(4, 4), stride=(4, 4)) + ) + + self.final_conv = nn.Sequential( + nn.Conv2d(c + dim_c, c, 1, 1, 0, bias=False), + act, + nn.Conv2d(c, self.num_target_instruments * dim_c, 1, 1, 0, bias=False), + ) + + self.stft = STFT(config.audio) + + def cac2cws(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c, k, f // k, t) + x = x.reshape(b, c * k, f // k, t) + return x + + def cws2cac(self, x): + k = self.num_subbands + b, c, f, t = x.shape + x = x.reshape(b, c // k, k, f, t) + x = x.reshape(b, c // k, f * k, t) + return x + + def forward(self, x): + + x = self.stft(x) + + mix = x = self.cac2cws(x) + + first_conv_out = x = self.first_conv(x) + + x = x.transpose(-1, -2) + + x = self.swin_upernet_model(x).logits + + x = x.transpose(-1, -2) + + x = x * first_conv_out # reduce artifacts + + x = self.final_conv(torch.cat([mix, x], 1)) + + x = self.cws2cac(x) + + if self.num_target_instruments > 1: + b, c, f, t = x.shape + x = x.reshape(b, self.num_target_instruments, -1, f, t) + + x = self.stft.inverse(x) + return x + + +if __name__ == "__main__": + model = UperNetForSemanticSegmentation.from_pretrained( + "./results/", ignore_mismatched_sizes=True + ) + print(model) + print(model.auxiliary_head.classifier) + print(model.decode_head.classifier) + + x = torch.zeros((2, 16, 512, 512), dtype=torch.float32) + res = model(x) + print(res.logits.shape) + model.save_pretrained("./results/") diff --git a/programs/music_separation_code/utils.py b/programs/music_separation_code/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f07c465cc0c0a0b80170189443493b06eb0ccc64 --- /dev/null +++ b/programs/music_separation_code/utils.py @@ -0,0 +1,258 @@ +# coding: utf-8 +__author__ = "Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/" + +import time +import numpy as np +import torch +import torch.nn as nn +import yaml +from ml_collections import ConfigDict +from omegaconf import OmegaConf +from tqdm import tqdm +from numpy.typing import NDArray +from typing import Dict + + +def get_model_from_config(model_type, config_path): + with open(config_path) as f: + if model_type == "htdemucs": + config = OmegaConf.load(config_path) + else: + config = ConfigDict(yaml.load(f, Loader=yaml.FullLoader)) + + if model_type == "mdx23c": + from models.mdx23c_tfc_tdf_v3 import TFC_TDF_net + + model = TFC_TDF_net(config) + 
elif model_type == "htdemucs": + from models.demucs4ht import get_model + + model = get_model(config) + elif model_type == "segm_models": + from models.segm_models import Segm_Models_Net + + model = Segm_Models_Net(config) + elif model_type == "torchseg": + from models.torchseg_models import Torchseg_Net + + model = Torchseg_Net(config) + elif model_type == "mel_band_roformer": + from models.bs_roformer import MelBandRoformer + + model = MelBandRoformer(**dict(config.model)) + elif model_type == "bs_roformer": + from models.bs_roformer import BSRoformer + + model = BSRoformer(**dict(config.model)) + elif model_type == "swin_upernet": + from models.upernet_swin_transformers import Swin_UperNet_Model + + model = Swin_UperNet_Model(config) + elif model_type == "bandit": + from models.bandit.core.model import MultiMaskMultiSourceBandSplitRNNSimple + + model = MultiMaskMultiSourceBandSplitRNNSimple(**config.model) + elif model_type == "bandit_v2": + from models.bandit_v2.bandit import Bandit + + model = Bandit(**config.kwargs) + elif model_type == "scnet_unofficial": + from models.scnet_unofficial import SCNet + + model = SCNet(**config.model) + elif model_type == "scnet": + from models.scnet import SCNet + + model = SCNet(**config.model) + else: + print("Unknown model: {}".format(model_type)) + model = None + + return model, config + + +def _getWindowingArray(window_size, fade_size): + fadein = torch.linspace(0, 1, fade_size) + fadeout = torch.linspace(1, 0, fade_size) + window = torch.ones(window_size) + window[-fade_size:] *= fadeout + window[:fade_size] *= fadein + return window + + +def demix_track(config, model, mix, device, pbar=False): + C = config.audio.chunk_size + N = config.inference.num_overlap + fade_size = C // 10 + step = int(C // N) + border = C - step + batch_size = config.inference.batch_size + + length_init = mix.shape[-1] + + # Do pad from the beginning and end to account floating window results better + if length_init > 2 * border and (border > 0): + mix = nn.functional.pad(mix, (border, border), mode="reflect") + + # windowingArray crossfades at segment boundaries to mitigate clicking artifacts + windowingArray = _getWindowingArray(C, fade_size) + + with torch.cuda.amp.autocast(enabled=config.training.use_amp): + use_amp = getattr(config.training, "use_amp", False) + with torch.inference_mode(): + if config.training.target_instrument is not None: + req_shape = (1,) + tuple(mix.shape) + else: + req_shape = (len(config.training.instruments),) + tuple(mix.shape) + + result = torch.zeros(req_shape, dtype=torch.float32) + counter = torch.zeros(req_shape, dtype=torch.float32) + i = 0 + batch_data = [] + batch_locations = [] + progress_bar = ( + tqdm(total=mix.shape[1], desc="Processing audio chunks", leave=False) + if pbar + else None + ) + + while i < mix.shape[1]: + # print(i, i + C, mix.shape[1]) + part = mix[:, i : i + C].to(device) + length = part.shape[-1] + if length < C: + if length > C // 2 + 1: + part = nn.functional.pad( + input=part, pad=(0, C - length), mode="reflect" + ) + else: + part = nn.functional.pad( + input=part, + pad=(0, C - length, 0, 0), + mode="constant", + value=0, + ) + batch_data.append(part) + batch_locations.append((i, length)) + i += step + + if len(batch_data) >= batch_size or (i >= mix.shape[1]): + arr = torch.stack(batch_data, dim=0) + x = model(arr) + + window = windowingArray + if i - step == 0: # First audio chunk, no fadein + window[:fade_size] = 1 + elif i >= mix.shape[1]: # Last audio chunk, no fadeout + window[-fade_size:] = 1 + + for j 
in range(len(batch_locations)): + start, l = batch_locations[j] + result[..., start : start + l] += ( + x[j][..., :l].cpu() * window[..., :l] + ) + counter[..., start : start + l] += window[..., :l] + + batch_data = [] + batch_locations = [] + + if progress_bar: + progress_bar.update(step) + + if progress_bar: + progress_bar.close() + + estimated_sources = result / counter + estimated_sources = estimated_sources.cpu().numpy() + np.nan_to_num(estimated_sources, copy=False, nan=0.0) + + if length_init > 2 * border and (border > 0): + # Remove pad + estimated_sources = estimated_sources[..., border:-border] + + if config.training.target_instrument is None: + return {k: v for k, v in zip(config.training.instruments, estimated_sources)} + else: + return { + k: v for k, v in zip([config.training.target_instrument], estimated_sources) + } + + +def demix_track_demucs(config, model, mix, device, pbar=False): + S = len(config.training.instruments) + C = config.training.samplerate * config.training.segment + N = config.inference.num_overlap + batch_size = config.inference.batch_size + step = C // N + # print(S, C, N, step, mix.shape, mix.device) + + with torch.cuda.amp.autocast(enabled=config.training.use_amp): + with torch.inference_mode(): + req_shape = (S,) + tuple(mix.shape) + result = torch.zeros(req_shape, dtype=torch.float32) + counter = torch.zeros(req_shape, dtype=torch.float32) + i = 0 + batch_data = [] + batch_locations = [] + progress_bar = ( + tqdm(total=mix.shape[1], desc="Processing audio chunks", leave=False) + if pbar + else None + ) + + while i < mix.shape[1]: + # print(i, i + C, mix.shape[1]) + part = mix[:, i : i + C].to(device) + length = part.shape[-1] + if length < C: + part = nn.functional.pad( + input=part, pad=(0, C - length, 0, 0), mode="constant", value=0 + ) + batch_data.append(part) + batch_locations.append((i, length)) + i += step + + if len(batch_data) >= batch_size or (i >= mix.shape[1]): + arr = torch.stack(batch_data, dim=0) + x = model(arr) + for j in range(len(batch_locations)): + start, l = batch_locations[j] + result[..., start : start + l] += x[j][..., :l].cpu() + counter[..., start : start + l] += 1.0 + batch_data = [] + batch_locations = [] + + if progress_bar: + progress_bar.update(step) + + if progress_bar: + progress_bar.close() + + estimated_sources = result / counter + estimated_sources = estimated_sources.cpu().numpy() + np.nan_to_num(estimated_sources, copy=False, nan=0.0) + + if S > 1: + return {k: v for k, v in zip(config.training.instruments, estimated_sources)} + else: + return estimated_sources + + +def sdr(references, estimates): + # compute SDR for one song + delta = 1e-7 # avoid numerical errors + num = np.sum(np.square(references), axis=(1, 2)) + den = np.sum(np.square(references - estimates), axis=(1, 2)) + num += delta + den += delta + return 10 * np.log10(num / den) + + +def demix( + config, model, mix: NDArray, device, pbar=False, model_type: str = None +) -> Dict[str, NDArray]: + mix = torch.tensor(mix, dtype=torch.float32) + if model_type == "htdemucs": + return demix_track_demucs(config, model, mix, device, pbar=pbar) + else: + return demix_track(config, model, mix, device, pbar=pbar) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..54918514387ea835bdc0bace1fc4e9fbc63b134c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,217 @@ +absl-py==2.1.0 +accelerate==0.34.0 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.0 +aiohttp==3.10.5 +aiosignal==1.3.1 +altair==5.4.1 
+annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.4.0 +asteroid==0.7.0 +asteroid-filterbanks==0.4.0 +async-timeout==4.0.3 +attrs==24.2.0 +audio-separator==0.18.0 +audio_upscaler==0.1.4 +audiomentations==0.24.0 +audioread==3.0.1 +auraloss==0.4.0 +babel==2.16.0 +beartype==0.18.5 +beautifulsoup4==4.12.3 +bibtexparser==2.0.0b7 +bitsandbytes==0.43.3 +blinker==1.8.2 +cached-property==1.5.2 +certifi==2023.7.22 +cffi==1.17.0 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +clldutils==3.22.2 +cloudpickle==3.0.0 +colorama==0.4.6 +coloredlogs==15.0.1 +colorlog==6.8.2 +contextlib2==21.6.0 +contourpy==1.3.0 +csvw==3.3.0 +cycler==0.12.1 +Cython==3.0.11 +decorator==5.1.1 +demucs==4.0.0 +diffq==0.2.4 +dlinfo==1.2.1 +docker-pycreds==0.4.0 +dora_search==0.1.12 +edge-tts==6.1.9 +efficientnet_pytorch==0.7.1 +einops==0.8.0 +exceptiongroup==1.2.2 +faiss-cpu==1.7.3 +fastapi==0.112.2 +ffmpeg-python==0.2.0 +ffmpy==0.3.1 +filelock==3.15.4 +Flask==3.0.3 +flatbuffers==24.3.25 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +ftfy==6.2.3 +future==1.0.0 +gitdb==4.0.11 +GitPython==3.1.43 +gradio==4.36.0 +gradio_client==1.0.1 +grpcio==1.66.1 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.2 +huggingface-hub==0.24.6 +humanfriendly==10.0 +idna==3.8 +importlib_metadata==8.4.0 +importlib_resources==6.4.4 +isodate==0.6.1 +itsdangerous==2.2.0 +Jinja2==3.1.4 +joblib==1.4.2 +jsonschema==4.23.0 +jsonschema-specifications==2023.12.1 +julius==0.2.7 +kiwisolver==1.4.5 +lameenc==1.7.0 +language-tags==1.2.0 +lazy_loader==0.4 +libf0==1.0.2 +librosa==0.10.2.post1 +lightning-utilities==0.11.7 +llvmlite==0.40.1 +local-attention==1.9.14 +lxml==5.3.0 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matplotlib==3.7.2 +mdurl==0.1.2 +mir_eval==0.7 +ml_collections==0.1.1 +mpmath==1.3.0 +msgpack==1.0.8 +multidict==6.0.5 +munch==4.0.0 +narwhals==1.6.0 +networkx==3.2.1 +noisereduce==3.0.2 +numba==0.57.0 +numpy==1.23.5 +omegaconf==2.2.3 +onnx==1.16.2 +onnx2torch==1.5.15 +onnxruntime-gpu==1.19.0 +openunmix==1.3.0 +orjson==3.10.7 +packaging==24.1 +pandas==2.2.2 +pb-bss-eval==0.0.2 +pedalboard==0.8.9 +pesq==0.0.4 +phonemizer==3.3.0 +pillow==10.4.0 +platformdirs==4.2.2 +pooch==1.8.2 +praat-parselmouth==0.4.4 +pretrainedmodels==0.7.4 +primePy==1.3 +prodigyopt==1.0 +progressbar==2.5 +protobuf==3.20.3 +psutil==6.0.0 +pycparser==2.22 +pydantic==2.8.2 +pydantic_core==2.20.1 +pydub==0.25.1 +Pygments==2.18.0 +pylatexenc==2.10 +pyparsing==3.0.9 +pypresence==4.3.0 +pyreadline3==3.4.1 +pystoi==0.4.1 +python-dateutil==2.9.0.post0 +python-multipart==0.0.9 +pytorch-lightning==2.4.0 +pytorch-ranger==0.1.1 +pytz==2024.1 +pyworld==0.3.4 +PyYAML==6.0.2 +rdflib==7.0.0 +referencing==0.35.1 +regex==2024.7.24 +requests==2.32.0 +resampy==0.4.3 +retrying==1.3.4 +rfc3986==1.5.0 +rich==13.8.0 +rotary-embedding-torch==0.6.5 +rpds-py==0.20.0 +ruff==0.6.3 +safetensors==0.4.4 +samplerate==0.1.0 +scikit-learn==1.5.1 +scipy==1.13.1 +segmentation-models-pytorch==0.3.3 +segments==2.2.1 +semantic-version==2.10.0 +sentry-sdk==2.13.0 +setproctitle==1.3.3 +shellingham==1.5.4 +six==1.16.0 +smmap==5.0.1 +sniffio==1.3.1 +soundfile==0.12.1 +soupsieve==2.6 +soxr==0.5.0.post1 +spafe==0.3.2 +starlette==0.38.4 +submitit==1.5.1 +sympy==1.13.2 +tabulate==0.9.0 +tensorboard==2.17.1 +tensorboard-data-server==0.7.2 +tensorboardX==2.6.2.2 +threadpoolctl==3.5.0 +timm==0.9.2 +tokenizers==0.15.2 +tomlkit==0.12.0 +torch==2.1.1 +torch-audiomentations==0.11.1 +torch-optimizer==0.1.0 +torch-pitch-shift==1.2.4 +torch-stoi==0.2.1 +torchaudio==2.1.1 +torchcrepe==0.0.23 
+torchfcpe==0.0.4 +torchlibrosa==0.1.0 +torchmetrics==0.11.4 +torchseg==0.0.1a1 +torchvision==0.16.1 +tqdm==4.66.5 +transformers==4.35.2 +treetable==0.2.5 +typer==0.12.5 +typing_extensions==4.12.2 +tzdata==2024.1 +Unidecode==1.3.8 +uritemplate==4.1.1 +urllib3==2.2.2 +uvicorn==0.30.6 +wandb==0.17.8 +wcwidth==0.2.13 +websockets==11.0.3 +Werkzeug==3.0.4 +wget==3.2 +yarl==1.9.7 +zipp==3.20.1 +yt-dlp \ No newline at end of file diff --git a/run.bat b/run.bat new file mode 100644 index 0000000000000000000000000000000000000000..b4dbb4a07c7c99fb90d173a8a34c49b270d20075 --- /dev/null +++ b/run.bat @@ -0,0 +1,66 @@ +@echo off +setlocal +title RVC AI Cover Maker +set "principal=%cd%" +set "CONDA_ROOT_PREFIX=%UserProfile%\Miniconda3" +set "INSTALL_ENV_DIR=%principal%\env" +set "MINICONDA_DOWNLOAD_URL=https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Windows-x86_64.exe" +set "CONDA_EXECUTABLE=%CONDA_ROOT_PREFIX%\Scripts\conda.exe" +if not exist env ( + if not exist "%CONDA_EXECUTABLE%" ( + echo Miniconda not found. Starting download and installation... + echo Downloading Miniconda... + powershell -Command "& {Invoke-WebRequest -Uri '%MINICONDA_DOWNLOAD_URL%' -OutFile 'miniconda.exe'}" + if not exist "miniconda.exe" ( + echo Download failed. Please check your internet connection and try again. + goto :error + ) + + echo Installing Miniconda... + start /wait "" miniconda.exe /InstallationType=JustMe /RegisterPython=0 /S /D=%CONDA_ROOT_PREFIX% + if errorlevel 1 ( + echo Miniconda installation failed. + goto :error + ) + del miniconda.exe + echo Miniconda installation complete. + ) else ( + echo Miniconda already installed. Skipping installation. + ) + echo. + + echo Creating Conda environment... + call "%CONDA_ROOT_PREFIX%\_conda.exe" create --no-shortcuts -y -k --prefix "%INSTALL_ENV_DIR%" python=3.9 + if errorlevel 1 goto :error + echo Conda environment created successfully. + echo. + + if exist "%INSTALL_ENV_DIR%\python.exe" ( + echo Installing specific pip version... + "%INSTALL_ENV_DIR%\python.exe" -m pip install --no-warn-script-location "pip<24.1" + if errorlevel 1 goto :error + echo Pip installation complete. + echo. + ) + + echo Installing dependencies... + "%INSTALL_ENV_DIR%\python.exe" -m pip install --no-warn-script-location --no-deps -r requirements.txt + "%INSTALL_ENV_DIR%\python.exe" -m pip uninstall torch torchvision torchaudio -y + "%INSTALL_ENV_DIR%\python.exe" -m pip install --no-warn-script-location torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121 + if errorlevel 1 goto :error + cls + echo Dependencies installed successfully. + echo. +) + +env\python programs\applio_code\rvc\lib\tools\prerequisites_download.py + +env\python main.py --open +if errorlevel 1 goto :error + +goto :eof + +:error +echo An error occurred. Exiting... +pause +exit /b 1 diff --git a/run.sh b/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..e3ee301e7fa4cb71693d822f59efc79d99b476ee --- /dev/null +++ b/run.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +set -e + +title="hexGen-RVC" +echo $title + +if [ ! -d "env" ]; then + principal=$(pwd) + CONDA_ROOT_PREFIX="$HOME/miniconda3" + INSTALL_ENV_DIR="$principal/env" + MINICONDA_DOWNLOAD_URL="https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Linux-x86_64.sh" + CONDA_EXECUTABLE="$CONDA_ROOT_PREFIX/bin/conda" + + if [ ! -f "$CONDA_EXECUTABLE" ]; then + echo "Miniconda not found. Starting download and installation..." + echo "Downloading Miniconda..." 
+        curl -o miniconda.sh $MINICONDA_DOWNLOAD_URL
+        if [ ! -f "miniconda.sh" ]; then
+            echo "Download failed. Please check your internet connection and try again."
+            exit 1
+        fi
+
+        echo "Installing Miniconda..."
+        bash miniconda.sh -b -p $CONDA_ROOT_PREFIX
+        if [ $? -ne 0 ]; then
+            echo "Miniconda installation failed."
+            exit 1
+        fi
+        rm miniconda.sh
+        echo "Miniconda installation complete."
+    else
+        echo "Miniconda already installed. Skipping installation."
+    fi
+    echo
+
+    echo "Creating Conda environment..."
+    $CONDA_EXECUTABLE create --no-shortcuts -y -k --prefix "$INSTALL_ENV_DIR" python=3.9
+    if [ $? -ne 0 ]; then
+        exit 1
+    fi
+    echo "Conda environment created successfully."
+    echo
+
+    if [ -f "$INSTALL_ENV_DIR/bin/python" ]; then
+        echo "Installing specific pip version..."
+        $INSTALL_ENV_DIR/bin/python -m pip install "pip<24.1"
+        if [ $? -ne 0 ]; then
+            exit 1
+        fi
+        echo "Pip installation complete."
+        echo
+    fi
+
+    echo "Installing dependencies..."
+    source "$CONDA_ROOT_PREFIX/etc/profile.d/conda.sh"
+    conda activate "$INSTALL_ENV_DIR" || exit 1
+    pip install --upgrade setuptools || exit 1
+    pip install --no-deps -r "$principal/requirements.txt" || exit 1
+    pip uninstall torch torchvision torchaudio -y
+    pip install torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1 --index-url https://download.pytorch.org/whl/cu121 || exit 1
+    conda deactivate
+    echo "Dependencies installation complete."
+    echo
+fi
+
+if [ ! -d "programs/applio_code/rvc/models" ]; then
+    python programs/applio_code/rvc/lib/tools/prerequisites_download.py
+    echo
+fi
+
+# Launch with the local environment; INSTALL_ENV_DIR is only defined on the first run.
+env/bin/python main.py --open
+echo
+read -p "Press any key to continue..." -n1 -s
+exit 0
+
+error() {
+    echo "An error occurred during installation. Please check the output above for details."
+    read -p "Press any key to continue..." -n1 -s
+    exit 1
+}
+trap error ERR
diff --git a/tabs/download_model.py b/tabs/download_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b858b550024cd466dd3affa8535380dc9414a5fc
--- /dev/null
+++ b/tabs/download_model.py
@@ -0,0 +1,95 @@
+import gradio as gr
+import shutil
+import os, sys
+import regex as re
+
+from core import download_model
+from programs.applio_code.rvc.lib.utils import format_title
+from assets.i18n.i18n import I18nAuto
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+i18n = I18nAuto()
+
+
+def save_drop_model(dropbox):
+    if "pth" not in dropbox and "index" not in dropbox:
+        raise gr.Error(
+            message="The file you dropped is not a valid model file. Please try again."
+ ) + else: + file_name = format_title(os.path.basename(dropbox)) + if ".pth" in dropbox: + model_name = format_title(file_name.split(".pth")[0]) + else: + if ( + "v2" not in dropbox + and "added_" not in dropbox + and "_nprobe_1_" not in dropbox + ): + model_name = format_title(file_name.split(".index")[0]) + else: + if "v2" not in dropbox: + if "_nprobe_1_" in file_name and "_v1" in file_name: + model_name = format_title( + file_name.split("_nprobe_1_")[1].split("_v1")[0] + ) + elif "added_" in file_name and "_v1" in file_name: + model_name = format_title( + file_name.split("added_")[1].split("_v1")[0] + ) + else: + if "_nprobe_1_" in file_name and "_v2" in file_name: + model_name = format_title( + file_name.split("_nprobe_1_")[1].split("_v2")[0] + ) + elif "added_" in file_name and "_v2" in file_name: + model_name = format_title( + file_name.split("added_")[1].split("_v2")[0] + ) + + model_name = re.sub(r"\d+[se]", "", model_name) + if "__" in model_name: + model_name = model_name.replace("__", "") + + model_path = os.path.join(now_dir, "logs", model_name) + if not os.path.exists(model_path): + os.makedirs(model_path) + if os.path.exists(os.path.join(model_path, file_name)): + os.remove(os.path.join(model_path, file_name)) + shutil.copy(dropbox, os.path.join(model_path, file_name)) + print(f"{file_name} saved in {model_path}") + gr.Info(f"{file_name} saved in {model_path}") + return None + + +def download_model_tab(): + with gr.Row(): + link = gr.Textbox( + label=i18n("Model URL"), + lines=1, + ) + output = gr.Textbox( + label=i18n("Output Information"), + info=i18n("The output information will be displayed here."), + ) + download = gr.Button(i18n("Download")) + + download.click( + download_model, + inputs=[link], + outputs=[output], + ) + gr.Markdown(value=i18n("## Drop files")) + dropbox = gr.File( + label=i18n( + "Drag your .pth file and .index file into this space. Drag one and then the other." 
+ ), + type="filepath", + ) + dropbox.upload( + fn=save_drop_model, + inputs=[dropbox], + outputs=[dropbox], + ) diff --git a/tabs/full_inference.py b/tabs/full_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..0e32560d547f642fa9a411d69901d9f38b6d8e53 --- /dev/null +++ b/tabs/full_inference.py @@ -0,0 +1,915 @@ +from core import full_inference_program +import sys, os +import gradio as gr +import regex as re +from assets.i18n.i18n import I18nAuto +import torch +import shutil +import unicodedata +from core import download_music + +i18n = I18nAuto() + +now_dir = os.getcwd() +sys.path.append(now_dir) + +model_root = os.path.join(now_dir, "logs") +audio_root = os.path.join(now_dir, "audio_files", "original_files") + +model_root_relative = os.path.relpath(model_root, now_dir) +audio_root_relative = os.path.relpath(audio_root, now_dir) + +sup_audioext = { + "wav", + "mp3", + "flac", + "ogg", + "opus", + "m4a", + "mp4", + "aac", + "alac", + "wma", + "aiff", + "webm", + "ac3", +} + +names = [ + os.path.join(root, file) + for root, _, files in os.walk(model_root_relative, topdown=False) + for file in files + if ( + file.endswith((".pth", ".onnx")) + and not (file.startswith("G_") or file.startswith("D_")) + ) +] + +indexes_list = [ + os.path.join(root, name) + for root, _, files in os.walk(model_root_relative, topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name +] + +audio_paths = [ + os.path.join(root, name) + for root, _, files in os.walk(audio_root_relative, topdown=False) + for name in files + if name.endswith(tuple(sup_audioext)) + and root == audio_root_relative + and "_output" not in name +] + +vocals_model_names = [ + "Mel-Roformer by KimberleyJSN", + "BS-Roformer by ViperX", + "MDX23C", +] + +karaoke_models_names = [ + "Mel-Roformer Karaoke by aufr33 and viperx", + "UVR-BVE", +] + +denoise_models_names = [ + "Mel-Roformer Denoise Normal by aufr33", + "Mel-Roformer Denoise Aggressive by aufr33", + "UVR Denoise", +] + +dereverb_models_names = [ + "MDX23C DeReverb by aufr33 and jarredou", + "UVR-Deecho-Dereverb", + "MDX Reverb HQ by FoxJoy", + "BS-Roformer Dereverb by anvuew", +] + +deeecho_models_names = ["UVR-Deecho-Normal", "UVR-Deecho-Aggressive"] + + +def get_indexes(): + indexes_list = [ + os.path.join(dirpath, filename) + for dirpath, _, filenames in os.walk(model_root_relative) + for filename in filenames + if filename.endswith(".index") and "trained" not in filename + ] + + return indexes_list if indexes_list else "" + + +def match_index(model_file_value): + if model_file_value: + model_folder = os.path.dirname(model_file_value) + model_name = os.path.basename(model_file_value) + index_files = get_indexes() + pattern = r"^(.*?)_" + match = re.match(pattern, model_name) + for index_file in index_files: + if os.path.dirname(index_file) == model_folder: + return index_file + elif match and match.group(1) in os.path.basename(index_file): + return index_file + elif model_name in os.path.basename(index_file): + return index_file + return "" + + +def output_path_fn(input_audio_path): + original_name_without_extension = os.path.basename(input_audio_path).rsplit(".", 1)[ + 0 + ] + new_name = original_name_without_extension + "_output.wav" + output_path = os.path.join(os.path.dirname(input_audio_path), new_name) + return output_path + + +def get_number_of_gpus(): + if torch.cuda.is_available(): + num_gpus = torch.cuda.device_count() + return "-".join(map(str, range(num_gpus))) + else: + return "-" + + +def max_vram_gpu(gpu): 
+ if torch.cuda.is_available(): + gpu_properties = torch.cuda.get_device_properties(gpu) + total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024) + return total_memory_gb / 2 + else: + return "0" + + +def format_title(title): + formatted_title = ( + unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8") + ) + formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title) + formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title) + formatted_title = re.sub(r"\s+", "_", formatted_title) + return formatted_title + + +def save_to_wav(upload_audio): + file_path = upload_audio + formated_name = format_title(os.path.basename(file_path)) + target_path = os.path.join(audio_root_relative, formated_name) + + if os.path.exists(target_path): + os.remove(target_path) + + os.makedirs(os.path.dirname(target_path), exist_ok=True) + shutil.copy(file_path, target_path) + return target_path, output_path_fn(target_path) + + +def delete_outputs(): + gr.Info(f"Outputs cleared!") + for root, _, files in os.walk(audio_root_relative, topdown=False): + for name in files: + if name.endswith(tuple(sup_audioext)) and name.__contains__("_output"): + os.remove(os.path.join(root, name)) + + +def change_choices(): + names = [ + os.path.join(root, file) + for root, _, files in os.walk(model_root_relative, topdown=False) + for file in files + if ( + file.endswith((".pth", ".onnx")) + and not (file.startswith("G_") or file.startswith("D_")) + ) + ] + + indexes_list = [ + os.path.join(root, name) + for root, _, files in os.walk(model_root_relative, topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + + audio_paths = [ + os.path.join(root, name) + for root, _, files in os.walk(audio_root_relative, topdown=False) + for name in files + if name.endswith(tuple(sup_audioext)) + and root == audio_root_relative + and "_output" not in name + ] + + return ( + {"choices": sorted(names), "__type__": "update"}, + {"choices": sorted(indexes_list), "__type__": "update"}, + {"choices": sorted(audio_paths), "__type__": "update"}, + ) + + +def full_inference_tab(): + default_weight = names[0] if names else None + with gr.Row(): + with gr.Row(): + model_file = gr.Dropdown( + label=i18n("Voice Model"), + info=i18n("Select the voice model to use for the conversion."), + choices=sorted(names, key=lambda path: os.path.getsize(path)), + interactive=True, + value=default_weight, + allow_custom_value=True, + ) + + index_file = gr.Dropdown( + label=i18n("Index File"), + info=i18n("Select the index file to use for the conversion."), + choices=get_indexes(), + value=match_index(default_weight) if default_weight else "", + interactive=True, + allow_custom_value=True, + ) + with gr.Row(): + refresh_button = gr.Button(i18n("Refresh")) + unload_button = gr.Button(i18n("Unload Voice")) + + unload_button.click( + fn=lambda: ( + {"value": "", "__type__": "update"}, + {"value": "", "__type__": "update"}, + ), + inputs=[], + outputs=[model_file, index_file], + ) + model_file.select( + fn=lambda model_file_value: match_index(model_file_value), + inputs=[model_file], + outputs=[index_file], + ) + + with gr.Tab(i18n("Single Inference")): + with gr.Column(): + upload_audio = gr.Audio( + label=i18n("Upload Audio"), + type="filepath", + editable=False, + sources="upload", + ) + with gr.Row(): + audio = gr.Dropdown( + label=i18n("Select Audio"), + info=i18n("Select the audio to convert."), + choices=sorted(audio_paths), + value=audio_paths[0] if audio_paths else "", + 
interactive=True, + allow_custom_value=True, + ) + with gr.Accordion(i18n("Advanced Settings"), open=False): + with gr.Accordion(i18n("RVC Settings"), open=False): + output_path = gr.Textbox( + label=i18n("Output Path"), + placeholder=i18n("Enter output path"), + info=i18n( + "The path where the output audio will be saved, by default in audio_files/rvc/output.wav" + ), + value=os.path.join(now_dir, "audio_files", "rvc"), + interactive=False, + visible=False, + ) + infer_backing_vocals = gr.Checkbox( + label=i18n("Infer Backing Vocals"), + info=i18n("Infer the bakcing vocals too."), + visible=True, + value=False, + interactive=True, + ) + with gr.Row(): + infer_backing_vocals_model = gr.Dropdown( + label=i18n("Backing Vocals Model"), + info=i18n( + "Select the backing vocals model to use for the conversion." + ), + choices=sorted(names, key=lambda path: os.path.getsize(path)), + interactive=True, + value=default_weight, + visible=False, + allow_custom_value=False, + ) + infer_backing_vocals_index = gr.Dropdown( + label=i18n("Backing Vocals Index File"), + info=i18n( + "Select the backing vocals index file to use for the conversion." + ), + choices=get_indexes(), + value=match_index(default_weight) if default_weight else "", + interactive=True, + visible=False, + allow_custom_value=True, + ) + with gr.Column(): + refresh_button_infer_backing_vocals = gr.Button( + i18n("Refresh"), + visible=False, + ) + unload_button_infer_backing_vocals = gr.Button( + i18n("Unload Voice"), + visible=False, + ) + + unload_button_infer_backing_vocals.click( + fn=lambda: ( + {"value": "", "__type__": "update"}, + {"value": "", "__type__": "update"}, + ), + inputs=[], + outputs=[ + infer_backing_vocals_model, + infer_backing_vocals_index, + ], + ) + infer_backing_vocals_model.select( + fn=lambda model_file_value: match_index(model_file_value), + inputs=[infer_backing_vocals_model], + outputs=[infer_backing_vocals_index], + ) + with gr.Accordion( + i18n("RVC Settings for Backing vocals"), open=False, visible=False + ) as back_rvc_settings: + export_format_rvc_back = gr.Radio( + label=i18n("Export Format"), + info=i18n("Select the format to export the audio."), + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + value="WAV", + interactive=True, + visible=False, + ) + split_audio_back = gr.Checkbox( + label=i18n("Split Audio"), + info=i18n( + "Split the audio into chunks for inference to obtain better results in some cases." + ), + visible=True, + value=False, + interactive=True, + ) + pitch_extract_back = gr.Radio( + label=i18n("Pitch Extractor"), + info=i18n("Pitch extract Algorith."), + choices=[ + "rmvpe", + "crepe", + "crepe-tiny", + "fcpe", + "hybrid[fcpe+rmvpe]", + ], + value="rmvpe", + interactive=True, + ) + hop_length_back = gr.Slider( + label=i18n("Hop Length"), + info=i18n("Hop length for pitch extraction."), + minimum=1, + maximum=512, + step=1, + value=64, + visible=False, + ) + embedder_model_back = gr.Radio( + label=i18n("Embedder Model"), + info=i18n("Model used for learning speaker embedding."), + choices=[ + "contentvec", + "chinese-hubert-base", + "japanese-hubert-base", + "korean-hubert-base", + ], + value="contentvec", + interactive=True, + ) + autotune_back = gr.Checkbox( + label=i18n("Autotune"), + info=i18n( + "Apply a soft autotune to your inferences, recommended for singing conversions." 
+ ), + visible=True, + value=False, + interactive=True, + ) + pitch_back = gr.Slider( + label=i18n("Pitch"), + info=i18n("Adjust the pitch of the audio."), + minimum=-12, + maximum=12, + step=1, + value=0, + interactive=True, + ) + filter_radius_back = gr.Slider( + minimum=0, + maximum=7, + label=i18n("Filter Radius"), + info=i18n( + "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration." + ), + value=3, + step=1, + interactive=True, + ) + index_rate_back = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Search Feature Ratio"), + info=i18n( + "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio." + ), + value=0.75, + interactive=True, + ) + rms_mix_rate_back = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Volume Envelope"), + info=i18n( + "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed." + ), + value=0.25, + interactive=True, + ) + protect_back = gr.Slider( + minimum=0, + maximum=0.5, + label=i18n("Protect Voiceless Consonants"), + info=i18n( + "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect." + ), + value=0.33, + interactive=True, + ) + clear_outputs_infer = gr.Button( + i18n("Clear Outputs (Deletes all audios in assets/audios)") + ) + export_format_rvc = gr.Radio( + label=i18n("Export Format"), + info=i18n("Select the format to export the audio."), + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + value="WAV", + interactive=True, + visible=False, + ) + split_audio = gr.Checkbox( + label=i18n("Split Audio"), + info=i18n( + "Split the audio into chunks for inference to obtain better results in some cases." + ), + visible=True, + value=False, + interactive=True, + ) + pitch_extract = gr.Radio( + label=i18n("Pitch Extractor"), + info=i18n("Pitch extract Algorith."), + choices=[ + "rmvpe", + "crepe", + "crepe-tiny", + "fcpe", + "hybrid[fcpe+rmvpe]", + ], + value="rmvpe", + interactive=True, + ) + hop_length = gr.Slider( + label=i18n("Hop Length"), + info=i18n("Hop length for pitch extraction."), + minimum=1, + maximum=512, + step=1, + value=64, + visible=False, + ) + embedder_model = gr.Radio( + label=i18n("Embedder Model"), + info=i18n("Model used for learning speaker embedding."), + choices=[ + "contentvec", + "chinese-hubert-base", + "japanese-hubert-base", + "korean-hubert-base", + ], + value="contentvec", + interactive=False, + ) + autotune = gr.Checkbox( + label=i18n("Autotune"), + info=i18n( + "Apply a soft autotune to your inferences, recommended for singing conversions." + ), + visible=True, + value=False, + interactive=True, + ) + pitch = gr.Slider( + label=i18n("Pitch"), + info=i18n("Adjust the pitch of the audio."), + minimum=-12, + maximum=12, + step=1, + value=0, + interactive=True, + ) + filter_radius = gr.Slider( + minimum=0, + maximum=7, + label=i18n("Filter Radius"), + info=i18n( + "If the number is greater than or equal to three, employing median filtering on the collected tone results has the potential to decrease respiration." 
+ ), + value=3, + step=1, + interactive=True, + ) + index_rate = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Search Feature Ratio"), + info=i18n( + "Influence exerted by the index file; a higher value corresponds to greater influence. However, opting for lower values can help mitigate artifacts present in the audio." + ), + value=0.75, + interactive=True, + ) + rms_mix_rate = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Volume Envelope"), + info=i18n( + "Substitute or blend with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is employed." + ), + value=0.25, + interactive=True, + ) + protect = gr.Slider( + minimum=0, + maximum=0.5, + label=i18n("Protect Voiceless Consonants"), + info=i18n( + "Safeguard distinct consonants and breathing sounds to prevent electro-acoustic tearing and other artifacts. Pulling the parameter to its maximum value of 0.5 offers comprehensive protection. However, reducing this value might decrease the extent of protection while potentially mitigating the indexing effect." + ), + value=0.33, + interactive=True, + ) + with gr.Accordion(i18n("Audio Separation Settings"), open=False): + use_tta = gr.Checkbox( + label=i18n("Use TTA"), + info=i18n("Use Test Time Augmentation."), + visible=True, + value=False, + interactive=True, + ) + batch_size = gr.Slider( + minimum=1, + maximum=24, + step=1, + label=i18n("Batch Size"), + info=i18n("Set the batch size for the separation."), + value=1, + interactive=True, + ) + vocal_model = gr.Dropdown( + label=i18n("Vocals Model"), + info=i18n("Select the vocals model to use for the separation."), + choices=sorted(vocals_model_names), + interactive=True, + value="Mel-Roformer by KimberleyJSN", + allow_custom_value=False, + ) + karaoke_model = gr.Dropdown( + label=i18n("Karaoke Model"), + info=i18n("Select the karaoke model to use for the separation."), + choices=sorted(karaoke_models_names), + interactive=True, + value="Mel-Roformer Karaoke by aufr33 and viperx", + allow_custom_value=False, + ) + dereverb_model = gr.Dropdown( + label=i18n("Dereverb Model"), + info=i18n("Select the dereverb model to use for the separation."), + choices=sorted(dereverb_models_names), + interactive=True, + value="UVR-Deecho-Dereverb", + allow_custom_value=False, + ) + deecho = gr.Checkbox( + label=i18n("Deeecho"), + info=i18n("Apply deeecho to the audio."), + visible=True, + value=True, + interactive=True, + ) + deeecho_model = gr.Dropdown( + label=i18n("Deeecho Model"), + info=i18n("Select the deeecho model to use for the separation."), + choices=sorted(deeecho_models_names), + interactive=True, + value="UVR-Deecho-Normal", + allow_custom_value=False, + ) + denoise = gr.Checkbox( + label=i18n("Denoise"), + info=i18n("Apply denoise to the audio."), + visible=True, + value=False, + interactive=True, + ) + denoise_model = gr.Dropdown( + label=i18n("Denoise Model"), + info=i18n("Select the denoise model to use for the separation."), + choices=sorted(denoise_models_names), + interactive=True, + value="Mel-Roformer Denoise Normal by aufr33", + allow_custom_value=False, + visible=False, + ) + with gr.Accordion(i18n("Audio post-process Settings"), open=False): + change_inst_pitch = gr.Slider( + label=i18n("Change Instrumental Pitch"), + info=i18n("Change the pitch of the instrumental."), + minimum=-12, + maximum=12, + step=1, + value=0, + interactive=True, + ) + delete_audios = gr.Checkbox( + label=i18n("Delete Audios"), + info=i18n("Delete the audios after the conversion."), + visible=True, + 
value=True, + interactive=True, + ) + reverb = gr.Checkbox( + label=i18n("Reverb"), + info=i18n("Apply reverb to the audio."), + visible=True, + value=False, + interactive=True, + ) + reverb_room_size = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Reverb Room Size"), + info=i18n("Set the room size of the reverb."), + value=0.5, + interactive=True, + visible=False, + ) + + reverb_damping = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Reverb Damping"), + info=i18n("Set the damping of the reverb."), + value=0.5, + interactive=True, + visible=False, + ) + + reverb_wet_gain = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Reverb Wet Gain"), + info=i18n("Set the wet gain of the reverb."), + value=0.33, + interactive=True, + visible=False, + ) + + reverb_dry_gain = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Reverb Dry Gain"), + info=i18n("Set the dry gain of the reverb."), + value=0.4, + interactive=True, + visible=False, + ) + + reverb_width = gr.Slider( + minimum=0, + maximum=1, + label=i18n("Reverb Width"), + info=i18n("Set the width of the reverb."), + value=1.0, + interactive=True, + visible=False, + ) + vocals_volume = gr.Slider( + label=i18n("Vocals Volume"), + info=i18n("Adjust the volume of the vocals."), + minimum=-10, + maximum=0, + step=1, + value=-3, + interactive=True, + ) + instrumentals_volume = gr.Slider( + label=i18n("Instrumentals Volume"), + info=i18n("Adjust the volume of the Instrumentals."), + minimum=-10, + maximum=0, + step=1, + value=-3, + interactive=True, + ) + backing_vocals_volume = gr.Slider( + label=i18n("Backing Vocals Volume"), + info=i18n("Adjust the volume of the backing vocals."), + minimum=-10, + maximum=0, + step=1, + value=-3, + interactive=True, + ) + export_format_final = gr.Radio( + label=i18n("Export Format"), + info=i18n("Select the format to export the audio."), + choices=["WAV", "MP3", "FLAC", "OGG", "M4A"], + value="WAV", + interactive=True, + ) + with gr.Accordion(i18n("Device Settings"), open=False): + devices = gr.Textbox( + label=i18n("Device"), + info=i18n( + "Select the device to use for the conversion. 
0 to ∞ separated by - and for CPU leave only an -" + ), + value=get_number_of_gpus(), + interactive=True, + ) + + with gr.Row(): + vc_output1 = gr.Textbox( + label=i18n("Output Information"), + info=i18n("The output information will be displayed here."), + ) + vc_output2 = gr.Audio(label=i18n("Export Audio")) + + with gr.Row(): + convert_button = gr.Button(i18n("Convert")) + + with gr.Tab(i18n("Download Music")): + with gr.Row(): + link = gr.Textbox(label=i18n("Music URL"), lines=1) + output = gr.Textbox( + label=i18n("Output Information"), + info=i18n("The output information will be displayed here."), + ) + download = gr.Button(i18n("Download")) + download.click( + download_music, + inputs=[link], + outputs=[output], + ) + + def update_dropdown_visibility(checkbox): + return gr.update(visible=checkbox) + + def update_reverb_sliders_visibility(reverb_checked): + return { + reverb_room_size: gr.update(visible=reverb_checked), + reverb_damping: gr.update(visible=reverb_checked), + reverb_wet_gain: gr.update(visible=reverb_checked), + reverb_dry_gain: gr.update(visible=reverb_checked), + reverb_width: gr.update(visible=reverb_checked), + } + + def update_visibility_infer_backing(infer_backing_vocals): + visible = infer_backing_vocals + return ( + {"visible": visible, "__type__": "update"}, + {"visible": visible, "__type__": "update"}, + {"visible": visible, "__type__": "update"}, + {"visible": visible, "__type__": "update"}, + {"visible": visible, "__type__": "update"}, + ) + + def update_hop_length_visibility(pitch_extract_value): + return gr.update(visible=pitch_extract_value in ["crepe", "crepe-tiny"]) + + refresh_button.click( + fn=change_choices, + inputs=[], + outputs=[model_file, index_file, audio], + ) + refresh_button_infer_backing_vocals.click( + fn=change_choices, + inputs=[], + outputs=[infer_backing_vocals_model, infer_backing_vocals_index], + ) + upload_audio.upload( + fn=save_to_wav, + inputs=[upload_audio], + outputs=[audio, output_path], + ) + clear_outputs_infer.click( + fn=delete_outputs, + inputs=[], + outputs=[], + ) + convert_button.click( + full_inference_program, + inputs=[ + model_file, + index_file, + audio, + output_path, + export_format_rvc, + split_audio, + autotune, + vocal_model, + karaoke_model, + dereverb_model, + deecho, + deeecho_model, + denoise, + denoise_model, + reverb, + vocals_volume, + instrumentals_volume, + backing_vocals_volume, + export_format_final, + devices, + pitch, + filter_radius, + index_rate, + rms_mix_rate, + protect, + pitch_extract, + hop_length, + reverb_room_size, + reverb_damping, + reverb_wet_gain, + reverb_dry_gain, + reverb_width, + embedder_model, + delete_audios, + use_tta, + batch_size, + infer_backing_vocals, + infer_backing_vocals_model, + infer_backing_vocals_index, + change_inst_pitch, + pitch_back, + filter_radius_back, + index_rate_back, + rms_mix_rate_back, + protect_back, + pitch_extract_back, + hop_length_back, + export_format_rvc_back, + split_audio_back, + autotune_back, + embedder_model_back, + ], + outputs=[vc_output1, vc_output2], + ) + + deecho.change( + fn=update_dropdown_visibility, + inputs=deecho, + outputs=deeecho_model, + ) + + denoise.change( + fn=update_dropdown_visibility, + inputs=denoise, + outputs=denoise_model, + ) + + reverb.change( + fn=update_reverb_sliders_visibility, + inputs=reverb, + outputs=[ + reverb_room_size, + reverb_damping, + reverb_wet_gain, + reverb_dry_gain, + reverb_width, + ], + ) + pitch_extract.change( + fn=update_hop_length_visibility, + inputs=pitch_extract, + 
outputs=hop_length, + ) + + infer_backing_vocals.change( + fn=update_visibility_infer_backing, + inputs=[infer_backing_vocals], + outputs=[ + infer_backing_vocals_model, + infer_backing_vocals_index, + refresh_button_infer_backing_vocals, + unload_button_infer_backing_vocals, + back_rvc_settings, + ], + ) diff --git a/update.bat b/update.bat new file mode 100644 index 0000000000000000000000000000000000000000..21a2593ba26f774a6fad204685b419000a2eccf6 --- /dev/null +++ b/update.bat @@ -0,0 +1,61 @@ +@echo off +setlocal + +REM Define the repository URL +set REPO_URL=https://github.com/ShiromiyaG/RVC-AI-Cover-Maker-UI + +REM Navigate to the directory where the script is located +cd /d %~dp0 + +REM Loop through all directories except "env", "logs", "audio_files", and "programs/applio_code/rvc/models" +for /d %%D in (*) do ( + if /i not "%%D"=="env" if /i not "%%D"=="logs" if /i not "%%D"=="audio_files" if /i not "%%D"=="models" if /i not "%%D"=="programs" ( + echo Deleting directory %%D + rmdir /s /q "%%D" + ) +) + +REM Loop through all subdirectories in "programs" except "applio_code/rvc/models" +for /d %%D in (programs\*) do ( + if /i not "%%D"=="programs\applio_code" ( + echo Deleting directory %%D + rmdir /s /q "%%D" + ) +) + +for /d %%D in (programs\applio_code\*) do ( + if /i not "%%D"=="programs\applio_code\rvc" ( + echo Deleting directory %%D + rmdir /s /q "%%D" + ) +) + +for /d %%D in (programs\applio_code\rvc\*) do ( + if /i not "%%D"=="programs\applio_code\rvc\models" ( + echo Deleting directory %%D + rmdir /s /q "%%D" + ) +) + +REM Loop through all files and delete them +for %%F in (*) do ( + if not "%%F"=="update.bat" ( + echo Deleting file %%F + del /q "%%F" + ) +) + +REM Initialize a new git repository if it doesn't exist +if not exist .git ( + git init + git remote add origin %REPO_URL% +) + +REM Fetch the latest changes from the repository +git fetch origin + +REM Reset the working directory to match the latest commit +git reset --hard origin/main + +pause +endlocal
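
The inference helpers added above (the model selector returning model, config, plus demix, demix_track and demix_track_demucs) are meant to be driven from Python roughly as follows. This is a minimal sketch, not code from the patch: the selector name get_model_from_config, its (model_type, config_path) signature, the config/checkpoint paths and the input file are assumptions for illustration only.

import librosa
import soundfile as sf
import torch

CONFIG_PATH = "configs/config_mel_band_roformer.yaml"  # hypothetical path
CKPT_PATH = "models/mel_band_roformer.ckpt"            # hypothetical path

# Assumed name/signature of the enclosing selector whose elif chain appears in the patch.
model, config = get_model_from_config("mel_band_roformer", CONFIG_PATH)
model.load_state_dict(torch.load(CKPT_PATH, map_location="cpu"))

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval()

# demix() expects a float (channels, samples) array; librosa returns that layout
# for stereo input when mono=False.
mix, sr = librosa.load("song.wav", sr=44100, mono=False)
sources = demix(config, model, mix, device, pbar=True, model_type="mel_band_roformer")

for name, stem in sources.items():
    sf.write(f"{name}.wav", stem.T, sr)  # back to (samples, channels) for soundfile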
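
demix_track() performs windowed overlap-add: each chunk prediction is weighted by the linear fade window from _getWindowingArray, accumulated into result, and the summed window weights in counter are divided out at the end, so overlapping chunks crossfade instead of clicking. The toy sketch below (identity "model", arbitrary sizes) illustrates why the normalization reconstructs the signal wherever at least one chunk contributes a non-zero weight; the real code additionally disables the fade on the first and last chunk so the track edges keep full weight.

import torch

def fade_window(window_size: int, fade_size: int) -> torch.Tensor:
    # Same shape as _getWindowingArray: linear fade-in, flat middle, linear fade-out.
    window = torch.ones(window_size)
    window[:fade_size] = torch.linspace(0, 1, fade_size)
    window[-fade_size:] = torch.linspace(1, 0, fade_size)
    return window

signal = torch.randn(1, 44100)       # stand-in for one separated stem
C, step = 8192, 4096                 # toy chunk size and hop
fade = C // 10
window = fade_window(C, fade)

result = torch.zeros_like(signal)
counter = torch.zeros_like(signal)
i = 0
while i < signal.shape[-1]:
    chunk = signal[..., i : i + C]   # identity "model": the chunk itself
    l = chunk.shape[-1]
    result[..., i : i + l] += chunk * window[:l]
    counter[..., i : i + l] += window[:l]
    i += step

reconstructed = result / counter.clamp_min(1e-8)
# Exact up to float error everywhere except the very first sample, whose only
# window weight is zero in this toy; demix_track avoids that by setting the
# first chunk's fade-in (and the last chunk's fade-out) to 1.
print(torch.allclose(reconstructed[..., 1:], signal[..., 1:], atol=1e-5))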