Blane187 committed
Commit c8be32d · verified · 1 Parent(s): 91f5864

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +2 -35
  2. .gitignore +166 -0
  3. LICENSE +21 -0
  4. README.md +184 -13
  5. images/webui_dl_model.png +0 -0
  6. images/webui_generate.png +0 -0
  7. images/webui_upload_model.png +0 -0
  8. models/rvc/MODELS.txt +2 -0
  9. models/rvc/public_models.json +626 -0
  10. notebooks/ultimate_rvc_colab.ipynb +134 -0
  11. pyproject.toml +105 -0
  12. requirements.txt +55 -0
  13. src/app.py +214 -0
  14. src/backend/common.py +259 -0
  15. src/backend/exceptions.py +43 -0
  16. src/backend/generate_song_cover.py +1679 -0
  17. src/backend/manage_audio.py +225 -0
  18. src/backend/manage_voice_models.py +426 -0
  19. src/cli.py +219 -0
  20. src/common.py +10 -0
  21. src/frontend/common.py +466 -0
  22. src/frontend/tabs/manage_audio.py +216 -0
  23. src/frontend/tabs/manage_models.py +302 -0
  24. src/frontend/tabs/multi_step_generation.py +991 -0
  25. src/frontend/tabs/one_click_generation.py +573 -0
  26. src/init.py +41 -0
  27. src/typings/audio_separator/separator/__init__.pyi +78 -0
  28. src/typings/extra.py +71 -0
  29. src/typings/gradio/__init__.pyi +238 -0
  30. src/typings/gradio/events.pyi +374 -0
  31. src/typings/pedalboard_native/io/__init__.pyi +39 -0
  32. src/typings/soundfile/__init__.pyi +34 -0
  33. src/typings/sox/__init__.pyi +15 -0
  34. src/typings/yt_dlp/__init__.pyi +25 -0
  35. src/vc/configs/32k.json +46 -0
  36. src/vc/configs/32k_v2.json +46 -0
  37. src/vc/configs/40k.json +46 -0
  38. src/vc/configs/48k.json +46 -0
  39. src/vc/configs/48k_v2.json +46 -0
  40. src/vc/infer_pack/attentions.py +417 -0
  41. src/vc/infer_pack/commons.py +166 -0
  42. src/vc/infer_pack/models.py +1128 -0
  43. src/vc/infer_pack/models_onnx.py +822 -0
  44. src/vc/infer_pack/models_onnx_moess.py +853 -0
  45. src/vc/infer_pack/modules.py +522 -0
  46. src/vc/infer_pack/transforms.py +209 -0
  47. src/vc/my_utils.py +21 -0
  48. src/vc/rmvpe.py +409 -0
  49. src/vc/rvc.py +205 -0
  50. src/vc/trainset_preprocess_pipeline_print.py +146 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
.gitignore ADDED
@@ -0,0 +1,166 @@
1
+ # General
2
+ dependencies
3
+ audio
4
+
5
+ # Audio separation models
6
+ models/audio_separator
7
+
8
+ # RVC Models
9
+ models/rvc/*/*.pth
10
+ models/rvc/*/*.index
11
+ models/rvc/*/*.npy
12
+ models/rvc/hubert_base.pt
13
+ models/rvc/rmvpe.pt
14
+
15
+ # Byte-compiled / optimized / DLL files
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+
20
+ # C extensions
21
+ *.so
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ share/python-wheels/
38
+ *.egg-info/
39
+ .installed.cfg
40
+ *.egg
41
+ MANIFEST
42
+
43
+ # PyInstaller
44
+ # Usually these files are written by a python script from a template
45
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
46
+ *.manifest
47
+ *.spec
48
+
49
+ # Installer logs
50
+ pip-log.txt
51
+ pip-delete-this-directory.txt
52
+
53
+ # Unit test / coverage reports
54
+ htmlcov/
55
+ .tox/
56
+ .nox/
57
+ .coverage
58
+ .coverage.*
59
+ .cache
60
+ nosetests.xml
61
+ coverage.xml
62
+ *.cover
63
+ *.py,cover
64
+ .hypothesis/
65
+ .pytest_cache/
66
+ cover/
67
+
68
+ # Translations
69
+ *.mo
70
+ *.pot
71
+
72
+ # Django stuff:
73
+ *.log
74
+ local_settings.py
75
+ db.sqlite3
76
+ db.sqlite3-journal
77
+
78
+ # Flask stuff:
79
+ instance/
80
+ .webassets-cache
81
+
82
+ # Scrapy stuff:
83
+ .scrapy
84
+
85
+ # Sphinx documentation
86
+ docs/_build/
87
+
88
+ # PyBuilder
89
+ .pybuilder/
90
+ target/
91
+
92
+ # Jupyter Notebook
93
+ .ipynb_checkpoints
94
+
95
+ # IPython
96
+ profile_default/
97
+ ipython_config.py
98
+
99
+ # pyenv
100
+ # For a library or package, you might want to ignore these files since the code is
101
+ # intended to run in multiple environments; otherwise, check them in:
102
+ # .python-version
103
+
104
+ # pipenv
105
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
106
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
107
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
108
+ # install all needed dependencies.
109
+ #Pipfile.lock
110
+
111
+ # poetry
112
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
113
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
114
+ # commonly ignored for libraries.
115
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
116
+ #poetry.lock
117
+
118
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
119
+ __pypackages__/
120
+
121
+ # Celery stuff
122
+ celerybeat-schedule
123
+ celerybeat.pid
124
+
125
+ # SageMath parsed files
126
+ *.sage.py
127
+
128
+ # Environments
129
+ .env
130
+ .venv
131
+ env/
132
+ venv/
133
+ ENV/
134
+ env.bak/
135
+ venv.bak/
136
+
137
+ # Spyder project settings
138
+ .spyderproject
139
+ .spyproject
140
+
141
+ # Rope project settings
142
+ .ropeproject
143
+
144
+ # mkdocs documentation
145
+ /site
146
+
147
+ # mypy
148
+ .mypy_cache/
149
+ .dmypy.json
150
+ dmypy.json
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ .idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 SociallyIneptWeeb
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,184 @@
1
- ---
2
- title: Ultimate Rvc
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.43.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Ultimate RVC
2
+
3
+ An extension of [AiCoverGen](https://github.com/SociallyIneptWeeb/AICoverGen), which provides several new features and improvements, enabling users to generate song covers using RVC with ease. Ideal for people who want to incorporate singing functionality into their AI assistant/chatbot/vtuber, or for people who want to hear their favourite characters sing their favourite song.
4
+
5
+ <!-- Showcase: TBA -->
6
+
7
+ ![](images/webui_generate.png?raw=true)
8
+
9
+ Ultimate RVC is under constant development and testing, but you can try it out right now locally or on Google Colab!
10
+
11
+ ## New Features
12
+
13
+ * Easy and automated setup using launcher scripts for both Windows and Debian-based Linux systems
14
+ * Caching system which saves intermediate audio files as needed, reducing inference time as much as possible (see the sketch below). For example, if song A has already been converted using model B and you now want to convert song A using model C, then vocal extraction can be skipped and inference time reduced drastically.
15
+ * Ability to listen to intermediate audio files in the UI. This is useful for getting an idea of what happens in each step of the song cover generation pipeline.
16
+ * A "multi-step" song cover generation tab: here you can try out each step of the song cover generation pipeline in isolation. For example, if you already have extracted vocals available and only want to convert these using your voice model, you can do that here. This tab is also useful for experimenting with settings for each step of the pipeline.
17
+ * An overhaul of the song input component for the song cover generation pipeline. Cached input songs can now be selected from a dropdown, so you don't have to supply the YouTube link of a song each time you want to convert it.
18
+ * A new "manage models" tab, which collects and revamps all existing functionality for managing voice models and adds some new features, such as the ability to delete existing models.
19
+ * A "manage audio" tab, which allows you to interact with all audio generated by the app. Currently, this tab supports deleting audio files.
20
+ * Lots of visual and performance improvements resulting from updating from Gradio 3 to Gradio 4 and from Python 3.9 to Python 3.11.
21
+
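The idea behind the caching system can be illustrated with a minimal sketch (not the actual API of this project): each intermediate file is keyed by a hash of its input, and an expensive step such as vocal extraction only runs on a cache miss. The hashing mirrors `get_file_hash` in `src/backend/common.py`; the function and file names below are illustrative.

```python
import hashlib
import os
from collections.abc import Callable


def file_hash(path: str, size: int = 5) -> str:
    """BLAKE2b file hash, mirroring get_file_hash in src/backend/common.py (Python 3.11+)."""
    with open(path, "rb") as f:
        return hashlib.file_digest(f, lambda: hashlib.blake2b(digest_size=size)).hexdigest()


def cached_step(
    song_path: str, song_dir: str, step_name: str, run_step: Callable[[str, str], None]
) -> str:
    """Return the cached result of an expensive pipeline step, running it only on a cache miss."""
    target = os.path.join(song_dir, f"{step_name}_{file_hash(song_path)}.wav")
    if not os.path.exists(target):  # cache miss: run the expensive step (e.g. vocal extraction)
        run_step(song_path, target)
    return target
```

With this scheme, converting song A with model C after it was already converted with model B finds the cached vocals file and skips extraction entirely.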
22
+ <!-- ## Changelog
23
+
24
+ TBA -->
25
+
26
+ #### PRO TIP: Use a GPU for faster processing
27
+
28
+ While it is possible to run the Ultimate RVC web app on a CPU, it is highly recommended to use a GPU for faster processing. On an NVIDIA 3080 GPU, the AI cover generation process takes approximately 1.5 minutes, while on a CPU, it takes approximately 15 minutes. No testing has been done on AMD GPUs, so no guarantees are made for their performance.
29
+
30
+ ## Colab notebook
31
+
32
+ For those without a powerful enough NVIDIA GPU, you may try Ultimate RVC out using Google Colab.
33
+
34
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JackismyShephard/ultimate-rvc/blob/main/notebooks/ultimate_rvc_colab.ipynb)
35
+
36
+ For those who want to run this locally, follow the setup guide below.
37
+
38
+ ## Setup
39
+
40
+ ### Install Git
41
+
42
+ Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer.
43
+
44
+ ### Clone Ultimate RVC repository
45
+ Open a terminal and run the following commands to clone this entire repository and open it locally.
46
+ ```
47
+ git clone https://github.com/JackismyShephard/ultimate-rvc
48
+ cd ultimate-rvc
49
+ ```
50
+
51
+ ### Install dependencies
52
+
53
+ #### Windows
54
+ Run the following command to install the necessary dependencies on Windows:
55
+ ```
56
+ ./urvc.bat install
57
+ ```
58
+ Note that this will install Miniconda in your user directory.
59
+ The whole process may take upwards of 10 minutes, so grab a cup of coffee and wait.
60
+
61
+ #### Linux (Debian-based)
62
+
63
+ Run the following command to install the necessary dependencies on Debian-based Linux distributions (e.g. Ubuntu):
64
+ ```
65
+ ./urvc.sh install
66
+ ```
67
+ The command has been tested only on Ubuntu 22.04 and 24.04, so support for other distributions is not guaranteed.
68
+ Also note that the command will install the CUDA 12.1 toolkit system-wide. In case you have problems, you may need to install the toolkit manually.
69
+
70
+ ## Usage
71
+
72
+ ### Start the app
73
+
74
+ #### Windows
75
+
76
+ ```
77
+ ./urvc.bat run
78
+ ```
79
+ #### Linux (Debian-based)
80
+
81
+ ```
82
+ ./urvc.sh run
83
+ ```
84
+
85
+
86
+ Once the following output message `Running on local URL: http://127.0.0.1:7860` appears, you can click on the link to open a tab with the web app.
87
+
88
+ ### Manage models
89
+
90
+
91
+ #### Download models
92
+
93
+ ![](images/webui_dl_model.png?raw=true)
94
+
95
+ Navigate to the `Download model` subtab under the `Manage models` tab, and paste the download link to an RVC model and give it a unique name.
96
+ You may search the [AI Hub Discord](https://discord.gg/aihub) where already trained voice models are available for download.
97
+ The downloaded zip file should contain the .pth model file and an optional .index file.
98
+
99
+ Once the 2 input fields are filled in, simply click `Download`! Once the output message says `[NAME] Model successfully downloaded!`, you should be able to use it in the `Generate song covers` tab!
100
+
101
+ #### Upload models
102
+
103
+ ![](images/webui_upload_model.png?raw=true)
104
+
105
+ This subtab is for people who have trained RVC v2 models locally and would like to use them for AI cover generation.
106
+ Navigate to the `Upload model` subtab under the `Manage models` tab, and follow the instructions.
107
+ Once the output message says `Model with name [NAME] successfully uploaded!`, you should be able to use it in the `Generate song covers` tab!
108
+
109
+ #### Delete RVC models
110
+
111
+ TBA
112
+
113
+ ### Generate song covers
114
+
115
+ #### One-click generation
116
+
117
+
118
+ ![](images/webui_generate.png?raw=true)
119
+
120
+ - From the Voice model dropdown menu, select the voice model to use.
121
+ - In the song input field, copy and paste the link to any song on YouTube, the full path to a local audio file, or select a cached input song.
122
+ - Pitch should be set to either -12, 0, or 12 (a full octave down, no shift, or a full octave up) depending on the original vocals and the RVC AI model. This ensures the voice is not *out of tune*.
123
+ - Other advanced options for vocal conversion, audio mixing, etc. can be viewed by clicking the appropriate accordion arrow to expand them.
124
+
125
+ Once all options are filled in, click `Generate` and the AI-generated cover should appear in less than a few minutes, depending on your GPU.
126
+
127
+ #### Multi-step generation
128
+ TBA
129
+
130
+ <!-- ## CLI
131
+ TBA -->
132
+
133
+ ## Update to latest version
134
+
135
+ Run the following command to pull latest changes from the repository and reinstall dependencies.
136
+ Note that the process may take upwards of 5 minutes.
137
+ #### Windows
138
+
139
+ ```
140
+ ./urvc.bat update
141
+ ```
142
+
143
+ #### Linux (Debian-based)
144
+
145
+ ```
146
+ ./urvc.sh update
147
+ ```
148
+
149
+ ## Development mode
150
+
151
+ When developing new features or debugging, it is recommended to run the app in development mode. This enables hot reloading, which means that the app will automatically reload when changes are made to the code.
152
+
153
+ #### Windows
154
+
155
+ ```
156
+ ./urvc.bat dev
157
+ ```
158
+
159
+ #### Linux (Debian-based)
160
+
161
+ ```
162
+ ./urvc.sh dev
163
+ ```
164
+
165
+
166
+ ## Terms of Use
167
+
168
+ The use of the converted voice for the following purposes is prohibited.
169
+
170
+ * Criticizing or attacking individuals.
171
+
172
+ * Advocating for or opposing specific political positions, religions, or ideologies.
173
+
174
+ * Publicly displaying strongly stimulating expressions without proper zoning.
175
+
176
+ * Selling of voice models and generated voice clips.
177
+
178
+ * Impersonation of the original owner of the voice with malicious intentions to harm/hurt others.
179
+
180
+ * Fraudulent purposes that lead to identity theft or fraudulent phone calls.
181
+
182
+ ## Disclaimer
183
+
184
+ I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.
images/webui_dl_model.png ADDED
images/webui_generate.png ADDED
images/webui_upload_model.png ADDED
models/rvc/MODELS.txt ADDED
@@ -0,0 +1,2 @@
1
+ RVC Models can be added as a folder here. Each folder should contain the model file (.pth extension), and an index file (.index extension).
2
+ For example, a folder called Maya, containing 2 files, Maya.pth and added_IVF1905_Flat_nprobe_Maya_v2.index.
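A layout following the example above would look like this (the folder and file names are just those from the example):

```
models/rvc/
└── Maya/
    ├── Maya.pth
    └── added_IVF1905_Flat_nprobe_Maya_v2.index
```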
models/rvc/public_models.json ADDED
@@ -0,0 +1,626 @@
1
+ {
2
+ "tags": {
3
+ "English": "Character speaks English",
4
+ "Japanese": "Character speaks Japanese",
5
+ "Other Language": "The character speaks Other Language",
6
+ "Anime": "Character from anime",
7
+ "Vtuber": "Character is a vtuber",
8
+ "Real person": "A person who exists in the real world",
9
+ "Game character": "A character from the game"
10
+ },
11
+ "voice_models": [
12
+ {
13
+ "name": "Emilia",
14
+ "url": "https://huggingface.co/RinkaEmina/RVC_Sharing/resolve/main/Emilia%20V2%2048000.zip",
15
+ "description": "Emilia from Re:Zero",
16
+ "added": "2023-07-31",
17
+ "credit": "rinka4759",
18
+ "tags": [
19
+ "Anime"
20
+ ]
21
+ },
22
+ {
23
+ "name": "Klee",
24
+ "url": "https://huggingface.co/qweshkka/Klee/resolve/main/Klee.zip",
25
+ "description": "Klee from Genshin Impact",
26
+ "added": "2023-07-31",
27
+ "credit": "qweshsmashjuicefruity",
28
+ "tags": [
29
+ "Game character",
30
+ "Japanese"
31
+ ]
32
+ },
33
+ {
34
+ "name": "Yelan",
35
+ "url": "https://huggingface.co/iroaK/RVC2_Yelan_GenshinImpact/resolve/main/YelanJP.zip",
36
+ "description": "Yelan from Genshin Impact",
37
+ "added": "2023-07-31",
38
+ "credit": "iroak",
39
+ "tags": [
40
+ "Game character",
41
+ "Japanese"
42
+ ]
43
+ },
44
+ {
45
+ "name": "Yae Miko",
46
+ "url": "https://huggingface.co/iroaK/RVC2_YaeMiko_GenshinImpact/resolve/main/Yae_MikoJP.zip",
47
+ "description": "Yae Miko from Genshin Impact",
48
+ "added": "2023-07-31",
49
+ "credit": "iroak",
50
+ "tags": [
51
+ "Game character",
52
+ "Japanese"
53
+ ]
54
+ },
55
+ {
56
+ "name": "Lisa",
57
+ "url": "https://huggingface.co/qweshkka/Lisa2ver/resolve/main/Lisa.zip",
58
+ "description": "Lisa from Genshin Impact",
59
+ "added": "2023-07-31",
60
+ "credit": "qweshsmashjuicefruity",
61
+ "tags": [
62
+ "Game character",
63
+ "English"
64
+ ]
65
+ },
66
+ {
67
+ "name": "Kazuha",
68
+ "url": "https://huggingface.co/iroaK/RVC2_Kazuha_GenshinImpact/resolve/main/Kazuha.zip",
69
+ "description": "Kaedehara Kazuha from Genshin Impact",
70
+ "added": "2023-07-31",
71
+ "credit": "iroak",
72
+ "tags": [
73
+ "Game character",
74
+ "Japanese"
75
+ ]
76
+ },
77
+ {
78
+ "name": "Barbara",
79
+ "url": "https://huggingface.co/iroaK/RVC2_Barbara_GenshinImpact/resolve/main/BarbaraJP.zip",
80
+ "description": "Barbara from Genshin Impact",
81
+ "added": "2023-07-31",
82
+ "credit": "iroak",
83
+ "tags": [
84
+ "Game character",
85
+ "Japanese"
86
+ ]
87
+ },
88
+ {
89
+ "name": "Tom Holland",
90
+ "url": "https://huggingface.co/TJKAI/TomHolland/resolve/main/TomHolland.zip",
91
+ "description": "Tom Holland (Spider-Man)",
92
+ "added": "2023-08-03",
93
+ "credit": "tjkcreative",
94
+ "tags": [
95
+ "Real person",
96
+ "English"
97
+ ]
98
+ },
99
+ {
100
+ "name": "Kamisato Ayaka",
101
+ "url": "https://huggingface.co/benitheworld/ayaka-cn/resolve/main/ayaka-cn.zip",
102
+ "description": "Kamisato Ayaka from Genshin Impact - CN voice actor",
103
+ "added": "2023-08-03",
104
+ "credit": "kannysoap",
105
+ "tags": [
106
+ "Game character",
107
+ "Other Language"
108
+ ]
109
+ },
110
+ {
111
+ "name": "Amai Odayaka",
112
+ "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Amai-Odayaka.zip",
113
+ "description": "Amai Odayaka from Yandere Simulator",
114
+ "added": "2023-08-03",
115
+ "credit": "minecraftian47",
116
+ "tags": [
117
+ "Anime",
118
+ "English"
119
+ ]
120
+ },
121
+ {
122
+ "name": "Compa - Hyperdimension Neptunia",
123
+ "url": "https://huggingface.co/zeerowiibu/WiibuRVCCollection/resolve/main/Compa%20(Choujigen%20Game%20Neptunia)%20(JPN)%20(RVC%20v2)%20(150%20Epochs).zip",
124
+ "description": "Compa from Choujigen Game Neptune (aka Hyperdimension Neptunia)",
125
+ "added": "2023-08-03",
126
+ "credit": "zeerowiibu",
127
+ "tags": [
128
+ "Anime",
129
+ "Japanese"
130
+ ]
131
+ },
132
+ {
133
+ "name": "Fu Xuan",
134
+ "url": "https://huggingface.co/Juneuarie/FuXuan/resolve/main/FuXuan.zip",
135
+ "description": "Fu Xuan from Honkai Star Rail (HSR)",
136
+ "added": "2023-08-03",
137
+ "credit": "__june",
138
+ "tags": [
139
+ "Game character",
140
+ "English"
141
+ ]
142
+ },
143
+ {
144
+ "name": "Xinyan",
145
+ "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/XinyanRVC.zip",
146
+ "description": "Xinyan from Genshin Impact",
147
+ "added": "2023-08-03",
148
+ "credit": "shyelijah",
149
+ "tags": [
150
+ "Game character",
151
+ "English"
152
+ ]
153
+ },
154
+ {
155
+ "name": "Enterprise",
156
+ "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Enterprise-JP.zip",
157
+ "description": "Enterprise from Azur Lane",
158
+ "added": "2023-08-03",
159
+ "credit": "minecraftian47",
160
+ "tags": [
161
+ "Anime",
162
+ "Japanese"
163
+ ]
164
+ },
165
+ {
166
+ "name": "Kurt Cobain",
167
+ "url": "https://huggingface.co/Florstie/Kurt_Cobain_byFlorst/resolve/main/Kurt_Florst.zip",
168
+ "description": "singer Kurt Cobain",
169
+ "added": "2023-08-03",
170
+ "credit": "florst",
171
+ "tags": [
172
+ "Real person",
173
+ "English"
174
+ ]
175
+ },
176
+ {
177
+ "name": "Ironmouse",
178
+ "url": "https://huggingface.co/Tempo-Hawk/IronmouseV2/resolve/main/IronmouseV2.zip",
179
+ "description": "Ironmouse",
180
+ "added": "2023-08-03",
181
+ "credit": "ladyimpa",
182
+ "tags": [
183
+ "Vtuber",
184
+ "English"
185
+ ]
186
+ },
187
+ {
188
+ "name": "Bratishkinoff",
189
+ "url": "https://huggingface.co/JHmashups/Bratishkinoff/resolve/main/bratishkin.zip",
190
+ "description": "Bratishkinoff (Bratishkin | Братишкин) - russian steamer ",
191
+ "added": "2023-08-03",
192
+ "credit": ".caddii",
193
+ "tags": [
194
+ "Real person",
195
+ "Other Language"
196
+ ]
197
+ },
198
+ {
199
+ "name": "Yagami Light",
200
+ "url": "https://huggingface.co/geekdom-tr/Yagami-Light/resolve/main/Yagami-Light.zip",
201
+ "description": "Yagami Light (Miyano Mamoru) from death note",
202
+ "added": "2023-08-03",
203
+ "credit": "takka / takka#7700",
204
+ "tags": [
205
+ "Anime",
206
+ "Japanese"
207
+ ]
208
+ },
209
+ {
210
+ "name": "Itashi",
211
+ "url": "https://huggingface.co/4uGGun/4uGGunRVC/resolve/main/itashi.zip",
212
+ "description": "Itashi (Russian fandubber AniLibria) ",
213
+ "added": "2023-08-03",
214
+ "credit": "BelochkaOff",
215
+ "tags": [
216
+ "Anime",
217
+ "Other Language",
218
+ "Real person"
219
+ ]
220
+ },
221
+ {
222
+ "name": "Michiru Kagemori",
223
+ "url": "https://huggingface.co/WolfMK/MichiruKagemori/resolve/main/MichiruKagemori_RVC_V2.zip",
224
+ "description": "Michiru Kagemori from Brand New Animal (300 Epochs)",
225
+ "added": "2023-08-03",
226
+ "credit": "wolfmk",
227
+ "tags": [
228
+ "Anime",
229
+ "English"
230
+ ]
231
+ }
232
+ ,
233
+ {
234
+ "name": "Kaeya",
235
+ "url": "https://huggingface.co/nlordqting4444/nlordqtingRVC/resolve/main/Kaeya.zip",
236
+ "description": "Kaeya (VA: Kohsuke Toriumi) from Genshin Impact (300 Epochs)",
237
+ "added": "2023-08-03",
238
+ "credit": "nlordqting4444",
239
+ "tags": [
240
+ "Game character",
241
+ "Japanese"
242
+ ]
243
+ },
244
+ {
245
+ "name": "Mona Megistus",
246
+ "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/MonaRVC.zip",
247
+ "description": "Mona Megistus (VA: Felecia Angelle) from Genshin Impact (250 Epochs)",
248
+ "added": "2023-08-03",
249
+ "credit": "shyelijah",
250
+ "tags": [
251
+ "Game character",
252
+ "English"
253
+ ]
254
+ },
255
+ {
256
+ "name": "Klee",
257
+ "url": "https://huggingface.co/hardbop/AI_MODEL_THINGY/resolve/main/kleeeng_rvc.zip",
258
+ "description": "Klee from Genshin Impact (400 Epochs)",
259
+ "added": "2023-08-03",
260
+ "credit": "hardbop",
261
+ "tags": [
262
+ "Game character",
263
+ "English"
264
+ ]
265
+ },
266
+ {
267
+ "name": "Sakurakoji Kinako",
268
+ "url": "https://huggingface.co/Gorodogi/RVC2MangioCrepe/resolve/main/kinakobetatwo700.zip",
269
+ "description": "Sakurakoji Kinako (Suzuhara Nozomi) from Love Live! Superstar!! (700 Epoch)",
270
+ "added": "2023-08-03",
271
+ "credit": "ck1089",
272
+ "tags": [
273
+ "Anime",
274
+ "Japanese"
275
+ ]
276
+ },
277
+ {
278
+ "name": "Minamo Kurosawa",
279
+ "url": "https://huggingface.co/timothy10583/RVC/resolve/main/minamo-kurosawa.zip",
280
+ "description": "Minamo (Nyamo) Kurosawa (Azumanga Daioh US DUB) (300 Epochs)",
281
+ "added": "2023-08-03",
282
+ "credit": "timothy10583",
283
+ "tags": [
284
+ "Anime"
285
+ ]
286
+ },
287
+ {
288
+ "name": "Neco Arc",
289
+ "url": "https://huggingface.co/Ozzy-Helix/Neko_Arc_Neko_Aruku.RVCv2/resolve/main/Neko_Arc-V3-E600.zip",
290
+ "description": "Neco Arc (Neco-Aruku) (Epochs 600)",
291
+ "added": "2023-08-03",
292
+ "credit": "ozzy_helix_",
293
+ "tags": [
294
+ "Anime"
295
+ ]
296
+ },
297
+ {
298
+ "name": "Makima",
299
+ "url": "https://huggingface.co/andolei/makimaen/resolve/main/makima-en-dub.zip",
300
+ "description": "Makima from Chainsaw Man (300 Epochs)",
301
+ "added": "2023-08-03",
302
+ "credit": "andpproximately",
303
+ "tags": [
304
+ "Anime",
305
+ "English"
306
+ ]
307
+ },
308
+ {
309
+ "name": "PomPom",
310
+ "url": "https://huggingface.co/benitheworld/pom-pom/resolve/main/pom-pom.zip",
311
+ "description": "PomPom from Honkai Star Rail (HSR) (200 Epochs)",
312
+ "added": "2023-08-03",
313
+ "credit": "kannysoap",
314
+ "tags": [
315
+ "Game character",
316
+ "English"
317
+ ]
318
+ },
319
+ {
320
+ "name": "Asuka Langley Soryu",
321
+ "url": "https://huggingface.co/Piegirl/asukaadv/resolve/main/asuka.zip",
322
+ "description": "Asuka Langley Soryu/Tiffany Grant from Neon Genesis Evangelion (400 Epochs)",
323
+ "added": "2023-08-03",
324
+ "credit": "piegirl",
325
+ "tags": [
326
+ "Anime",
327
+ "English"
328
+ ]
329
+ },
330
+ {
331
+ "name": "Ochaco Uraraka",
332
+ "url": "https://huggingface.co/legitdark/JP-Uraraka-By-Dan/resolve/main/JP-Uraraka-By-Dan.zip",
333
+ "description": "Ochaco Uraraka from Boku no Hero Academia (320 Epochs)",
334
+ "added": "2023-08-03",
335
+ "credit": "danthevegetable",
336
+ "tags": [
337
+ "Anime",
338
+ "Japanese"
339
+ ]
340
+ },
341
+ {
342
+ "name": "Sunaokami Shiroko",
343
+ "url": "https://huggingface.co/LordDavis778/BlueArchivevoicemodels/resolve/main/SunaokamiShiroko.zip",
344
+ "description": "Sunaokami Shiroko from Blue Archive (500 Epochs)",
345
+ "added": "2023-08-03",
346
+ "credit": "lorddavis778",
347
+ "tags": [
348
+ "Anime"
349
+ ]
350
+ },
351
+ {
352
+ "name": "Dainsleif",
353
+ "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Dainsleif/Dainsleif.zip",
354
+ "description": "Dainsleif from Genshin Impact (335 Epochs)",
355
+ "added": "2023-08-03",
356
+ "credit": "nasley",
357
+ "tags": [
358
+ "Game character",
359
+ "English"
360
+ ]
361
+ },
362
+ {
363
+ "name": "Mae Asmr",
364
+ "url": "https://huggingface.co/ctian/VRC/resolve/main/MaeASMR.zip",
365
+ "description": "Mae Asmr - harvest mommy voice (YOUTUBE) (300 Epochs)",
366
+ "added": "2023-08-03",
367
+ "credit": "ctian_04",
368
+ "tags": [
369
+ "English",
370
+ "Real person",
371
+ "Vtuber"
372
+ ]
373
+ },
374
+ {
375
+ "name": "Hana Shirosaki ",
376
+ "url": "https://huggingface.co/Pawlik17/HanaWataten/resolve/main/HanaWATATEN.zip",
377
+ "description": "Hana Shirosaki / 白 咲 花 From Watashi ni Tenshi ga Maiorita! (570 Epochs)",
378
+ "added": "2023-08-03",
379
+ "credit": "tamalik",
380
+ "tags": [
381
+ "Anime",
382
+ "Japanese"
383
+ ]
384
+ },
385
+ {
386
+ "name": "Kaguya Shinomiya ",
387
+ "url": "https://huggingface.co/1ski/1skiRVCModels/resolve/main/kaguyav5.zip",
388
+ "description": "Kaguya Shinomiya from Kaguya-Sama Love is war (200 Epochs)",
389
+ "added": "2023-08-03",
390
+ "credit": "1ski",
391
+ "tags": [
392
+ "Anime",
393
+ "Japanese"
394
+ ]
395
+ },
396
+ {
397
+ "name": "Nai Shiro",
398
+ "url": "https://huggingface.co/kuushiro/Shiro-RVC-No-Game-No-Life/resolve/main/shiro-jp-360-epochs.zip",
399
+ "description": "Nai Shiro (Ai Kayano) from No Game No Life (360 Epochs)",
400
+ "added": "2023-08-03",
401
+ "credit": "kxouyou",
402
+ "tags": [
403
+ "Anime",
404
+ "Japanese"
405
+ ]
406
+ },
407
+ {
408
+ "name": "Yuigahama Yui",
409
+ "url": "https://huggingface.co/Zerokano/Yuigahama_Yui-RVCv2/resolve/main/Yuigahama_Yui.zip",
410
+ "description": "Yuigahama Yui from Yahari Ore no Seishun Love Comedy wa Machigatteiru (250 Epochs)",
411
+ "added": "2023-08-03",
412
+ "credit": "zerokano",
413
+ "tags": [
414
+ "Anime",
415
+ "Japanese"
416
+ ]
417
+ },
418
+ {
419
+ "name": "Fuwawa Abyssgard",
420
+ "url": "https://huggingface.co/megaaziib/my-rvc-models-collection/resolve/main/fuwawa.zip",
421
+ "description": "Fuwawa Abyssgard (FUWAMOCO) from Hololive gen 3 (250 Epochs)",
422
+ "added": "2023-08-03",
423
+ "credit": "megaaziib",
424
+ "tags": [
425
+ "Vtuber",
426
+ "English"
427
+ ]
428
+ },
429
+ {
430
+ "name": "Kana Arima",
431
+ "url": "https://huggingface.co/ddoumakunn/arimakanna/resolve/main/arimakanna.zip",
432
+ "description": "Kana Arima from Oshi no Ko (250 Epochs)",
433
+ "added": "2023-08-03",
434
+ "credit": "ddoumakunn",
435
+ "tags": [
436
+ "Anime",
437
+ "Japanese"
438
+ ]
439
+ },
440
+ {
441
+ "name": "Raiden Shogun",
442
+ "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/RaidenShogun/RaidenShogun.zip",
443
+ "description": "Raiden Shogun from Genshin Impact (310 Epochs)",
444
+ "added": "2023-08-03",
445
+ "credit": "nasley",
446
+ "tags": [
447
+ "Game character",
448
+ "English"
449
+ ]
450
+ },
451
+ {
452
+ "name": "Alhaitham",
453
+ "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Alhaitham/Alhaitham.zip",
454
+ "description": "Alhaitham from Genshin Impact (320 Epochs)",
455
+ "added": "2023-08-03",
456
+ "credit": "nasley",
457
+ "tags": [
458
+ "Game character",
459
+ "English"
460
+ ]
461
+ },
462
+ {
463
+ "name": "Izuku Midoriya",
464
+ "url": "https://huggingface.co/BigGuy635/MHA/resolve/main/DekuJP.zip",
465
+ "description": "Izuku Midoriya from Boku no Hero Academia (100 Epochs)",
466
+ "added": "2023-08-03",
467
+ "credit": "khjjnoffical",
468
+ "tags": [
469
+ "Anime",
470
+ "Japanese"
471
+ ]
472
+ },
473
+ {
474
+ "name": "Kurumi Shiratori",
475
+ "url": "https://huggingface.co/HarunaKasuga/YoshikoTsushima/resolve/main/KurumiShiratori.zip",
476
+ "description": "Kurumi Shiratori (VA: Ruka Fukagawa) from D4DJ (500 Epochs)",
477
+ "added": "2023-08-03",
478
+ "credit": "seakrait",
479
+ "tags": [
480
+ "Anime",
481
+ "Japanese"
482
+ ]
483
+ },
484
+ {
485
+ "name": "Veibae",
486
+ "url": "https://huggingface.co/datasets/Papaquans/Veibae/resolve/main/veibae_e165_s125565.zip",
487
+ "description": "Veibae (165 Epochs)",
488
+ "added": "2023-08-03",
489
+ "credit": "recairo",
490
+ "tags": [
491
+ "Vtuber",
492
+ "English"
493
+ ]
494
+ },
495
+ {
496
+ "name": "Black Panther",
497
+ "url": "https://huggingface.co/TJKAI/BlackPannther/resolve/main/BlackPanther.zip",
498
+ "description": "Black Panther (Chadwick Boseman) (300 Epochs)",
499
+ "added": "2023-08-03",
500
+ "credit": "tjkcreative",
501
+ "tags": [
502
+ "Real person",
503
+ "English"
504
+ ]
505
+ },
506
+ {
507
+ "name": "Gawr Gura",
508
+ "url": "https://pixeldrain.com/u/3tJmABXA",
509
+ "description": "Gawr Gura from Hololive EN",
510
+ "added": "2023-08-05",
511
+ "credit": "dacoolkid44 & hijack",
512
+ "tags": [
513
+ "Vtuber"
514
+ ]
515
+ },
516
+ {
517
+ "name": "Houshou Marine",
518
+ "url": "https://pixeldrain.com/u/L1YLfZyU",
519
+ "description": "Houshou Marine from Hololive JP",
520
+ "added": "2023-08-05",
521
+ "credit": "dacoolkid44 & hijack",
522
+ "tags": [
523
+ "Vtuber",
524
+ "Japanese"
525
+ ]
526
+ },
527
+ {
528
+ "name": "Hoshimachi Suisei",
529
+ "url": "https://pixeldrain.com/u/YP89C21u",
530
+ "description": "Hoshimachi Suisei from Hololive JP",
531
+ "added": "2023-08-05",
532
+ "credit": "dacoolkid44 & hijack & Maki Ligon",
533
+ "tags": [
534
+ "Vtuber",
535
+ "Japanese"
536
+ ]
537
+ },
538
+ {
539
+ "name": "Laplus Darkness",
540
+ "url": "https://pixeldrain.com/u/zmuxv5Bf",
541
+ "description": "Laplus Darkness from Hololive JP",
542
+ "added": "2023-08-05",
543
+ "credit": "dacoolkid44 & hijack",
544
+ "tags": [
545
+ "Vtuber",
546
+ "Japanese"
547
+ ]
548
+ },
549
+ {
550
+ "name": "AZKi",
551
+ "url": "https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models/resolve/main/AZKi%20(Hybrid).zip",
552
+ "description": "AZKi from Hololive JP",
553
+ "added": "2023-08-05",
554
+ "credit": "Kit Lemonfoot / NSHFB",
555
+ "tags": [
556
+ "Vtuber",
557
+ "Japanese"
558
+ ]
559
+ },
560
+ {
561
+ "name": "Ado",
562
+ "url": "https://huggingface.co/pjesek/AdoRVCv2/resolve/main/AdoRVCv2.zip",
563
+ "description": "Talented JP artist (500 epochs using every song from her first album)",
564
+ "added": "2023-08-05",
565
+ "credit": "pjesek",
566
+ "tags": [
567
+ "Real person",
568
+ "Japanese"
569
+ ]
570
+ },
571
+ {
572
+ "name": "LiSA",
573
+ "url": "https://huggingface.co/phant0m4r/LiSA/resolve/main/LiSA.zip",
574
+ "description": "Talented JP artist (400 epochs)",
575
+ "added": "2023-08-05",
576
+ "credit": "Phant0m",
577
+ "tags": [
578
+ "Real person",
579
+ "Japanese"
580
+ ]
581
+ },
582
+ {
583
+ "name": "Kokomi",
584
+ "url": "https://huggingface.co/benitheworld/kokomi-kr/resolve/main/kokomi-kr.zip",
585
+ "description": "Kokomi from Genshin Impact KR (300 Epochs)",
586
+ "added": "2023-08-09",
587
+ "credit": "kannysoap",
588
+ "tags": [
589
+ "Game character",
590
+ "Other Language"
591
+ ]
592
+ },
593
+ {
594
+ "name": "Ivanzolo",
595
+ "url": "https://huggingface.co/fenikkusugosuto/IvanZolo2004/resolve/main/ivanZolo.zip",
596
+ "description": "Ivanzolo2004 russian streamer | Иван Золо 2004",
597
+ "added": "2023-08-09",
598
+ "credit": "prezervativ_naruto2009",
599
+ "tags": [
600
+ "Other Language",
601
+ "Real person"
602
+ ]
603
+ },
604
+ {
605
+ "name": "Nilou",
606
+ "url": "https://huggingface.co/benitheworld/nilou-kr/resolve/main/nilou-kr.zip",
607
+ "description": "Nilou from Genshin Impact KR (300 Epochs)",
608
+ "added": "2023-08-09",
609
+ "credit": "kannysoap",
610
+ "tags": [
611
+ "Game character",
612
+ "Other Language"
613
+ ]
614
+ },
615
+ {
616
+ "name": "Dr. Doofenshmirtz",
617
+ "url": "https://huggingface.co/Argax/doofenshmirtz-RUS/resolve/main/doofenshmirtz.zip",
618
+ "description": "RUS Dr. Doofenshmirtz from Phineas and Ferb (300 epochs)",
619
+ "added": "2023-08-09",
620
+ "credit": "argaxus",
621
+ "tags": [
622
+ "Other Language"
623
+ ]
624
+ }
625
+ ]
626
+ }
notebooks/ultimate_rvc_colab.ipynb ADDED
@@ -0,0 +1,134 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "kmyCzJVyCymN"
7
+ },
8
+ "source": [
9
+ "Colab for [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)\n",
10
+ "\n",
11
+ "This Colab notebook will **help** you if you don’t have a GPU or if your PC isn’t very powerful.\n",
12
+ "\n",
13
+ "Simply click `Runtime` in the top navigation bar and `Run all`. Wait for the output of the final cell to show the public gradio url and click on it.\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {
20
+ "cellView": "form",
21
+ "id": "TfYDhnzOyig5"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "#@title 0: Initialize notebook\n",
26
+ "from IPython.display import clear_output\n",
27
+ "import threading\n",
28
+ "import time\n",
29
+ "import codecs\n",
30
+ "\n",
31
+ "DEPENDENCIES_PATH = \"./dependencies\"\n",
32
+ "VENV_PATH = f\"{DEPENDENCIES_PATH}/venv\"\n",
33
+ "BIN_PATH = f\"{VENV_PATH}/bin\"\n",
34
+ "\n",
35
+ "\n",
36
+ "def update_timer_and_print():\n",
37
+ " global timer\n",
38
+ " while True:\n",
39
+ " hours, remainder = divmod(timer, 3600)\n",
40
+ " minutes, seconds = divmod(remainder, 60)\n",
41
+ " timer_str = f'{hours:02}:{minutes:02}:{seconds:02}'\n",
42
+ " print(f'\\rTimer: {timer_str} ', end='', flush=True) # Print without a newline\n",
43
+ " time.sleep(1)\n",
44
+ " timer += 1\n",
45
+ "\n",
46
+ "timer = 0\n",
47
+ "threading.Thread(target=update_timer_and_print, daemon=True).start()\n",
48
+ "\n",
49
+ "install_to_drive=False\n",
50
+ "if install_to_drive==True:\n",
51
+ " from google.colab import drive\n",
52
+ " drive.mount('/content/drive')\n",
53
+ " %cd /content/drive/MyDrive\n",
54
+ "else:\n",
55
+ " %cd /content\n",
56
+ "clear_output()"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {
63
+ "cellView": "form",
64
+ "id": "aaokDv1VzpAX"
65
+ },
66
+ "outputs": [],
67
+ "source": [
68
+ "#@title 1: Clone repository\n",
69
+ "cloneing=codecs.decode('uggcf://tvguho.pbz/WnpxvfzlFurcuneq/hygvzngr-eip.tvg','rot_13')\n",
70
+ "\n",
71
+ "!git clone $cloneing HRVC\n",
72
+ "%cd HRVC\n",
73
+ "clear_output()"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {
80
+ "cellView": "form",
81
+ "id": "lVGNygIa0F_1"
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "#@title 2: Install dependencies\n",
86
+ "inits = codecs.decode('./fep/vavg.cl','rot_13')\n",
87
+ "curly = codecs.decode(\"uggcf://uhttvatsnpr.pb/WnpxvfzlFurcuneq/hygvzngr-eip/erfbyir/znva/snvefrd-0.12.2-pc311-pc311-yvahk_k86_64.juy\",\"rot_13\")\n",
88
+ "destiny = codecs.decode('snvefrd-0.12.2-pc311-pc311-yvahk_k86_64.juy','rot_13')\n",
89
+ "\n",
90
+ "!apt install -y python3.11 python3.11-dev python3.11-venv\n",
91
+ "!apt install -y sox libsox-dev ffmpeg\n",
92
+ "\n",
93
+ "!curl -LJ -o $DEPENDENCIES_PATH/$destiny --create-dirs $curly\n",
94
+ "!python3.11 -m venv $VENV_PATH --upgrade-deps\n",
95
+ "\n",
96
+ "! $BIN_PATH/pip install -r requirements.txt\n",
97
+ "! $BIN_PATH/pip install faiss-cpu==1.7.3\n",
98
+ "! $BIN_PATH/python $inits\n",
99
+ "clear_output()"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {
106
+ "cellView": "form",
107
+ "id": "lVGNygIa0F_2"
108
+ },
109
+ "outputs": [],
110
+ "source": [
111
+ "#@title 3: Run Ultimate RVC\n",
112
+ "runpice = codecs.decode('./fep/ncc.cl','rot_13')\n",
113
+ "\n",
114
+ "!$BIN_PATH/python $runpice --share --listen-port 9999"
115
+ ]
116
+ }
117
+ ],
118
+ "metadata": {
119
+ "accelerator": "GPU",
120
+ "colab": {
121
+ "gpuType": "T4",
122
+ "provenance": []
123
+ },
124
+ "kernelspec": {
125
+ "display_name": "Python 3",
126
+ "name": "python3"
127
+ },
128
+ "language_info": {
129
+ "name": "python"
130
+ }
131
+ },
132
+ "nbformat": 4,
133
+ "nbformat_minor": 0
134
+ }
pyproject.toml ADDED
@@ -0,0 +1,105 @@
1
+ [tool.pyright]
2
+ stubPath = "src/typings"
3
+ pythonVersion = "3.11"
4
+ pythonPlatform = "All"
5
+ typeCheckingMode = "strict"
6
+ ignore = ["**/.venv"]
7
+
8
+ [tool.black]
9
+ target-version = ['py311']
10
+ preview = true
11
+ enable-unstable-feature = ["string_processing"]
12
+
13
+ [tool.ruff]
14
+ extend-include = ["*.ipynb"]
15
+ target-version = "py311"
16
+ fix = true
17
+ required-version = ">=0.5.7"
18
+
19
+ [tool.ruff.format]
20
+ docstring-code-format = true
21
+
22
+ [tool.ruff.lint]
23
+ #select = ["ALL"]
24
+ extend-select = ["I"]
25
+ ignore = ["D205", "D203", "D212", "D416"]
26
+ unfixable = ["F401"]
27
+ preview = true
28
+
29
+ [tool.ruff.lint.flake8-annotations]
30
+ ignore-fully-untyped = true
31
+ #suppress-none-returning = true
32
+
33
+ [tool.ruff.lint.flake8-errmsg]
34
+ #max-string-length = 20
35
+
36
+ [tool.ruff.lint.isort]
37
+ relative-imports-order = "closest-to-furthest"
38
+ section-order = [
39
+ "future",
40
+ "typing",
41
+ "standard-library",
42
+ "third-party",
43
+ "networking",
44
+ "data-science",
45
+ "machine-learning",
46
+ "audio",
47
+ "visualisation",
48
+ "first-party",
49
+ "vc",
50
+ "backend",
51
+ "frontend",
52
+ "base",
53
+ "local-folder",
54
+ ]
55
+
56
+ [tool.ruff.lint.isort.sections]
57
+ "typing" = ["typing", "typing_extensions", "typings"]
58
+ "networking" = [
59
+ "requests",
60
+ "yt_dlp",
61
+ "deemix",
62
+ "wget",
63
+ "flask",
64
+ "beautifulsoup4",
65
+ "pypresence",
66
+ ]
67
+ "data-science" = ["numpy", "scipy", "matplotlib", "tqdm", "pandas", "gradio"]
68
+ "machine-learning" = [
69
+ "torch",
70
+ "torchaudio",
71
+ "torchcrepe",
72
+ "fairseq",
73
+ "faiss",
74
+ "tensorboard",
75
+ "torchfcpe",
76
+ "local_attention",
77
+ "libf0",
78
+ "einops",
79
+ "numba",
80
+ ]
81
+ "audio" = [
82
+ "ffmpeg",
83
+ "soundfile",
84
+ "librosa",
85
+ "sox",
86
+ "pydub",
87
+ "pedalboard",
88
+ "audio_separator",
89
+ "parselmouth",
90
+ "pyworld",
91
+ "noisereduce",
92
+ "audio_upscaler",
93
+ "edge_tts",
94
+ "ffmpy",
95
+ ]
96
+ "vc" = ["vc"]
97
+ "backend" = ["backend"]
98
+ "frontend" = ["frontend"]
99
+ "base" = ["common", "app", "cli", "init"]
100
+
101
+ [tool.ruff.lint.pycodestyle]
102
+ max-doc-length = 72
103
+
104
+ [tool.ruff.lint.pylint]
105
+ # max-args = 10
requirements.txt ADDED
@@ -0,0 +1,55 @@
1
+ # General
2
+ lib==4.0.0
3
+
4
+ # Networking
5
+ requests==2.32.3 #NOTE upgraded from 2.32.0
6
+ yt_dlp==2024.8.6
7
+ #TODO add these later
8
+ # deemix
9
+ # wget
10
+ # flask
11
+ # beautifulsoup4
12
+ # pypresence
13
+
14
+ # Data science
15
+ numpy==1.23.5
16
+ scipy~=1.13.0 # NOTE upgraded from 1.11.1
17
+ matplotlib==3.9.0 #NOTE upgraded from 3.7.2
18
+ tqdm==4.65.0 #NOTE upgraded from unspecified
19
+ gradio==4.43.0
20
+
21
+ # Machine learning
22
+ --find-links https://download.pytorch.org/whl/torch_stable.html
23
+ torch==2.1.1+cu121 # NOTE upgraded from 2.0.1+cu118
24
+ torchaudio==2.1.1+cu121
25
+ torchcrepe==0.0.23 # NOTE upgraded from 0.0.20
26
+ ./dependencies/fairseq-0.12.2-cp311-cp311-linux_x86_64.whl; sys_platform == 'linux'
27
+ ./dependencies/fairseq-0.12.3.1-cp311-cp311-win_amd64.whl; sys_platform == 'win32'
28
+ ./dependencies/diffq-0.2.4-cp311-cp311-win_amd64.whl; sys_platform == 'win32'
29
+ tensorboardX
30
+ #TODO add these later
31
+ # faiss-cpu==1.7.3 # NOTE outcommented due to incompatibility on windows
32
+ # tensorboard
33
+ # torchfcpe
34
+ # local-attention
35
+ # libf0
36
+ # einops
37
+ # numba; sys_platform == 'linux'
38
+ # numba==0.57.0; sys_platform == 'darwin' or sys_platform == 'win32'
39
+
40
+ # Audio
41
+ ffmpeg-python>=0.2.0
42
+ soundfile==0.12.1
43
+ librosa >=0.10 # NOTE upgraded from 0.9.2
44
+ sox==1.5.0
45
+ pydub==0.25.1
46
+ pydub-stubs
47
+ pedalboard==0.9.12
48
+ audio-separator[gpu]==0.18.3
49
+ praat-parselmouth>=0.4.2 # NOTE upgraded from unspecified
50
+ pyworld==0.3.4
51
+ #TODO add these later
52
+ # noisereduce
53
+ # audio_upscaler==0.1.4
54
+ # edge-tts==6.1.9
55
+ # ffmpy==0.3.1
src/app.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ Main application for the Ultimate RVC project.
3
+
4
+ Each tab of the application is defined in a separate module
5
+ in the `frontend/tabs` directory.
6
+
7
+ Components that are accessed across multiple tabs are passed as arguments
8
+ to the render functions in the respective modules.
9
+ """
10
+
11
+ import asyncio
12
+ import os
13
+ from argparse import ArgumentParser
14
+
15
+ import gradio as gr
16
+
17
+ from backend.generate_song_cover import get_named_song_dirs
18
+ from backend.manage_audio import delete_gradio_temp_dir, get_output_audio
19
+ from backend.manage_voice_models import get_current_models
20
+
21
+ from frontend.tabs.manage_audio import render as render_manage_audio_tab
22
+ from frontend.tabs.manage_models import render as render_manage_models_tab
23
+ from frontend.tabs.multi_step_generation import render as render_multi_step_tab
24
+ from frontend.tabs.one_click_generation import render as render_one_click_tab
25
+
26
+ from common import GRADIO_TEMP_DIR
27
+
28
+
29
+ def _init_app() -> tuple[gr.Dropdown, ...]:
30
+ """
31
+ Initialize app by deleting any existing Gradio temp directory
32
+ and updating the choices of all dropdowns.
33
+
34
+ Returns
35
+ -------
36
+ tuple[gr.Dropdown, ...]
37
+ Updated dropdowns for selecting voice models, song directories,
38
+ and output audio files.
39
+ """
40
+ delete_gradio_temp_dir()
41
+ updated_rvc_model_dropdowns = tuple(
42
+ gr.Dropdown(choices=get_current_models()) for _ in range(3)
43
+ )
44
+ updated_song_dir_dropdowns = tuple(
45
+ gr.Dropdown(choices=get_named_song_dirs()) for _ in range(10)
46
+ )
47
+ updated_output_audio_dropdown = (gr.Dropdown(choices=get_output_audio()),)
48
+ return (
49
+ updated_rvc_model_dropdowns
50
+ + updated_song_dir_dropdowns
51
+ + updated_output_audio_dropdown
52
+ )
53
+
54
+
55
+ os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR
56
+
57
+ if os.name == "nt":
58
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
59
+
60
+ with gr.Blocks(title="Ultimate RVC") as app:
61
+
62
+ gr.Label("Ultimate RVC ❤️", show_label=False)
63
+
64
+ dummy_deletion_checkbox = gr.Checkbox(visible=False)
65
+ delete_confirmation = gr.State(False)
66
+ song_dir_dropdowns = [
67
+ gr.Dropdown(
68
+ label="Song directory",
69
+ info=(
70
+ "Directory where intermediate audio files are stored and loaded from"
71
+ " locally. When a new song is retrieved, its directory is chosen by"
72
+ " default."
73
+ ),
74
+ render=False,
75
+ )
76
+ for _ in range(7)
77
+ ]
78
+ cached_input_songs_dropdown_1click, cached_input_songs_dropdown_multi = [
79
+ gr.Dropdown(
80
+ label="Song input",
81
+ info="Select a song from the list of cached songs.",
82
+ visible=False,
83
+ render=False,
84
+ )
85
+ for _ in range(2)
86
+ ]
87
+ intermediate_audio_to_delete = gr.Dropdown(
88
+ label="Songs with intermediate audio files",
89
+ multiselect=True,
90
+ info=(
91
+ "Select one or more songs to delete their asssociated intermediate audio"
92
+ " files."
93
+ ),
94
+ render=False,
95
+ )
96
+ output_audio_to_delete = gr.Dropdown(
97
+ label="Output audio files",
98
+ multiselect=True,
99
+ info="Select one or more output audio files to delete.",
100
+ render=False,
101
+ )
102
+ rvc_model_1click, rvc_model_multi = [
103
+ gr.Dropdown(label="Voice model", render=False) for _ in range(2)
104
+ ]
105
+ rvc_models_to_delete = gr.Dropdown(
106
+ label="Voice models", multiselect=True, render=False
107
+ )
108
+
109
+ generate_buttons = [
110
+ gr.Button(label, variant="primary", render=False, scale=scale)
111
+ for label, scale, in [
112
+ ("Retrieve song", 1),
113
+ ("Separate vocals/instrumentals", 1),
114
+ ("Separate main/backup vocals", 1),
115
+ ("De-reverb vocals", 1),
116
+ ("Convert vocals", 1),
117
+ ("Post-process vocals", 1),
118
+ ("Pitch shift background", 1),
119
+ ("Mix song cover", 1),
120
+ ("Generate", 2),
121
+ ]
122
+ ]
123
+
124
+ # main tab
125
+ with gr.Tab("Generate song covers"):
126
+ render_one_click_tab(
127
+ generate_buttons,
128
+ song_dir_dropdowns,
129
+ cached_input_songs_dropdown_1click,
130
+ cached_input_songs_dropdown_multi,
131
+ rvc_model_1click,
132
+ intermediate_audio_to_delete,
133
+ output_audio_to_delete,
134
+ )
135
+ render_multi_step_tab(
136
+ generate_buttons,
137
+ song_dir_dropdowns,
138
+ cached_input_songs_dropdown_1click,
139
+ cached_input_songs_dropdown_multi,
140
+ rvc_model_multi,
141
+ intermediate_audio_to_delete,
142
+ output_audio_to_delete,
143
+ )
144
+ with gr.Tab("Manage models"):
145
+ render_manage_models_tab(
146
+ dummy_deletion_checkbox,
147
+ delete_confirmation,
148
+ rvc_models_to_delete,
149
+ rvc_model_1click,
150
+ rvc_model_multi,
151
+ )
152
+ with gr.Tab("Manage audio"):
153
+
154
+ render_manage_audio_tab(
155
+ dummy_deletion_checkbox,
156
+ delete_confirmation,
157
+ song_dir_dropdowns,
158
+ cached_input_songs_dropdown_1click,
159
+ cached_input_songs_dropdown_multi,
160
+ intermediate_audio_to_delete,
161
+ output_audio_to_delete,
162
+ )
163
+
164
+ app.load(
165
+ _init_app,
166
+ outputs=[
167
+ rvc_model_1click,
168
+ rvc_model_multi,
169
+ rvc_models_to_delete,
170
+ intermediate_audio_to_delete,
171
+ cached_input_songs_dropdown_1click,
172
+ cached_input_songs_dropdown_multi,
173
+ *song_dir_dropdowns,
174
+ output_audio_to_delete,
175
+ ],
176
+ show_progress="hidden",
177
+ )
178
+
179
+ app.unload(delete_gradio_temp_dir)
180
+
181
+
182
+ if __name__ == "__main__":
183
+
184
+ parser = ArgumentParser(
185
+ description="Generate a song cover song in the song_output/id directory.",
186
+ add_help=True,
187
+ )
188
+ parser.add_argument(
189
+ "--share",
190
+ action="store_true",
191
+ dest="share_enabled",
192
+ default=False,
193
+ help="Enable sharing",
194
+ )
195
+ parser.add_argument(
196
+ "--listen",
197
+ action="store_true",
198
+ default=False,
199
+ help="Make the WebUI reachable from your local network.",
200
+ )
201
+ parser.add_argument(
202
+ "--listen-host", type=str, help="The hostname that the server will use."
203
+ )
204
+ parser.add_argument(
205
+ "--listen-port", type=int, help="The listening port that the server will use."
206
+ )
207
+ args = parser.parse_args()
208
+
209
+ app.queue()
210
+ app.launch(
211
+ share=args.share_enabled,
212
+ server_name=None if not args.listen else (args.listen_host or "0.0.0.0"),
213
+ server_port=args.listen_port,
214
+ )
src/backend/common.py ADDED
@@ -0,0 +1,259 @@
1
+ """Common utility functions for the backend."""
2
+
3
+ from typing import Any
4
+ from typings.extra import StrOrBytesPath
5
+
6
+ import hashlib
7
+ import json
8
+ import os
9
+ import shutil
10
+
11
+ import gradio as gr
12
+
13
+ from backend.exceptions import PathNotFoundError
14
+
15
+ from common import AUDIO_DIR, RVC_MODELS_DIR
16
+
17
+ INTERMEDIATE_AUDIO_DIR = os.path.join(AUDIO_DIR, "intermediate")
18
+ OUTPUT_AUDIO_DIR = os.path.join(AUDIO_DIR, "output")
19
+
20
+
21
+ def display_progress(
22
+ message: str,
23
+ percentage: float | None = None,
24
+ progress_bar: gr.Progress | None = None,
25
+ ) -> None:
26
+ """
27
+ Display progress message and percentage in console or Gradio progress bar.
28
+
29
+ Parameters
30
+ ----------
31
+ message : str
32
+ Message to display.
33
+ percentage : float, optional
34
+ Percentage to display.
35
+ progress_bar : gr.Progress, optional
36
+ The Gradio progress bar to update.
37
+ """
38
+ if progress_bar is None:
39
+ print(message)
40
+ else:
41
+ progress_bar(percentage, desc=message)
42
+
43
+
44
+ def remove_suffix_after(text: str, occurrence: str) -> str:
45
+ """
46
+ Remove the suffix after the last occurrence of a substring in a string.
47
+
48
+ Parameters
49
+ ----------
50
+ text : str
51
+ The string to remove the suffix from.
52
+ occurrence : str
53
+ The substring to remove the suffix after.
54
+
55
+ Returns
56
+ -------
57
+ str
58
+ The string with the suffix removed.
59
+ """
60
+ location = text.rfind(occurrence)
61
+ if location == -1:
62
+ return text
63
+ else:
64
+ return text[: location + len(occurrence)]
65
+
66
+
67
+ def copy_files_to_new_folder(file_paths: list[str], folder_path: str) -> None:
68
+ """
69
+ Copy files to a new folder.
70
+
71
+ Parameters
72
+ ----------
73
+ file_paths : list[str]
74
+ List of file paths to copy.
75
+ folder_path : str
76
+ Path of the folder to copy the files to.
77
+
78
+ Raises
79
+ ------
80
+ PathNotFoundError
81
+ If a file does not exist.
82
+ """
83
+ os.makedirs(folder_path)
84
+ for file_path in file_paths:
85
+ if not os.path.exists(file_path):
86
+ raise PathNotFoundError(f"File not found: {file_path}")
87
+ shutil.copyfile(
88
+ file_path, os.path.join(folder_path, os.path.basename(file_path))
89
+ )
90
+
91
+
92
+ def get_path_stem(path: str) -> str:
93
+ """
94
+ Get the stem of a file path.
95
+
96
+ The stem is the name of the file that the path points to,
97
+ not including its extension.
98
+
99
+ Parameters
100
+ ----------
101
+ path : str
102
+ The file path.
103
+
104
+ Returns
105
+ -------
106
+ str
107
+ The stem of the file path.
108
+ """
109
+ return os.path.splitext(os.path.basename(path))[0]
110
+
111
+
112
+ def json_dumps(thing: Any) -> str:
113
+ """
114
+ Dump a Python object to a JSON string.
115
+
116
+ Parameters
117
+ ----------
118
+ thing : Any
119
+ The object to dump.
120
+
121
+ Returns
122
+ -------
123
+ str
124
+ The JSON string representation of the object.
125
+ """
126
+ return json.dumps(
127
+ thing, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ": ")
128
+ )
129
+
130
+
131
+ def json_dump(thing: Any, path: StrOrBytesPath) -> None:
132
+ """
133
+ Dump a Python object to a JSON file.
134
+
135
+ Parameters
136
+ ----------
137
+ thing : Any
138
+ The object to dump.
139
+ path : str
140
+ The path of the JSON file.
141
+ """
142
+ with open(path, "w", encoding="utf-8") as file:
143
+ json.dump(
144
+ thing,
145
+ file,
146
+ ensure_ascii=False,
147
+ sort_keys=True,
148
+ indent=4,
149
+ separators=(",", ": "),
150
+ )
151
+
152
+
153
+ def json_load(path: StrOrBytesPath, encoding: str = "utf-8") -> Any:
154
+ """
155
+ Load a Python object from a JSON file.
156
+
157
+ Parameters
158
+ ----------
159
+ path : str
160
+ The path of the JSON file.
161
+ encoding : str, default='utf-8'
162
+ The encoding of the file.
163
+
164
+ Returns
165
+ -------
166
+ Any
167
+ The Python object loaded from the JSON file.
168
+ """
169
+ with open(path, encoding=encoding) as file:
170
+ return json.load(file)
171
+
172
+
173
+ def get_hash(thing: Any, size: int = 5) -> str:
174
+ """
175
+ Get a hash of a Python object.
176
+
177
+ Parameters
178
+ ----------
179
+ thing : Any
180
+ The object to hash.
181
+ size : int, default=5
182
+ The size of the hash in bytes.
183
+
184
+ Returns
185
+ -------
186
+ str
187
+ The hash of the object.
188
+ """
189
+ return hashlib.blake2b(
190
+ json_dumps(thing).encode("utf-8"), digest_size=size
191
+ ).hexdigest()
192
+
193
+
194
+ # TODO consider increasing size to 16
195
+ # otherwise we might have problems with hash collisions
196
+ def get_file_hash(filepath: StrOrBytesPath, size: int = 5) -> str:
197
+ """
198
+ Get the hash of a file.
199
+
200
+ Parameters
201
+ ----------
202
+ filepath : str
203
+ The path of the file.
204
+ size : int, default=5
205
+ The size of the hash in bytes.
206
+
207
+ Returns
208
+ -------
209
+ str
210
+ The hash of the file.
211
+ """
212
+ with open(filepath, "rb") as f:
213
+ file_hash = hashlib.file_digest(f, lambda: hashlib.blake2b(digest_size=size))
214
+ return file_hash.hexdigest()
215
+
216
+
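# A minimal sketch of how get_hash and get_file_hash can be combined into the
# cache keys used by the pipeline modules. Assumes Python 3.11+ (for
# hashlib.file_digest) and that src/ is on the import path; "some_song.wav" is
# a hypothetical file.
from backend.common import get_file_hash, get_hash

arg_dict = {
    "input-files": [
        {"name": "some_song.wav", "hash": get_file_hash("some_song.wav")}
    ],
    "pitch-shift": 2,
}
cache_key = get_hash(arg_dict)  # 5-byte blake2b digest of the sorted JSON dump
print(cache_key)  # 10 hex characters, e.g. "3f2a1b09c0"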
217
+ def get_rvc_model(voice_model: str) -> tuple[str, str]:
218
+ """
219
+ Get the RVC model file and optional index file for a voice model.
220
+
221
+ When no index file exists, an empty string is returned.
222
+
223
+ Parameters
224
+ ----------
225
+ voice_model : str
226
+ The name of the voice model.
227
+
228
+ Returns
229
+ -------
230
+ model_path : str
231
+ The path of the RVC model file.
232
+ index_path : str
233
+ The path of the RVC index file.
234
+
235
+ Raises
236
+ ------
237
+ PathNotFoundError
238
+ If the directory of the voice model does not exist or
239
+ if no model file exists in the directory.
240
+ """
241
+ rvc_model_filename, rvc_index_filename = None, None
242
+ model_dir = os.path.join(RVC_MODELS_DIR, voice_model)
243
+ if not os.path.exists(model_dir):
244
+ raise PathNotFoundError(
245
+ f"Voice model directory '{voice_model}' does not exist."
246
+ )
247
+ for file in os.listdir(model_dir):
248
+ ext = os.path.splitext(file)[1]
249
+ if ext == ".pth":
250
+ rvc_model_filename = file
251
+ if ext == ".index":
252
+ rvc_index_filename = file
253
+
254
+ if rvc_model_filename is None:
255
+ raise PathNotFoundError(f"No model file exists in {model_dir}.")
256
+
257
+ return os.path.join(model_dir, rvc_model_filename), (
258
+ os.path.join(model_dir, rvc_index_filename) if rvc_index_filename else ""
259
+ )
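# Hedged usage sketch for get_rvc_model: it returns the .pth file and the
# optional .index file of a voice model stored under models/rvc. "MyVoice" is
# a hypothetical model name.
from backend.common import get_rvc_model

model_path, index_path = get_rvc_model("MyVoice")
if not index_path:
    print(f"{model_path} has no index file; conversion will run without one.")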
src/backend/exceptions.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ This module contains custom exceptions that are raised by the backend.
3
+ """
4
+
5
+
6
+ class InputMissingError(ValueError):
7
+ """
8
+ Raised when an input is missing.
9
+ """
10
+
11
+ pass
12
+
13
+
14
+ class InvalidPathError(OSError):
15
+ """
16
+ Raised when a path is invalid.
17
+ """
18
+
19
+ pass
20
+
21
+
22
+ class PathNotFoundError(OSError):
23
+ """
24
+ Raised when a path is not found.
25
+ """
26
+
27
+ pass
28
+
29
+
30
+ class PathExistsError(OSError):
31
+ """
32
+ Raised when a path already exists.
33
+ """
34
+
35
+ pass
36
+
37
+
38
+ class FileTypeError(ValueError):
39
+ """
40
+ Raised when a file is of the wrong type.
41
+ """
42
+
43
+ pass
src/backend/generate_song_cover.py ADDED
@@ -0,0 +1,1679 @@
1
+ """
2
+ This module contains functions to generate song covers using RVC-based voice models.
3
+ """
4
+
5
+ from typing import Any
6
+ from typings.extra import F0Method, InputAudioExt, InputType, OutputAudioExt
7
+
8
+ import gc
9
+ import glob
10
+ import os
11
+ import shlex
12
+ import shutil
13
+ import subprocess
14
+ from contextlib import suppress
15
+ from logging import WARNING
16
+ from pathlib import Path, PurePath
17
+ from urllib.parse import parse_qs, urlparse
18
+
19
+ import yt_dlp
20
+
21
+ import gradio as gr
22
+
23
+ import soundfile as sf
24
+ import sox
25
+ from audio_separator.separator import Separator
26
+ from pedalboard import Compressor, HighpassFilter, Reverb
27
+ from pedalboard._pedalboard import Pedalboard
28
+ from pedalboard.io import AudioFile
29
+ from pydub import AudioSegment
30
+ from pydub import utils as pydub_utils
31
+
32
+ from vc.rvc import Config, get_vc, load_hubert, rvc_infer
33
+
34
+ from backend.common import (
35
+ INTERMEDIATE_AUDIO_DIR,
36
+ OUTPUT_AUDIO_DIR,
37
+ display_progress,
38
+ get_file_hash,
39
+ get_hash,
40
+ get_path_stem,
41
+ get_rvc_model,
42
+ json_dump,
43
+ json_load,
44
+ )
45
+ from backend.exceptions import InputMissingError, InvalidPathError, PathNotFoundError
46
+
47
+ from common import RVC_MODELS_DIR, SEPARATOR_MODELS_DIR
48
+
49
+ SEPARATOR = Separator(
50
+ log_level=WARNING,
51
+ model_file_dir=SEPARATOR_MODELS_DIR,
52
+ output_dir=INTERMEDIATE_AUDIO_DIR,
53
+ mdx_params={
54
+ "hop_length": 1024,
55
+ "segment_size": 256,
56
+ "overlap": 0.001,
57
+ "batch_size": 1,
58
+ "enable_denoise": False,
59
+ },
60
+ mdxc_params={"segment_size": 256, "batch_size": 1, "overlap": 2},
61
+ )
62
+
63
+
64
+ def _get_youtube_video_id(url: str, ignore_playlist: bool = True) -> str | None:
65
+ """
66
+ Get video id from a YouTube URL.
67
+
68
+ Parameters
69
+ ----------
70
+ url : str
71
+ The YouTube URL.
72
+ ignore_playlist : bool, default=True
73
+ Whether to get id of first video in playlist or the playlist id itself.
74
+
75
+ Returns
76
+ -------
77
+ str | None
78
+ The video id, or None if it cannot be determined from the URL.
79
+ """
80
+ query = urlparse(url)
81
+ if query.hostname == "youtu.be":
82
+ if query.path[1:] == "watch":
83
+ return query.query[2:]
84
+ return query.path[1:]
85
+
86
+ if query.hostname in {"www.youtube.com", "youtube.com", "music.youtube.com"}:
87
+ if not ignore_playlist:
88
+ # use case: get playlist id not current video in playlist
89
+ with suppress(KeyError):
90
+ return parse_qs(query.query)["list"][0]
91
+ if query.path == "/watch":
92
+ return parse_qs(query.query)["v"][0]
93
+ if query.path[:7] == "/watch/":
94
+ return query.path.split("/")[1]
95
+ if query.path[:7] == "/embed/":
96
+ return query.path.split("/")[2]
97
+ if query.path[:3] == "/v/":
98
+ return query.path.split("/")[2]
99
+ return None
100
+
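# Quick behavioural sketch for the URL parser above; all URLs are illustrative.
assert _get_youtube_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert _get_youtube_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert _get_youtube_video_id("https://example.com/watch?v=abc") is None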
101
+
102
+ def _yt_download(link: str, song_dir: str) -> str:
103
+ """
104
+ Download audio from a YouTube link.
105
+
106
+ Parameters
107
+ ----------
108
+ link : str
109
+ The YouTube link.
110
+ song_dir : str
111
+ The directory to save the downloaded audio to.
112
+
113
+ Returns
114
+ -------
115
+ str
116
+ The path to the downloaded audio file.
117
+ """
118
+ outtmpl = os.path.join(song_dir, "0_%(title)s_Original")
119
+ ydl_opts = {
120
+ "quiet": True,
121
+ "no_warnings": True,
122
+ "format": "bestaudio",
123
+ "outtmpl": outtmpl,
124
+ "ignoreerrors": True,
125
+ "nocheckcertificate": True,
126
+ "postprocessors": [
127
+ {
128
+ "key": "FFmpegExtractAudio",
129
+ "preferredcodec": "wav",
130
+ "preferredquality": 0,
131
+ }
132
+ ],
133
+ }
134
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
135
+ result = ydl.extract_info(link, download=True)
136
+ if not result:
137
+ raise PathNotFoundError("No audio found in the provided YouTube link!")
138
+ download_path = ydl.prepare_filename(result, outtmpl=f"{outtmpl}.wav")
139
+
140
+ return download_path
141
+
142
+
143
+ def _get_input_audio_paths() -> list[str]:
144
+ """
145
+ Get the paths of all cached input audio files.
146
+
147
+ Returns
148
+ -------
149
+ list[str]
150
+ The paths of all cached input audio files
151
+ """
152
+ # TODO if we later add .json file for input then we need to exclude those here
153
+ return glob.glob(os.path.join(INTERMEDIATE_AUDIO_DIR, "*", "0_*_Original*"))
154
+
155
+
156
+ def _get_input_audio_path(song_dir: str) -> str | None:
157
+ """
158
+ Get the path of the cached input audio file in a given song directory.
159
+
160
+ Parameters
161
+ ----------
162
+ song_dir : str
163
+ The path to a song directory.
164
+
165
+ Returns
166
+ -------
167
+ str | None
168
+ The path of the cached input audio file, or None if no such file exists.
169
+ """
170
+ # NOTE orig_song_paths should never contain more than one element
171
+ return next(iter(glob.glob(os.path.join(song_dir, "0_*_Original*"))), None)
172
+
173
+
174
+ def _pitch_shift(audio_path: str, output_path: str, n_semi_tones: int) -> None:
175
+ """
176
+ Pitch-shift an audio file.
177
+
178
+ Parameters
179
+ ----------
180
+ audio_path : str
181
+ The path of the audio file to pitch-shift.
182
+ output_path : str
183
+ The path to save the pitch-shifted audio file to.
184
+ n_semi_tones : int
185
+ The number of semi-tones to pitch-shift the audio by.
186
+ """
187
+ y, sr = sf.read(audio_path)
188
+ tfm = sox.Transformer()
189
+ tfm.pitch(n_semi_tones)
190
+ y_shifted = tfm.build_array(input_array=y, sample_rate_in=sr)
191
+ sf.write(output_path, y_shifted, sr)
192
+
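# Minimal sketch of the sox-based pitch shift above; "vocals.wav" is a
# hypothetical input and 12 semitones corresponds to shifting up one octave.
_pitch_shift("vocals.wav", "vocals_up_one_octave.wav", 12)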
193
+
194
+ # TODO consider increasing hash_size to 16
195
+ # otherwise we might have problems with hash collisions
196
+ # when using app as CLI
197
+ def _get_unique_base_path(
198
+ song_dir: str,
199
+ prefix: str,
200
+ arg_dict: dict[str, Any],
201
+ progress_bar: gr.Progress | None = None,
202
+ percentage: float = 0.0,
203
+ hash_size: int = 5,
204
+ ) -> str:
205
+ """
206
+ Get a unique base path for an audio file in a song directory
207
+ by hashing the arguments used to generate the audio.
208
+
209
+ Parameters
210
+ ----------
211
+ song_dir : str
212
+ The path to a song directory.
213
+ prefix : str
214
+ The prefix to use for the base path.
215
+ arg_dict : dict
216
+ The dictionary of arguments used to generate the audio in the given file.
217
+ progress_bar : gr.Progress, optional
218
+ Gradio progress bar to update.
219
+ percentage : float, default=0.0
220
+ Percentage to display in the progress bar.
221
+ hash_size : int, default=5
222
+ The size (in bytes) of the hash to use for the base path.
223
+
224
+ Returns
225
+ -------
226
+ str
227
+ The unique base path for the audio file.
228
+ """
229
+ dict_hash = get_hash(arg_dict, size=hash_size)
230
+ while True:
231
+ base_path = os.path.join(song_dir, f"{prefix}_{dict_hash}")
232
+ json_path = f"{base_path}.json"
233
+ if os.path.exists(json_path):
234
+ file_dict = json_load(json_path)
235
+ if file_dict == arg_dict:
236
+ return base_path
237
+ display_progress("[~] Rehashing...", percentage, progress_bar)
238
+ dict_hash = get_hash(dict_hash, size=hash_size)
239
+ else:
240
+ return base_path
241
+
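# Sketch of the caching convention built on _get_unique_base_path: each stage
# writes its output to <base>.wav and the arguments that produced it to
# <base>.json, so a later call with identical arguments can reuse the file.
# The directory, file name and hash below are hypothetical.
arg_dict = {
    "input-files": [{"name": "1_Vocals.wav", "hash": "ab12cd34ef"}],
    "pitch-shift": 0,
}
base = _get_unique_base_path(
    "audio/intermediate/some_song", "6_Instrumental_Shifted", arg_dict
)
wav_path, json_path = f"{base}.wav", f"{base}.json"
if not (os.path.exists(wav_path) and os.path.exists(json_path)):
    ...  # generate wav_path here, then call json_dump(arg_dict, json_path)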
242
+
243
+ def _convert_voice(
244
+ voice_model: str,
245
+ voice_path: str,
246
+ output_path: str,
247
+ pitch_change: int,
248
+ f0_method: F0Method,
249
+ index_rate: float,
250
+ filter_radius: int,
251
+ rms_mix_rate: float,
252
+ protect: float,
253
+ crepe_hop_length: int,
254
+ output_sr: int,
255
+ ) -> None:
256
+ """
257
+ Convert a voice track using a voice model.
258
+
259
+ Parameters
260
+ ----------
261
+ voice_model : str
262
+ The name of the voice model to use.
263
+ voice_path : str
264
+ The path to the voice track to convert.
265
+ output_path : str
266
+ The path to save the converted voice to.
267
+ pitch_change : int
268
+ The number of semi-tones to pitch-shift the converted voice by.
269
+ f0_method : F0Method
270
+ The method to use for pitch extraction.
271
+ index_rate : float
272
+ The influence of index file on voice conversion.
273
+ filter_radius : int
274
+ The filter radius to use for the voice conversion.
275
+ rms_mix_rate : float
276
+ The blending rate of the volume envelope of converted voice.
277
+ protect : float
278
+ The protection rate for consonants and breathing sounds.
279
+ crepe_hop_length : int
280
+ The hop length to use for Crepe pitch extraction method.
281
+ output_sr : int
282
+ The sample rate to use for the output audio.
283
+ """
284
+ rvc_model_path, rvc_index_path = get_rvc_model(voice_model)
285
+ device = "cuda:0"
286
+ config = Config(device, True)
287
+ hubert_model = load_hubert(
288
+ device, config.is_half, os.path.join(RVC_MODELS_DIR, "hubert_base.pt")
289
+ )
290
+ cpt, version, net_g, tgt_sr, vc = get_vc(
291
+ device, config.is_half, config, rvc_model_path
292
+ )
293
+
294
+ # convert main vocals
295
+ rvc_infer(
296
+ rvc_index_path,
297
+ index_rate,
298
+ voice_path,
299
+ output_path,
300
+ pitch_change,
301
+ f0_method,
302
+ cpt,
303
+ version,
304
+ net_g,
305
+ filter_radius,
306
+ tgt_sr,
307
+ rms_mix_rate,
308
+ protect,
309
+ crepe_hop_length,
310
+ vc,
311
+ hubert_model,
312
+ output_sr,
313
+ )
314
+ del hubert_model, cpt
315
+ gc.collect()
316
+
317
+
318
+ def _add_audio_effects(
319
+ audio_path: str,
320
+ output_path: str,
321
+ reverb_rm_size: float,
322
+ reverb_wet: float,
323
+ reverb_dry: float,
324
+ reverb_damping: float,
325
+ ) -> None:
326
+ """
327
+ Add high-pass filter, compressor and reverb effects to an audio file.
328
+
329
+ Parameters
330
+ ----------
331
+ audio_path : str
332
+ The path of the audio file to add effects to.
333
+ output_path : str
334
+ The path to save the effected audio file to.
335
+ reverb_rm_size : float
336
+ The room size of the reverb effect.
337
+ reverb_wet : float
338
+ The wet level of the reverb effect.
339
+ reverb_dry : float
340
+ The dry level of the reverb effect.
341
+ reverb_damping : float
342
+ The damping of the reverb effect.
343
+ """
344
+ board = Pedalboard(
345
+ [
346
+ HighpassFilter(),
347
+ Compressor(ratio=4, threshold_db=-15),
348
+ Reverb(
349
+ room_size=reverb_rm_size,
350
+ dry_level=reverb_dry,
351
+ wet_level=reverb_wet,
352
+ damping=reverb_damping,
353
+ ),
354
+ ]
355
+ )
356
+
357
+ with AudioFile(audio_path) as f:
358
+ with AudioFile(output_path, "w", f.samplerate, f.num_channels) as o:
359
+ # Read one second of audio at a time, until the file is empty:
360
+ while f.tell() < f.frames:
361
+ chunk = f.read(int(f.samplerate))
362
+ effected = board(chunk, f.samplerate, reset=False)
363
+ o.write(effected)
364
+
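# Hedged example of the effect chain above, using the same defaults that
# postprocess_vocals passes further down; "vocals.wav" is a hypothetical input.
_add_audio_effects(
    "vocals.wav",
    "vocals_postprocessed.wav",
    reverb_rm_size=0.15,
    reverb_wet=0.2,
    reverb_dry=0.8,
    reverb_damping=0.7,
)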
365
+
366
+ def _map_audio_ext(input_audio_ext: InputAudioExt) -> OutputAudioExt:
367
+ """
368
+ Map an input audio extension to an output audio extension.
369
+
370
+ Parameters
371
+ ----------
372
+ input_audio_ext : InputAudioExt
373
+ The input audio extension.
374
+
375
+ Returns
376
+ -------
377
+ OutputAudioExt
378
+ The output audio extension.
379
+ """
380
+ match input_audio_ext:
381
+ case "m4a":
382
+ return "ipod"
383
+ case "aac":
384
+ return "adts"
385
+ case _:
386
+ return input_audio_ext
387
+
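# pydub/ffmpeg use container names that differ from the user-facing extension
# for two formats; a quick check of the mapping above:
assert _map_audio_ext("m4a") == "ipod"
assert _map_audio_ext("aac") == "adts"
assert _map_audio_ext("mp3") == "mp3"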
388
+
389
+ def _mix_audio(
390
+ main_vocal_path: str,
391
+ backup_vocal_path: str,
392
+ instrumental_path: str,
393
+ main_gain: int,
394
+ backup_gain: int,
395
+ inst_gain: int,
396
+ output_format: InputAudioExt,
397
+ output_sr: int,
398
+ output_path: str,
399
+ ) -> None:
400
+ """
401
+ Mix main vocals, backup vocals and instrumentals.
402
+
403
+ Parameters
404
+ ----------
405
+ main_vocal_path : str
406
+ The path of an audio file containing main vocals.
407
+ backup_vocal_path : str
408
+ The path of an audio file containing backup vocals.
409
+ instrumental_path : str
410
+ The path of an audio file containing instrumentals.
411
+ main_gain : int
412
+ The gain to apply to the main vocals.
413
+ backup_gain : int
414
+ The gain to apply to the backup vocals.
415
+ inst_gain : int
416
+ The gain to apply to the instrumental.
417
+ output_format : InputAudioExt
418
+ The format to save the mixed audio file in.
419
+ output_sr : int
420
+ The sample rate to use for the mixed audio file.
421
+ output_path : str
422
+ The path to save the mixed audio file to.
423
+ """
424
+ main_vocal_audio = AudioSegment.from_wav(main_vocal_path) + main_gain
425
+ backup_vocal_audio = AudioSegment.from_wav(backup_vocal_path) + backup_gain
426
+ instrumental_audio = AudioSegment.from_wav(instrumental_path) + inst_gain
427
+ combined_audio = main_vocal_audio.overlay(backup_vocal_audio).overlay(
428
+ instrumental_audio
429
+ )
430
+ combined_audio_resampled = combined_audio.set_frame_rate(output_sr)
431
+ mapped_output_format = _map_audio_ext(output_format)
432
+ combined_audio_resampled.export(output_path, format=mapped_output_format)
433
+
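# Minimal sketch of the mixdown step; all paths are hypothetical and the gains
# are decibel offsets applied via pydub's AudioSegment addition operator.
_mix_audio(
    "main_vocals.wav",
    "backup_vocals.wav",
    "instrumental.wav",
    main_gain=0,
    backup_gain=-3,
    inst_gain=0,
    output_format="mp3",
    output_sr=44100,
    output_path="mixdown.mp3",
)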
434
+
435
+ def get_named_song_dirs() -> list[tuple[str, str]]:
436
+ """
437
+ Get the names and paths of all song directories.
438
+
439
+ Returns
440
+ -------
441
+ list[tuple[str, str]]
442
+ A list of tuples containing the name and path of each song directory.
443
+ """
444
+ input_paths = _get_input_audio_paths()
445
+ named_song_dirs: list[tuple[str, str]] = []
446
+
447
+ for path in input_paths:
448
+ song_dir, song_basename = os.path.split(path)
449
+ song_name = (
450
+ os.path.splitext(song_basename)[0]
451
+ .removeprefix("0_")
452
+ .removesuffix("_Original")
453
+ )
454
+ named_song_dirs.append((song_name, song_dir))
455
+ return sorted(named_song_dirs, key=lambda x: x[0])
456
+
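# Illustrative use: list every cached song and its directory, e.g. to populate
# a dropdown in the frontend (output values depend on what has been cached).
for name, directory in get_named_song_dirs():
    print(f"{name} -> {directory}")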
457
+
458
+ def convert_to_stereo(
459
+ song_path: str,
460
+ song_dir: str,
461
+ progress_bar: gr.Progress | None = None,
462
+ percentage: float = 0.0,
463
+ ) -> str:
464
+ """
465
+ Convert an audio file to stereo.
466
+
467
+ Parameters
468
+ ----------
469
+ song_path : str
470
+ The path to the audio file to convert.
471
+ song_dir : str
472
+ The path to the directory where the stereo audio file will be saved.
473
+ progress_bar : gr.Progress, optional
474
+ Gradio progress bar to update.
475
+ percentage : float, default=0.0
476
+ Percentage to display in the progress bar.
477
+
478
+ Returns
479
+ -------
480
+ str
481
+ The path to the stereo audio file.
482
+
483
+ Raises
484
+ ------
485
+ InputMissingError
486
+ If no audio file or song directory path is provided.
487
+ PathNotFoundError
488
+ If the provided audio file or song directory path does not point
489
+ to an existing file or directory.
490
+ """
491
+ if not song_path:
492
+ raise InputMissingError("Input song missing!")
493
+ if not os.path.isfile(song_path):
494
+ raise PathNotFoundError("Input song does not exist!")
495
+ if not song_dir:
496
+ raise InputMissingError("Song directory missing!")
497
+ if not os.path.isdir(song_dir):
498
+ raise PathNotFoundError("Song directory does not exist!")
499
+
500
+ stereo_path = song_path
501
+
502
+ song_info = pydub_utils.mediainfo(song_path)
503
+ if song_info["channels"] == "1":
504
+ arg_dict = {
505
+ "input-files": [
506
+ {"name": os.path.basename(song_path), "hash": get_file_hash(song_path)}
507
+ ],
508
+ }
509
+ stereo_path_base = _get_unique_base_path(
510
+ song_dir, "0_Stereo", arg_dict, progress_bar, percentage
511
+ )
512
+ stereo_path = f"{stereo_path_base}.wav"
513
+ stereo_json_path = f"{stereo_path_base}.json"
514
+ if not (os.path.exists(stereo_path) and os.path.exists(stereo_json_path)):
515
+ display_progress(
516
+ "[~] Converting song to stereo...", percentage, progress_bar
517
+ )
518
+ command = shlex.split(
519
+ f'ffmpeg -y -loglevel error -i "{song_path}" -ac 2 -f wav'
520
+ f' "{stereo_path}"'
521
+ )
522
+ subprocess.run(command)
523
+ json_dump(arg_dict, stereo_json_path)
524
+
525
+ return stereo_path
526
+
527
+
528
+ def _make_song_dir(
529
+ song_input: str, progress_bar: gr.Progress | None = None, percentage: float = 0.0
530
+ ) -> tuple[str, InputType]:
531
+ """
532
+ Create a song directory for a given song input.
533
+
534
+ * If the song input is a YouTube URL,
535
+ the song directory will be named after the video id.
536
+ * If the song input is a local audio file,
537
+ the song directory will be named after the file hash.
538
+ * If the song input is a song directory,
539
+ the song directory will be used as is.
540
+
541
+ Parameters
542
+ ----------
543
+ song_input : str
544
+ The song input to create a directory for.
545
+ progress_bar : gr.Progress, optional
546
+ Gradio progress bar to update.
547
+ percentage : float, default=0.0
548
+ Percentage to display in the progress bar.
549
+
550
+ Returns
551
+ -------
552
+ song_dir : str
553
+ The path to the created song directory.
554
+ input_type : InputType
555
+ The type of input provided.
556
+
557
+ Raises
558
+ ------
559
+ InputMissingError
560
+ If no song input is provided.
561
+ InvalidPathError
562
+ If the provided YouTube URL is invalid or if the provided song directory
563
+ is not located in the root of the intermediate audio directory.
564
+ PathNotFoundError
565
+ If the provided song input is neither a valid HTTPS-based URL
566
+ nor the path of an existing song directory or audio file.
567
+ """
568
+ # if song directory
569
+ if os.path.isdir(song_input):
570
+ if not PurePath(song_input).parent == PurePath(INTERMEDIATE_AUDIO_DIR):
571
+ raise InvalidPathError(
572
+ "Song directory not located in the root of the intermediate audio"
573
+ " directory."
574
+ )
575
+ display_progress(
576
+ "[~] Using existing song directory...", percentage, progress_bar
577
+ )
578
+ input_type = "local"
579
+ return song_input, input_type
580
+
581
+ display_progress("[~] Creating song directory...", percentage, progress_bar)
582
+ # if youtube url
583
+ if urlparse(song_input).scheme == "https":
584
+ input_type = "yt"
585
+ song_id = _get_youtube_video_id(song_input)
586
+ if song_id is None:
587
+ raise InvalidPathError("Invalid YouTube URL!")
588
+ # if local audio file
589
+ elif os.path.isfile(song_input):
590
+ input_type = "local"
591
+ song_id = get_file_hash(song_input)
592
+ else:
593
+ raise PathNotFoundError(f"Song input {song_input} does not exist.")
594
+
595
+ song_dir = os.path.join(INTERMEDIATE_AUDIO_DIR, song_id)
596
+
597
+ Path(song_dir).mkdir(parents=True, exist_ok=True)
598
+
599
+ return song_dir, input_type
600
+
601
+
602
+ def retrieve_song(
603
+ song_input: str,
604
+ progress_bar: gr.Progress | None = None,
605
+ percentages: tuple[float, float, float] = (0, 0.33, 0.67),
606
+ ) -> tuple[str, str]:
607
+ """
608
+ Retrieve a song from a YouTube URL, local audio file or a song directory.
609
+
610
+ Parameters
611
+ ----------
612
+ song_input : str
613
+ A YouTube URL, the path of a local audio file
614
+ or the path of a song directory.
615
+ progress_bar : gr.Progress, optional
616
+ Gradio progress bar to update.
617
+ percentages : tuple[float,float,float], default=(0, 0.33, 0.67)
618
+ Percentages to display in the progress bar.
619
+
620
+ Returns
621
+ -------
622
+ song_path : str
623
+ The path to the retrieved audio file
624
+ song_dir : str
625
+ The path to the song directory containing it.
626
+
627
+ Raises
628
+ ------
629
+ InputMissingError
630
+ If no song input is provided.
631
+ InvalidPathError
632
+ If the provided Youtube URL is invalid or if the provided song directory
633
+ is not located in the root of the intermediate audio directory.
634
+ PathNotFoundError
635
+ If the provided song input is neither a valid HTTPS-based URL
636
+ nor the path of an existing song directory or audio file.
637
+ """
638
+ if not song_input:
639
+ raise InputMissingError(
640
+ "Song input missing! Please provide a valid YouTube url, local audio file"
641
+ " path or cached song directory path."
642
+ )
643
+
644
+ song_dir, input_type = _make_song_dir(song_input, progress_bar, percentages[0])
645
+ orig_song_path = _get_input_audio_path(song_dir)
646
+
647
+ if not orig_song_path:
648
+ if input_type == "yt":
649
+ display_progress("[~] Downloading song...", percentages[1], progress_bar)
650
+ song_link = song_input.split("&")[0]
651
+ orig_song_path = _yt_download(song_link, song_dir)
652
+ else:
653
+ display_progress("[~] Copying song...", percentages[1], progress_bar)
654
+ song_input_base = os.path.basename(song_input)
655
+ song_input_name, song_input_ext = os.path.splitext(song_input_base)
656
+ orig_song_name = f"0_{song_input_name}_Original"
657
+ orig_song_path = os.path.join(song_dir, orig_song_name + song_input_ext)
658
+ shutil.copyfile(song_input, orig_song_path)
659
+
660
+ stereo_path = convert_to_stereo(
661
+ orig_song_path, song_dir, progress_bar, percentages[2]
662
+ )
663
+ return stereo_path, song_dir
664
+
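# Sketch of the first pipeline stage: fetch (or reuse) the input audio and get
# back both the stereo wav and the song directory that caches later stages.
# The URL is illustrative.
song_path, song_dir = retrieve_song("https://www.youtube.com/watch?v=dQw4w9WgXcQ")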
665
+
666
+ def separate_vocals(
667
+ song_path: str,
668
+ song_dir: str,
669
+ stereofy: bool = True,
670
+ progress_bar: gr.Progress | None = None,
671
+ percentages: tuple[float, float] = (0.0, 0.5),
672
+ ) -> tuple[str, str]:
673
+ """
674
+ Separate a song into vocals and instrumentals.
675
+
676
+ Parameters
677
+ ----------
678
+ song_path : str
679
+ The path to the song to separate.
680
+ song_dir : str
681
+ The path to the song directory where the
682
+ separated vocals and instrumentals will be saved.
683
+ stereofy : bool, default=True
684
+ Whether to convert the song to stereo
685
+ before separating its vocals and instrumentals.
686
+ progress_bar : gr.Progress, optional
687
+ Gradio progress bar to update.
688
+ percentages : tuple[float,float], default=(0.0, 0.5)
689
+ Percentages to display in the progress bar.
690
+
691
+ Returns
692
+ -------
693
+ vocals_path : str
694
+ The path to the separated vocals.
695
+ instrumentals_path : str
696
+ The path to the separated instrumentals.
697
+
698
+ Raises
699
+ ------
700
+ InputMissingError
701
+ If no song path or song directory path is provided.
702
+ PathNotFoundError
703
+ If the provided song path or song directory path does not point
704
+ to an existing file or directory.
705
+ """
706
+ if not song_path:
707
+ raise InputMissingError("Input song missing!")
708
+ if not os.path.isfile(song_path):
709
+ raise PathNotFoundError("Input song does not exist!")
710
+ if not song_dir:
711
+ raise InputMissingError("Song directory missing!")
712
+ if not os.path.isdir(song_dir):
713
+ raise PathNotFoundError("Song directory does not exist!")
714
+
715
+ song_path = (
716
+ convert_to_stereo(song_path, song_dir, progress_bar, percentages[0])
717
+ if stereofy
718
+ else song_path
719
+ )
720
+
721
+ arg_dict = {
722
+ "input-files": [
723
+ {"name": os.path.basename(song_path), "hash": get_file_hash(song_path)}
724
+ ],
725
+ }
726
+
727
+ vocals_path_base = _get_unique_base_path(
728
+ song_dir, "1_Vocals", arg_dict, progress_bar, percentages[1]
729
+ )
730
+
731
+ instrumentals_path_base = _get_unique_base_path(
732
+ song_dir, "1_Instrumental", arg_dict, progress_bar, percentages[1]
733
+ )
734
+
735
+ vocals_path = f"{vocals_path_base}.wav"
736
+ vocals_json_path = f"{vocals_path_base}.json"
737
+ instrumentals_path = f"{instrumentals_path_base}.wav"
738
+ instrumentals_json_path = f"{instrumentals_path_base}.json"
739
+
740
+ if not (
741
+ os.path.exists(vocals_path)
742
+ and os.path.exists(vocals_json_path)
743
+ and os.path.exists(instrumentals_path)
744
+ and os.path.exists(instrumentals_json_path)
745
+ ):
746
+ display_progress(
747
+ "[~] Separating vocals from instrumentals...", percentages[1], progress_bar
748
+ )
749
+ SEPARATOR.arch_specific_params["MDX"]["segment_size"] = 512
750
+ SEPARATOR.load_model("UVR-MDX-NET-Voc_FT.onnx")
751
+ temp_instrumentals_name, temp_vocals_name = SEPARATOR.separate(song_path)
752
+ shutil.move(
753
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_instrumentals_name),
754
+ instrumentals_path,
755
+ )
756
+ shutil.move(os.path.join(INTERMEDIATE_AUDIO_DIR, temp_vocals_name), vocals_path)
757
+ json_dump(arg_dict, vocals_json_path)
758
+ json_dump(arg_dict, instrumentals_json_path)
759
+ return vocals_path, instrumentals_path
760
+
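# Continuing the sketch above: split the retrieved song into a vocals track and
# an instrumentals track, with the results cached inside song_dir.
vocals_path, instrumentals_path = separate_vocals(song_path, song_dir)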
761
+
762
+ def separate_main_vocals(
763
+ vocals_path: str,
764
+ song_dir: str,
765
+ stereofy: bool = True,
766
+ progress_bar: gr.Progress | None = None,
767
+ percentages: tuple[float, float] = (0.0, 0.5),
768
+ ) -> tuple[str, str]:
769
+ """
770
+ Separate a vocals track into main vocals and backup vocals.
771
+
772
+ Parameters
773
+ ----------
774
+ vocals_path : str
775
+ The path to the vocals track to separate.
776
+ song_dir : str
777
+ The path to the directory where the separated main vocals
778
+ and backup vocals will be saved.
779
+ stereofy : bool, default=True
780
+ Whether to convert the vocals track to stereo
781
+ before separating its main vocals and backup vocals.
782
+ progress_bar : gr.Progress, optional
783
+ Gradio progress bar to update.
784
+ percentages : tuple[float,float], default=(0.0, 0.5)
785
+ Percentages to display in the progress bar.
786
+
787
+ Returns
788
+ -------
789
+ main_vocals_path : str
790
+ The path to the separated main vocals.
791
+ backup_vocals_path : str
792
+ The path to the separated backup vocals.
793
+
794
+ Raises
795
+ ------
796
+ InputMissingError
797
+ If no vocals track path or song directory path is provided.
798
+ PathNotFoundError
799
+ If the provided vocals path or song directory path does not point
800
+ to an existing file or directory.
801
+ """
802
+ if not vocals_path:
803
+ raise InputMissingError("Vocals missing!")
804
+ if not os.path.isfile(vocals_path):
805
+ raise PathNotFoundError("Vocals do not exist!")
806
+ if not song_dir:
807
+ raise InputMissingError("Song directory missing!")
808
+ if not os.path.isdir(song_dir):
809
+ raise PathNotFoundError("Song directory does not exist!")
810
+
811
+ vocals_path = (
812
+ convert_to_stereo(vocals_path, song_dir, progress_bar, percentages[0])
813
+ if stereofy
814
+ else vocals_path
815
+ )
816
+
817
+ arg_dict = {
818
+ "input-files": [
819
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
820
+ ],
821
+ }
822
+
823
+ main_vocals_path_base = _get_unique_base_path(
824
+ song_dir, "2_Vocals_Main", arg_dict, progress_bar, percentages[1]
825
+ )
826
+
827
+ backup_vocals_path_base = _get_unique_base_path(
828
+ song_dir, "2_Vocals_Backup", arg_dict, progress_bar, percentages[1]
829
+ )
830
+
831
+ main_vocals_path = f"{main_vocals_path_base}.wav"
832
+ main_vocals_json_path = f"{main_vocals_path_base}.json"
833
+ backup_vocals_path = f"{backup_vocals_path_base}.wav"
834
+ backup_vocals_json_path = f"{backup_vocals_path_base}.json"
835
+
836
+ if not (
837
+ os.path.exists(main_vocals_path)
838
+ and os.path.exists(main_vocals_json_path)
839
+ and os.path.exists(backup_vocals_path)
840
+ and os.path.exists(backup_vocals_json_path)
841
+ ):
842
+ display_progress(
843
+ "[~] Separating main vocals from backup vocals...",
844
+ percentages[1],
845
+ progress_bar,
846
+ )
847
+ SEPARATOR.arch_specific_params["MDX"]["segment_size"] = 512
848
+ SEPARATOR.load_model("UVR_MDXNET_KARA_2.onnx")
849
+ temp_main_vocals_name, temp_backup_vocals_name = SEPARATOR.separate(vocals_path)
850
+ shutil.move(
851
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_main_vocals_name),
852
+ main_vocals_path,
853
+ )
854
+ shutil.move(
855
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_backup_vocals_name),
856
+ backup_vocals_path,
857
+ )
858
+ json_dump(arg_dict, main_vocals_json_path)
859
+ json_dump(arg_dict, backup_vocals_json_path)
860
+ return main_vocals_path, backup_vocals_path
861
+
862
+
863
+ def dereverb_vocals(
864
+ vocals_path: str,
865
+ song_dir: str,
866
+ stereofy: bool = True,
867
+ progress_bar: gr.Progress | None = None,
868
+ percentages: tuple[float, float] = (0.0, 0.5),
869
+ ) -> tuple[str, str]:
870
+ """
871
+ De-reverb a vocals track.
872
+
873
+ Parameters
874
+ ----------
875
+ vocals_path : str
876
+ The path to the vocals track to de-reverb.
877
+ song_dir : str
878
+ The path to the directory where the de-reverbed vocals will be saved.
879
+ stereofy : bool, default=True
880
+ Whether to convert the vocals track to stereo before de-reverbing it.
881
+ progress_bar : gr.Progress, optional
882
+ Gradio progress bar to update.
883
+ percentages : tuple[float,float], default=(0.0, 0.5)
884
+ Percentages to display in the progress bar.
885
+
886
+ Returns
887
+ -------
888
+ vocals_dereverb_path : str
889
+ The path to the de-reverbed vocals.
890
+ vocals_reverb_path : str
891
+ The path to the reverb of the vocals.
892
+
893
+ Raises
894
+ ------
895
+ InputMissingError
896
+ If no vocals track path or song directory path is provided.
897
+ PathNotFoundError
898
+ If the provided vocals path or song directory path does not point
899
+ to an existing file or directory.
900
+ """
901
+ if not vocals_path:
902
+ raise InputMissingError("Vocals missing!")
903
+ if not os.path.isfile(vocals_path):
904
+ raise PathNotFoundError("Vocals do not exist!")
905
+ if not song_dir:
906
+ raise InputMissingError("Song directory missing!")
907
+ if not os.path.isdir(song_dir):
908
+ raise PathNotFoundError("Song directory does not exist!")
909
+
910
+ vocals_path = (
911
+ convert_to_stereo(vocals_path, song_dir, progress_bar, percentages[0])
912
+ if stereofy
913
+ else vocals_path
914
+ )
915
+
916
+ arg_dict = {
917
+ "input-files": [
918
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
919
+ ],
920
+ }
921
+
922
+ vocals_dereverb_path_base = _get_unique_base_path(
923
+ song_dir, "3_Vocals_DeReverb", arg_dict, progress_bar, percentages[1]
924
+ )
925
+ vocals_reverb_path_base = _get_unique_base_path(
926
+ song_dir, "3_Vocals_Reverb", arg_dict, progress_bar, percentages[1]
927
+ )
928
+
929
+ vocals_dereverb_path = f"{vocals_dereverb_path_base}.wav"
930
+ vocals_dereverb_json_path = f"{vocals_dereverb_path_base}.json"
931
+
932
+ vocals_reverb_path = f"{vocals_reverb_path_base}.wav"
933
+ vocals_reverb_json_path = f"{vocals_reverb_path_base}.json"
934
+
935
+ if not (
936
+ os.path.exists(vocals_dereverb_path)
937
+ and os.path.exists(vocals_dereverb_json_path)
938
+ and os.path.exists(vocals_reverb_path)
939
+ and os.path.exists(vocals_reverb_json_path)
940
+ ):
941
+ display_progress("[~] De-reverbing vocals...", percentages[1], progress_bar)
942
+ SEPARATOR.arch_specific_params["MDX"]["segment_size"] = 256
943
+ SEPARATOR.load_model("Reverb_HQ_By_FoxJoy.onnx")
944
+ temp_vocals_dereverb_name, temp_vocals_reverb_name = SEPARATOR.separate(
945
+ vocals_path
946
+ )
947
+ shutil.move(
948
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_vocals_dereverb_name),
949
+ vocals_dereverb_path,
950
+ )
951
+ shutil.move(
952
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_vocals_reverb_name),
953
+ vocals_reverb_path,
954
+ )
955
+ json_dump(arg_dict, vocals_dereverb_json_path)
956
+ json_dump(arg_dict, vocals_reverb_json_path)
957
+ return vocals_dereverb_path, vocals_reverb_path
958
+
959
+
960
+ def convert_vocals(
961
+ vocals_path: str,
962
+ song_dir: str,
963
+ voice_model: str,
964
+ pitch_change_octaves: int = 0,
965
+ pitch_change_semi_tones: int = 0,
966
+ index_rate: float = 0.5,
967
+ filter_radius: int = 3,
968
+ rms_mix_rate: float = 0.25,
969
+ protect: float = 0.33,
970
+ f0_method: F0Method = "rmvpe",
971
+ crepe_hop_length: int = 128,
972
+ progress_bar: gr.Progress | None = None,
973
+ percentage: float = 0.0,
974
+ ) -> str:
975
+ """
976
+ Convert a vocals track using a voice model.
977
+
978
+ Parameters
979
+ ----------
980
+ vocals_path : str
981
+ The path to the vocals track to convert.
982
+ song_dir : str
983
+ The path to the directory where the converted vocals will be saved.
984
+ voice_model : str
985
+ The name of the voice model to use.
986
+ pitch_change_octaves : int, default=0
987
+ The number of octaves to pitch-shift the converted vocals by.
988
+ pitch_change_semi_tones : int, default=0
989
+ The number of semi-tones to pitch-shift the converted vocals by.
990
+ index_rate : float, default=0.5
991
+ The influence of the index file on the vocal conversion.
992
+ filter_radius : int, default=3
993
+ The filter radius to use for the vocal conversion.
994
+ rms_mix_rate : float, default=0.25
995
+ The blending rate of the volume envelope of the converted vocals.
996
+ protect : float, default=0.33
997
+ The protection rate for consonants and breathing sounds.
998
+ f0_method : F0Method, default="rmvpe"
999
+ The method to use for pitch extraction.
1000
+ crepe_hop_length : int, default=128
1001
+ The hop length to use for crepe-based pitch extraction.
1002
+ progress_bar : gr.Progress, optional
1003
+ Gradio progress bar to update.
1004
+ percentage : float, default=0.0
1005
+ Percentage to display in the progress bar.
1006
+
1007
+ Returns
1008
+ -------
1009
+ str
1010
+ The path to the converted vocals.
1011
+
1012
+ Raises
1013
+ ------
1014
+ InputMissingError
1015
+ If no vocals track path, song directory path or voice model name is provided.
1016
+ PathNotFoundError
1017
+ If the provided vocals path, song directory path or voice model name
1018
+ does not point to an existing file or directory.
1019
+ """
1020
+ if not vocals_path:
1021
+ raise InputMissingError("Vocals missing!")
1022
+ if not os.path.isfile(vocals_path):
1023
+ raise PathNotFoundError("Vocals do not exist!")
1024
+ if not song_dir:
1025
+ raise InputMissingError("Song directory missing!")
1026
+ if not os.path.isdir(song_dir):
1027
+ raise PathNotFoundError("song directory does not exist!")
1028
+ if not voice_model:
1029
+ raise InputMissingError("Voice model missing!")
1030
+ if not os.path.isdir(os.path.join(RVC_MODELS_DIR, voice_model)):
1031
+ raise PathNotFoundError("Voice model does not exist!")
1032
+
1033
+ pitch_change = pitch_change_octaves * 12 + pitch_change_semi_tones
1034
+ hop_length_suffix = "" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"
1035
+ arg_dict = {
1036
+ "input-files": [
1037
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
1038
+ ],
1039
+ "voice-model": voice_model,
1040
+ "pitch-shift": pitch_change,
1041
+ "index-rate": index_rate,
1042
+ "filter-radius": filter_radius,
1043
+ "rms-mix-rate": rms_mix_rate,
1044
+ "protect": protect,
1045
+ "f0-method": f"{f0_method}{hop_length_suffix}",
1046
+ }
1047
+
1048
+ converted_vocals_path_base = _get_unique_base_path(
1049
+ song_dir, "4_Vocals_Converted", arg_dict, progress_bar, percentage
1050
+ )
1051
+ converted_vocals_path = f"{converted_vocals_path_base}.wav"
1052
+ converted_vocals_json_path = f"{converted_vocals_path_base}.json"
1053
+
1054
+ if not (
1055
+ os.path.exists(converted_vocals_path)
1056
+ and os.path.exists(converted_vocals_json_path)
1057
+ ):
1058
+ display_progress("[~] Converting vocals using RVC...", percentage, progress_bar)
1059
+ _convert_voice(
1060
+ voice_model,
1061
+ vocals_path,
1062
+ converted_vocals_path,
1063
+ pitch_change,
1064
+ f0_method,
1065
+ index_rate,
1066
+ filter_radius,
1067
+ rms_mix_rate,
1068
+ protect,
1069
+ crepe_hop_length,
1070
+ 44100,
1071
+ )
1072
+ json_dump(arg_dict, converted_vocals_json_path)
1073
+ return converted_vocals_path
1074
+
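# Continuing the sketch: run the RVC conversion step. "MyVoice" is a
# hypothetical model under models/rvc; the octave and semitone arguments are
# combined internally as octaves * 12 + semitones.
converted_path = convert_vocals(
    vocals_path,
    song_dir,
    "MyVoice",
    pitch_change_octaves=1,
    pitch_change_semi_tones=0,
    f0_method="rmvpe",
)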
1075
+
1076
+ def postprocess_vocals(
1077
+ vocals_path: str,
1078
+ song_dir: str,
1079
+ reverb_rm_size: float = 0.15,
1080
+ reverb_wet: float = 0.2,
1081
+ reverb_dry: float = 0.8,
1082
+ reverb_damping: float = 0.7,
1083
+ progress_bar: gr.Progress | None = None,
1084
+ percentage: float = 0.0,
1085
+ ) -> str:
1086
+ """
1087
+ Apply high-pass filter, compressor and reverb effects to a vocals track.
1088
+
1089
+ Parameters
1090
+ ----------
1091
+ vocals_path : str
1092
+ The path to the vocals track to add effects to.
1093
+ song_dir : str
1094
+ The path to the directory where the effected vocals will be saved.
1095
+ reverb_rm_size : float, default=0.15
1096
+ The room size of the reverb effect.
1097
+ reverb_wet : float, default=0.2
1098
+ The wet level of the reverb effect.
1099
+ reverb_dry : float, default=0.8
1100
+ The dry level of the reverb effect.
1101
+ reverb_damping : float, default=0.7
1102
+ The damping of the reverb effect.
1103
+ progress_bar : gr.Progress, optional
1104
+ Gradio progress bar to update.
1105
+ percentage : float, default=0.0
1106
+ Percentage to display in the progress bar.
1107
+
1108
+ Returns
1109
+ -------
1110
+ str
1111
+ The path to the effected vocals.
1112
+
1113
+ Raises
1114
+ ------
1115
+ InputMissingError
1116
+ If no vocals track path or song directory path is provided.
1117
+ PathNotFoundError
1118
+ If the provided vocals path or song directory path does not point
1119
+ to an existing file or directory.
1120
+ """
1121
+ if not vocals_path:
1122
+ raise InputMissingError("Vocals missing!")
1123
+ if not os.path.isfile(vocals_path):
1124
+ raise PathNotFoundError("Vocals do not exist!")
1125
+ if not song_dir:
1126
+ raise InputMissingError("Song directory missing!")
1127
+ if not os.path.isdir(song_dir):
1128
+ raise PathNotFoundError("Song directory does not exist!")
1129
+
1130
+ arg_dict = {
1131
+ "input-files": [
1132
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
1133
+ ],
1134
+ "reverb-room-size": reverb_rm_size,
1135
+ "reverb-wet": reverb_wet,
1136
+ "reverb-dry": reverb_dry,
1137
+ "reverb-damping": reverb_damping,
1138
+ }
1139
+
1140
+ vocals_mixed_path_base = _get_unique_base_path(
1141
+ song_dir, "5_Vocals_Postprocessed", arg_dict, progress_bar, percentage
1142
+ )
1143
+
1144
+ vocals_mixed_path = f"{vocals_mixed_path_base}.wav"
1145
+ vocals_mixed_json_path = f"{vocals_mixed_path_base}.json"
1146
+
1147
+ if not (
1148
+ os.path.exists(vocals_mixed_path) and os.path.exists(vocals_mixed_json_path)
1149
+ ):
1150
+ display_progress(
1151
+ "[~] Applying audio effects to vocals...", percentage, progress_bar
1152
+ )
1153
+ _add_audio_effects(
1154
+ vocals_path,
1155
+ vocals_mixed_path,
1156
+ reverb_rm_size,
1157
+ reverb_wet,
1158
+ reverb_dry,
1159
+ reverb_damping,
1160
+ )
1161
+ json_dump(arg_dict, vocals_mixed_json_path)
1162
+ return vocals_mixed_path
1163
+
1164
+
1165
+ def pitch_shift_background(
1166
+ instrumentals_path: str,
1167
+ backup_vocals_path: str,
1168
+ song_dir: str,
1169
+ pitch_change: int = 0,
1170
+ progress_bar: gr.Progress | None = None,
1171
+ percentages: tuple[float, float] = (0.0, 0.5),
1172
+ ) -> tuple[str, str]:
1173
+ """
1174
+ Pitch shift instrumentals and backup vocals by a given number of semi-tones.
1175
+
1176
+ Parameters
1177
+ ----------
1178
+ instrumentals_path : str
1179
+ The path to the instrumentals to pitch shift.
1180
+ backup_vocals_path : str
1181
+ The path to the backup vocals to pitch shift.
1182
+ song_dir : str
1183
+ The path to the directory where the pitch-shifted instrumentals
1184
+ and backup vocals will be saved.
1185
+ pitch_change : int, default=0
1186
+ The number of semi-tones to pitch-shift the instrumentals
1187
+ and backup vocals by.
1188
+ progress_bar : gr.Progress, optional
1189
+ Gradio progress bar to update.
1190
+ percentages : tuple[float,float], default=(0.0, 0.5)
1191
+ Percentages to display in the progress bar.
1192
+
1193
+ Returns
1194
+ -------
1195
+ instrumentals_shifted_path : str
1196
+ The path to the pitch-shifted instrumentals.
1197
+ backup_vocals_shifted_path : str
1198
+ The path to the pitch-shifted backup vocals.
1199
+
1200
+ Raises
1201
+ ------
1202
+ InputMissingError
1203
+ If no instrumentals path, backup vocals path or song directory path is provided.
1204
+ PathNotFoundError
1205
+ If the provided instrumentals path, backup vocals path or song directory path
1206
+ does not point to an existing file or directory.
1207
+ """
1208
+ if not instrumentals_path:
1209
+ raise InputMissingError("Instrumentals missing!")
1210
+ if not os.path.isfile(instrumentals_path):
1211
+ raise PathNotFoundError("Instrumentals do not exist!")
1212
+ if not backup_vocals_path:
1213
+ raise InputMissingError("Backup vocals missing!")
1214
+ if not os.path.isfile(backup_vocals_path):
1215
+ raise PathNotFoundError("Backup vocals do not exist!")
1216
+ if not song_dir:
1217
+ raise InputMissingError("Song directory missing!")
1218
+ if not os.path.isdir(song_dir):
1219
+ raise PathNotFoundError("Song directory does not exist!")
1220
+
1221
+ instrumentals_shifted_path = instrumentals_path
1222
+ backup_vocals_shifted_path = backup_vocals_path
1223
+
1224
+ if pitch_change != 0:
1225
+ instrumentals_dict = {
1226
+ "input-files": [
1227
+ {
1228
+ "name": os.path.basename(instrumentals_path),
1229
+ "hash": get_file_hash(instrumentals_path),
1230
+ }
1231
+ ],
1232
+ "pitch-shift": pitch_change,
1233
+ }
1234
+
1235
+ instrumentals_shifted_path_base = _get_unique_base_path(
1236
+ song_dir,
1237
+ "6_Instrumental_Shifted",
1238
+ instrumentals_dict,
1239
+ progress_bar,
1240
+ percentages[0],
1241
+ )
1242
+
1243
+ instrumentals_shifted_path = f"{instrumentals_shifted_path_base}.wav"
1244
+ instrumentals_shifted_json_path = f"{instrumentals_shifted_path_base}.json"
1245
+
1246
+ if not (
1247
+ os.path.exists(instrumentals_shifted_path)
1248
+ and os.path.exists(instrumentals_shifted_json_path)
1249
+ ):
1250
+ display_progress(
1251
+ "[~] Applying pitch shift to instrumentals",
1252
+ percentages[0],
1253
+ progress_bar,
1254
+ )
1255
+ _pitch_shift(instrumentals_path, instrumentals_shifted_path, pitch_change)
1256
+ json_dump(instrumentals_dict, instrumentals_shifted_json_path)
1257
+
1258
+ backup_vocals_dict = {
1259
+ "input-files": [
1260
+ {
1261
+ "name": os.path.basename(backup_vocals_path),
1262
+ "hash": get_file_hash(backup_vocals_path),
1263
+ }
1264
+ ],
1265
+ "pitch-shift": pitch_change,
1266
+ }
1267
+
1268
+ backup_vocals_shifted_path_base = _get_unique_base_path(
1269
+ song_dir,
1270
+ "6_Vocals_Backup_Shifted",
1271
+ backup_vocals_dict,
1272
+ progress_bar,
1273
+ percentages[1],
1274
+ )
1275
+ backup_vocals_shifted_path = f"{backup_vocals_shifted_path_base}.wav"
1276
+ backup_vocals_shifted_json_path = f"{backup_vocals_shifted_path_base}.json"
1277
+ if not (
1278
+ os.path.exists(backup_vocals_shifted_path)
1279
+ and os.path.exists(backup_vocals_shifted_json_path)
1280
+ ):
1281
+ display_progress(
1282
+ "[~] Applying pitch shift to backup vocals",
1283
+ percentages[1],
1284
+ progress_bar,
1285
+ )
1286
+ _pitch_shift(backup_vocals_path, backup_vocals_shifted_path, pitch_change)
1287
+ json_dump(backup_vocals_dict, backup_vocals_shifted_json_path)
1288
+ return instrumentals_shifted_path, backup_vocals_shifted_path
1289
+
1290
+
1291
+ def _get_voice_model(
1292
+ mixed_vocals_path: str | None = None, song_dir: str | None = None
1293
+ ) -> str:
1294
+ """
1295
+ Infer the voice model used for vocal conversion from a
1296
+ mixed vocals file in a given song directory.
1297
+
1298
+ If the voice model cannot be inferred, "Unknown" is returned.
1299
+
1300
+ Parameters
1301
+ ----------
1302
+ mixed_vocals_path : str, optional
1303
+ The path to a mixed vocals file.
1304
+ song_dir : str, optional
1305
+ The path to a song directory.
1306
+
1307
+ Returns
1308
+ -------
1309
+ str
1310
+ The voice model used for vocal conversion.
1311
+ """
1312
+ voice_model = "Unknown"
1313
+ if not (mixed_vocals_path and song_dir):
1314
+ return voice_model
1315
+ mixed_vocals_stem = get_path_stem(mixed_vocals_path)
1316
+ mixed_vocals_json_path = os.path.join(song_dir, f"{mixed_vocals_stem}.json")
1317
+ if not os.path.isfile(mixed_vocals_json_path):
1318
+ return voice_model
1319
+ mixed_vocals_json_dict = json_load(mixed_vocals_json_path)
1320
+ input_files = mixed_vocals_json_dict.get("input-files")
1321
+ input_path = input_files[0].get("name") if input_files else None
1322
+ if not input_path:
1323
+ return voice_model
1324
+ input_stem = get_path_stem(input_path)
1325
+ converted_vocals_json_path = os.path.join(song_dir, f"{input_stem}.json")
1326
+ if not os.path.isfile(converted_vocals_json_path):
1327
+ return voice_model
1328
+ converted_vocals_dict = json_load(converted_vocals_json_path)
1329
+ return converted_vocals_dict.get("voice-model", voice_model)
1330
+
1331
+
1332
+ def get_song_cover_name(
1333
+ mixed_vocals_path: str | None = None,
1334
+ song_dir: str | None = None,
1335
+ voice_model: str | None = None,
1336
+ progress_bar: gr.Progress | None = None,
1337
+ percentage: float = 0.0,
1338
+ ) -> str:
1339
+ """
1340
+ Generate a suitable name for a cover of a song based on that song's
1341
+ original name and the voice model used for vocal conversion.
1342
+
1343
+ If the path of an existing song directory is provided, the original song
1344
+ name is inferred from that directory. If a voice model is not provided but
1345
+ the path of an existing song directory and the path of a mixed
1346
+ vocals file in that directory are provided, then the voice model is
1347
+ inferred from the mixed vocals file.
1348
+
1349
+ Parameters
1350
+ ----------
1351
+ mixed_vocals_path : str, optional
1352
+ The path to a mixed vocals file.
1353
+ song_dir : str, optional
1354
+ The path to a song directory.
1355
+ voice_model : str, optional
1356
+ A voice model name.
1357
+ progress_bar : gr.Progress, optional
1358
+ Gradio progress bar to update.
1359
+ percentage : float, default=0.0
1360
+ Percentage to display in the progress bar.
1361
+
1362
+ Returns
1363
+ -------
1364
+ str
1365
+ The song cover name
1366
+ """
1367
+ display_progress("[~] Getting song cover name...", percentage, progress_bar)
1368
+
1369
+ orig_song_path = _get_input_audio_path(song_dir) if song_dir else None
1370
+ orig_song_name = (
1371
+ (get_path_stem(orig_song_path).removeprefix("0_").removesuffix("_Original"))
1372
+ if orig_song_path
1373
+ else "Unknown"
1374
+ )
1375
+
1376
+ voice_model = voice_model or _get_voice_model(mixed_vocals_path, song_dir)
1377
+
1378
+ return f"{orig_song_name} ({voice_model} Ver)"
1379
+
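# Illustrative naming: for a hypothetical cached directory whose original song
# is "Never Gonna Give You Up" and a hypothetical model "MyVoice", this yields
# "Never Gonna Give You Up (MyVoice Ver)".
name = get_song_cover_name(
    song_dir="audio/intermediate/dQw4w9WgXcQ", voice_model="MyVoice"
)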
1380
+
1381
+ def mix_song_cover(
1382
+ main_vocals_path: str,
1383
+ instrumentals_path: str,
1384
+ backup_vocals_path: str,
1385
+ song_dir: str,
1386
+ main_gain: int = 0,
1387
+ inst_gain: int = 0,
1388
+ backup_gain: int = 0,
1389
+ output_sr: int = 44100,
1390
+ output_format: InputAudioExt = "mp3",
1391
+ output_name: str | None = None,
1392
+ progress_bar: gr.Progress | None = None,
1393
+ percentages: tuple[float, float] = (0.0, 0.5),
1394
+ ) -> str:
1395
+ """
1396
+ Mix main vocals, instrumentals, and backup vocals to create a song cover.
1397
+
1398
+ Parameters
1399
+ ----------
1400
+ main_vocals_path : str
1401
+ The path to the main vocals to mix.
1402
+ instrumentals_path : str
1403
+ The path to the instrumentals to mix.
1404
+ backup_vocals_path : str
1405
+ The path to the backup vocals to mix.
1406
+ song_dir : str
1407
+ The path to the song directory where the song cover will be saved.
1408
+ main_gain : int, default=0
1409
+ The gain to apply to the main vocals.
1410
+ inst_gain : int, default=0
1411
+ The gain to apply to the instrumentals.
1412
+ backup_gain : int, default=0
1413
+ The gain to apply to the backup vocals.
1414
+ output_sr : int, default=44100
1415
+ The sample rate of the song cover.
1416
+ output_format : InputAudioExt, default="mp3"
1417
+ The audio format of the song cover.
1418
+ output_name : str, optional
1419
+ The name of the song cover.
1420
+ progress_bar : gr.Progress, optional
1421
+ Gradio progress bar to update.
1422
+ percentages : tuple[float,float], default=(0.0, 0.5)
1423
+ Percentages to display in the progress bar.
1424
+
1425
+ Returns
1426
+ -------
1427
+ str
1428
+ The path to the song cover.
1429
+
1430
+ Raises
1431
+ ------
1432
+ InputMissingError
1433
+ If no main vocals, instrumentals, backup vocals or song directory path is provided.
1434
+ PathNotFoundError
1435
+ If the provided main vocals, instrumentals, backup vocals or song directory path
1436
+ does not point to an existing file or directory.
1437
+ """
1438
+ if not main_vocals_path:
1439
+ raise InputMissingError("Main vocals missing!")
1440
+ if not os.path.isfile(main_vocals_path):
1441
+ raise PathNotFoundError("Main vocals do not exist!")
1442
+ if not instrumentals_path:
1443
+ raise InputMissingError("Instrumentals missing!")
1444
+ if not os.path.isfile(instrumentals_path):
1445
+ raise PathNotFoundError("Instrumentals do not exist!")
1446
+ if not backup_vocals_path:
1447
+ raise InputMissingError("Backup vocals missing!")
1448
+ if not os.path.isfile(backup_vocals_path):
1449
+ raise PathNotFoundError("Backup vocals do not exist!")
1450
+ if not song_dir:
1451
+ raise InputMissingError("Song directory missing!")
1452
+ if not os.path.isdir(song_dir):
1453
+ raise PathNotFoundError("song directory does not exist!")
1454
+
1455
+ arg_dict = {
1456
+ "input-files": [
1457
+ {
1458
+ "name": os.path.basename(main_vocals_path),
1459
+ "hash": get_file_hash(main_vocals_path),
1460
+ },
1461
+ {
1462
+ "name": os.path.basename(instrumentals_path),
1463
+ "hash": get_file_hash(instrumentals_path),
1464
+ },
1465
+ {
1466
+ "name": os.path.basename(backup_vocals_path),
1467
+ "hash": get_file_hash(backup_vocals_path),
1468
+ },
1469
+ ],
1470
+ "main-gain": main_gain,
1471
+ "instrument-gain": inst_gain,
1472
+ "backup-gain": backup_gain,
1473
+ "sample-rate": output_sr,
1474
+ }
1475
+
1476
+ mixdown_path_base = _get_unique_base_path(
1477
+ song_dir, "7_Mixdown", arg_dict, progress_bar, percentages[0]
1478
+ )
1479
+ mixdown_path = f"{mixdown_path_base}.{output_format}"
1480
+ mixdown_json_path = f"{mixdown_path_base}.json"
1481
+
1482
+ if not (os.path.exists(mixdown_path) and os.path.exists(mixdown_json_path)):
1483
+ display_progress(
1484
+ "[~] Mixing main vocals, instrumentals, and backup vocals...",
1485
+ percentages[0],
1486
+ progress_bar,
1487
+ )
1488
+
1489
+ _mix_audio(
1490
+ main_vocals_path,
1491
+ backup_vocals_path,
1492
+ instrumentals_path,
1493
+ main_gain,
1494
+ backup_gain,
1495
+ inst_gain,
1496
+ output_format,
1497
+ output_sr,
1498
+ mixdown_path,
1499
+ )
1500
+ json_dump(arg_dict, mixdown_json_path)
1501
+
1502
+ output_name = output_name or get_song_cover_name(
1503
+ main_vocals_path, song_dir, None, progress_bar, percentages[1]
1504
+ )
1505
+ song_cover_path = os.path.join(OUTPUT_AUDIO_DIR, f"{output_name}.{output_format}")
1506
+ os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
1507
+ shutil.copyfile(mixdown_path, song_cover_path)
1508
+
1509
+ return song_cover_path
1510
+
1511
+
1512
+ def run_pipeline(
1513
+ song_input: str,
1514
+ voice_model: str,
1515
+ pitch_change_vocals: int = 0,
1516
+ pitch_change_all: int = 0,
1517
+ index_rate: float = 0.5,
1518
+ filter_radius: int = 3,
1519
+ rms_mix_rate: float = 0.25,
1520
+ protect: float = 0.33,
1521
+ f0_method: F0Method = "rmvpe",
1522
+ crepe_hop_length: int = 128,
1523
+ reverb_rm_size: float = 0.15,
1524
+ reverb_wet: float = 0.2,
1525
+ reverb_dry: float = 0.8,
1526
+ reverb_damping: float = 0.7,
1527
+ main_gain: int = 0,
1528
+ inst_gain: int = 0,
1529
+ backup_gain: int = 0,
1530
+ output_sr: int = 44100,
1531
+ output_format: InputAudioExt = "mp3",
1532
+ output_name: str | None = None,
1533
+ return_files: bool = False,
1534
+ progress_bar: gr.Progress | None = None,
1535
+ ) -> str | tuple[str, ...]:
1536
+ """
1537
+ Run the song cover generation pipeline.
1538
+
1539
+ Parameters
1540
+ ----------
1541
+ song_input : str
1542
+ A YouTube URL, the path of a local audio file, or the path of a song directory.
1543
+ voice_model : str
1544
+ The name of the voice model to use for vocal conversion.
1545
+ pitch_change_vocals : int, default=0
1546
+ The number of octaves to pitch-shift the converted vocals by.
1547
+ pitch_change_all : int, default=0
1548
+ The number of semi-tones to pitch-shift the converted vocals,
1549
+ instrumentals, and backup vocals by.
1550
+ index_rate : float, default=0.5
1551
+ The influence of the index file on the vocal conversion.
1552
+ filter_radius : int, default=3
1553
+ The filter radius to use for the vocal conversion.
1554
+ rms_mix_rate : float, default=0.25
1555
+ The blending rate of the volume envelope of the converted vocals.
1556
+ protect : float, default=0.33
1557
+ The protection rate for consonants and breathing sounds in the vocal conversion.
1558
+ f0_method : F0Method, default="rmvpe"
1559
+ The method to use for pitch extraction in the vocal conversion.
1560
+ crepe_hop_length : int, default=128
1561
+ The hop length to use for crepe-based pitch extraction.
1562
+ reverb_rm_size : float, default=0.15
1563
+ The room size of the reverb effect to apply to the converted vocals.
1564
+ reverb_wet : float, default=0.2
1565
+ The wet level of the reverb effect to apply to the converted vocals.
1566
+ reverb_dry : float, default=0.8
1567
+ The dry level of the reverb effect to apply to the converted vocals.
1568
+ reverb_damping : float, default=0.7
1569
+ The damping of the reverb effect to apply to the converted vocals.
1570
+ main_gain : int, default=0
1571
+ The gain to apply to the post-processed vocals.
1572
+ inst_gain : int, default=0
1573
+ The gain to apply to the pitch-shifted instrumentals.
1574
+ backup_gain : int, default=0
1575
+ The gain to apply to the pitch-shifted backup vocals.
1576
+ output_sr : int, default=44100
1577
+ The sample rate of the song cover.
1578
+ output_format : InputAudioExt, default="mp3"
1579
+ The audio format of the song cover.
1580
+ output_name : str, optional
1581
+ The name of the song cover.
1582
+ return_files : bool, default=False
1583
+ Whether to return the paths of the generated intermediate audio files.
1584
+ progress_bar : gr.Progress, optional
1585
+ Gradio progress bar to update.
1586
+
1587
+ Returns
1588
+ -------
1589
+ str | tuple[str, ...]
+ The path to the generated song cover and, if `return_files=True`,
+ also the paths of any generated intermediate audio files.
+
+ Raises
+ ------
+ InputMissingError
+ If no song input or no voice model name is provided.
+ PathNotFoundError
+ If the voice model directory does not exist.
+ """
1593
+ if not song_input:
1594
+ raise InputMissingError(
1595
+ "Song input missing! Please provide a valid YouTube url, local audio file"
1596
+ " path or cached song directory path."
1597
+ )
1598
+ if not voice_model:
1599
+ raise InputMissingError("Voice model missing!")
1600
+ if not os.path.isdir(os.path.join(RVC_MODELS_DIR, voice_model)):
1601
+ raise PathNotFoundError("Voice model does not exist!")
1602
+ display_progress("[~] Starting song cover generation pipeline...", 0, progress_bar)
1603
+ orig_song_path, song_dir = retrieve_song(
1604
+ song_input, progress_bar, (0 / 15, 1 / 15, 2 / 15)
1605
+ )
1606
+ vocals_path, instrumentals_path = separate_vocals(
1607
+ orig_song_path, song_dir, False, progress_bar, (3 / 15, 4 / 15)
1608
+ )
1609
+ main_vocals_path, backup_vocals_path = separate_main_vocals(
1610
+ vocals_path, song_dir, False, progress_bar, (5 / 15, 6 / 15)
1611
+ )
1612
+ vocals_dereverb_path, reverb_path = dereverb_vocals(
1613
+ main_vocals_path, song_dir, False, progress_bar, (7 / 15, 8 / 15)
1614
+ )
1615
+ converted_vocals_path = convert_vocals(
1616
+ vocals_dereverb_path,
1617
+ song_dir,
1618
+ voice_model,
1619
+ pitch_change_vocals,
1620
+ pitch_change_all,
1621
+ index_rate,
1622
+ filter_radius,
1623
+ rms_mix_rate,
1624
+ protect,
1625
+ f0_method,
1626
+ crepe_hop_length,
1627
+ progress_bar,
1628
+ 9 / 15,
1629
+ )
1630
+ vocals_mixed_path = postprocess_vocals(
1631
+ converted_vocals_path,
1632
+ song_dir,
1633
+ reverb_rm_size,
1634
+ reverb_wet,
1635
+ reverb_dry,
1636
+ reverb_damping,
1637
+ progress_bar,
1638
+ 10 / 15,
1639
+ )
1640
+ instrumentals_shifted_path, backup_vocals_shifted_path = pitch_shift_background(
1641
+ instrumentals_path,
1642
+ backup_vocals_path,
1643
+ song_dir,
1644
+ pitch_change_all,
1645
+ progress_bar,
1646
+ (11 / 15, 12 / 15),
1647
+ )
1648
+
1649
+ song_cover_path = mix_song_cover(
1650
+ vocals_mixed_path,
1651
+ instrumentals_shifted_path or instrumentals_path,
1652
+ backup_vocals_shifted_path or backup_vocals_path,
1653
+ song_dir,
1654
+ main_gain,
1655
+ inst_gain,
1656
+ backup_gain,
1657
+ output_sr,
1658
+ output_format,
1659
+ output_name,
1660
+ progress_bar,
1661
+ (13 / 15, 14 / 15),
1662
+ )
1663
+ if return_files:
1664
+ return (
1665
+ orig_song_path,
1666
+ vocals_path,
1667
+ instrumentals_path,
1668
+ main_vocals_path,
1669
+ backup_vocals_path,
1670
+ vocals_dereverb_path,
1671
+ reverb_path,
1672
+ converted_vocals_path,
1673
+ vocals_mixed_path,
1674
+ instrumentals_shifted_path,
1675
+ backup_vocals_shifted_path,
1676
+ song_cover_path,
1677
+ )
1678
+ else:
1679
+ return song_cover_path
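
A minimal usage sketch of the pipeline above; the voice model name "MyModel" and the YouTube URL are placeholders rather than values from this repository:

    from backend.generate_song_cover import run_pipeline

    # Generate a song cover with default conversion and mixing settings,
    # shifting the converted vocals up by one octave. Assumes a voice model
    # folder named "MyModel" exists under models/rvc.
    cover_path = run_pipeline(
        song_input="https://www.youtube.com/watch?v=<video-id>",  # placeholder URL
        voice_model="MyModel",
        pitch_change_vocals=1,
    )
    print(f"Cover saved to {cover_path}")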
src/backend/manage_audio.py ADDED
@@ -0,0 +1,225 @@
1
+ """
2
+ This module contains functions to manage audio files.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ from pathlib import PurePath
8
+
9
+ import gradio as gr
10
+
11
+ from backend.common import INTERMEDIATE_AUDIO_DIR, OUTPUT_AUDIO_DIR, display_progress
12
+ from backend.exceptions import InputMissingError, InvalidPathError, PathNotFoundError
13
+
14
+ from common import GRADIO_TEMP_DIR
15
+
16
+
17
+ def get_output_audio() -> list[tuple[str, str]]:
18
+ """
19
+ Get the name and path of all output audio files.
20
+
21
+ Returns
22
+ -------
23
+ list[tuple[str, str]]
24
+ A list of tuples containing the name and path of each output audio file.
25
+ """
26
+ if os.path.isdir(OUTPUT_AUDIO_DIR):
27
+ named_output_files = [
28
+ (file_name, os.path.join(OUTPUT_AUDIO_DIR, file_name))
29
+ for file_name in os.listdir(OUTPUT_AUDIO_DIR)
30
+ ]
31
+ return sorted(named_output_files, key=lambda x: x[0])
32
+ return []
33
+
34
+
35
+ def delete_intermediate_audio(
36
+ song_dirs: list[str],
37
+ progress_bar: gr.Progress | None = None,
38
+ percentage: float = 0.0,
39
+ ) -> str:
40
+ """
41
+ Delete intermediate audio files in provided song directories.
42
+
43
+ Parameters
44
+ ----------
45
+ song_dirs : list[str]
46
+ Paths of song directories to delete intermediate audio files for.
47
+ progress_bar : gr.Progress, optional
48
+ Gradio progress bar to update.
49
+ percentage : float, default=0.0
50
+ Percentage to display in the progress bar.
51
+
52
+ Returns
53
+ -------
54
+ str
55
+ Success message.
56
+
57
+ Raises
58
+ ------
59
+ InputMissingError
60
+ If no song directories are provided.
61
+ PathNotFoundError
62
+ If a song directory does not exist.
63
+ InvalidPathError
64
+ If a song directory is not located in the root of the intermediate audio directory.
65
+ """
66
+ if not song_dirs:
67
+ raise InputMissingError(
68
+ "Song directories missing! Please provide a non-empty list of song"
69
+ " directories."
70
+ )
71
+ display_progress(
72
+ "[~] Deleting intermediate audio files for selected songs...",
73
+ percentage,
74
+ progress_bar,
75
+ )
76
+ for song_dir in song_dirs:
77
+ if not os.path.isdir(song_dir):
78
+ raise PathNotFoundError(f"Song directory '{song_dir}' does not exist.")
79
+
80
+ if not PurePath(song_dir).parent == PurePath(INTERMEDIATE_AUDIO_DIR):
81
+ raise InvalidPathError(
82
+ f"Song directory '{song_dir}' is not located in the root of the"
83
+ " intermediate audio directory."
84
+ )
85
+ shutil.rmtree(song_dir)
86
+ return "[+] Successfully deleted intermediate audio files for selected songs!"
87
+
88
+
89
+ def delete_all_intermediate_audio(
90
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
91
+ ) -> str:
92
+ """
93
+ Delete all intermediate audio files.
94
+
95
+ Parameters
96
+ ----------
97
+ progress_bar : gr.Progress, optional
98
+ Gradio progress bar to update.
99
+ percentage : float, default=0.0
100
+
101
+ Returns
102
+ -------
103
+ str
104
+ Success message.
105
+ """
106
+ display_progress(
107
+ "[~] Deleting all intermediate audio files...", percentage, progress_bar
108
+ )
109
+ if os.path.isdir(INTERMEDIATE_AUDIO_DIR):
110
+ shutil.rmtree(INTERMEDIATE_AUDIO_DIR)
111
+
112
+ return "[+] All intermediate audio files successfully deleted!"
113
+
114
+
115
+ def delete_output_audio(
116
+ output_audio_files: list[str],
117
+ progress_bar: gr.Progress | None = None,
118
+ percentage: float = 0.0,
119
+ ) -> str:
120
+ """
121
+ Delete selected output audio files.
122
+
123
+ Parameters
124
+ ----------
125
+ output_audio_files : list[str]
126
+ Paths of output audio files to delete.
127
+ progress_bar : gr.Progress, optional
128
+ Gradio progress bar to update.
129
+ percentage : float, default=0.0
130
+ Percentage to display in the progress bar.
131
+
132
+ Returns
133
+ -------
134
+ str
135
+ Success message.
136
+
137
+ Raises
138
+ ------
139
+ InputMissingError
140
+ If no output audio files are provided.
141
+ PathNotFoundError
142
+ If an output audio file does not exist.
143
+ InvalidPathError
144
+ If an output audio file is not located in the root of the output audio directory.
145
+ """
146
+ if not output_audio_files:
147
+ raise InputMissingError(
148
+ "Output audio files missing! Please provide a non-empty list of output"
149
+ " audio files."
150
+ )
151
+ display_progress(
152
+ "[~] Deleting selected output audio files...", percentage, progress_bar
153
+ )
154
+ for output_audio_file in output_audio_files:
155
+ if not os.path.isfile(output_audio_file):
156
+ raise PathNotFoundError(
157
+ f"Output audio file '{output_audio_file}' does not exist."
158
+ )
159
+ if not PurePath(output_audio_file).parent == PurePath(OUTPUT_AUDIO_DIR):
160
+ raise InvalidPathError(
161
+ f"Output audio file '{output_audio_file}' is not located in the root of"
162
+ " the output audio directory."
163
+ )
164
+ os.remove(output_audio_file)
165
+ return "[+] Successfully deleted selected output audio files!"
166
+
167
+
168
+ def delete_all_output_audio(
169
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
170
+ ) -> str:
171
+ """
172
+ Delete all output audio files.
173
+
174
+ Parameters
175
+ ----------
176
+ progress_bar : gr.Progress, optional
177
+ Gradio progress bar to update.
178
+ percentage : float, default=0.0
179
+ Percentage to display in the progress bar.
180
+
181
+ Returns
182
+ -------
183
+ str
184
+ Success message.
185
+ """
186
+ display_progress("[~] Deleting all output audio files...", percentage, progress_bar)
187
+ if os.path.isdir(OUTPUT_AUDIO_DIR):
188
+ shutil.rmtree(OUTPUT_AUDIO_DIR)
189
+
190
+ return "[+] All output audio files successfully deleted!"
191
+
192
+
193
+ def delete_all_audio(
194
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
195
+ ) -> str:
196
+ """
197
+ Delete all audio files.
198
+
199
+ Parameters
200
+ ----------
201
+ progress_bar : gr.Progress, optional
202
+ Gradio progress bar to update.
203
+ percentage : float, default=0.0
204
+ Percentage to display in the progress bar.
205
+
206
+ Returns
207
+ -------
208
+ str
209
+ Success message.
210
+ """
211
+ display_progress("[~] Deleting all audio files...", percentage, progress_bar)
212
+ if os.path.isdir(INTERMEDIATE_AUDIO_DIR):
213
+ shutil.rmtree(INTERMEDIATE_AUDIO_DIR)
214
+ if os.path.isdir(OUTPUT_AUDIO_DIR):
215
+ shutil.rmtree(OUTPUT_AUDIO_DIR)
216
+
217
+ return "[+] All audio files successfully deleted!"
218
+
219
+
220
+ def delete_gradio_temp_dir() -> None:
221
+ """
222
+ Delete the directory where Gradio stores temporary files.
223
+ """
224
+ if os.path.isdir(GRADIO_TEMP_DIR):
225
+ shutil.rmtree(GRADIO_TEMP_DIR)
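
A minimal usage sketch of the audio-management helpers above; the commented-out deletion call uses a hypothetical file name:

    from backend.manage_audio import get_output_audio, delete_output_audio

    # List all generated output audio files by name and path.
    for name, path in get_output_audio():
        print(name, "->", path)

    # Delete a specific output file; it must sit directly in the output audio directory.
    # delete_output_audio(["<output audio dir>/Song (MyModel Ver).mp3"])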
src/backend/manage_voice_models.py ADDED
@@ -0,0 +1,426 @@
1
+ """
2
+ This module contains functions to manage voice models.
3
+ """
4
+
5
+ from typings.extra import ModelsTable, ModelsTablePredicate
6
+
7
+ import os
8
+ import re
9
+ import shutil
10
+ import urllib.request
11
+ import zipfile
12
+
13
+ import gradio as gr
14
+
15
+ from backend.common import copy_files_to_new_folder, display_progress, json_load
16
+ from backend.exceptions import (
17
+ FileTypeError,
18
+ InputMissingError,
19
+ PathExistsError,
20
+ PathNotFoundError,
21
+ )
22
+
23
+ from common import RVC_MODELS_DIR
24
+
25
+ PUBLIC_MODELS = json_load(os.path.join(RVC_MODELS_DIR, "public_models.json"))
26
+
27
+
28
+ def get_current_models() -> list[str]:
29
+ """
30
+ Get the names of all saved voice models.
31
+
32
+ Returns
33
+ -------
34
+ list[str]
35
+ A list of names of all saved voice models.
36
+ """
37
+ models_list = os.listdir(RVC_MODELS_DIR)
38
+ items_to_remove = ["hubert_base.pt", "MODELS.txt", "public_models.json", "rmvpe.pt"]
39
+ return [item for item in models_list if item not in items_to_remove]
40
+
41
+
42
+ def load_public_models_table(
43
+ predicates: list[ModelsTablePredicate],
44
+ progress_bar: gr.Progress | None = None,
45
+ percentage: float = 0.0,
46
+ ) -> ModelsTable:
47
+ """
48
+ Load the public models table and filter it by the given predicates.
49
+
50
+ Parameters
51
+ ----------
52
+ predicates : list[ModelsTablePredicate]
53
+ List of predicates to filter the models table by.
54
+ progress_bar : gr.Progress, optional
55
+ Gradio progress bar to update.
56
+ percentage : float, default=0.0
57
+ Percentage to display in the progress bar.
58
+
59
+ Returns
60
+ -------
61
+ ModelsTable
62
+ The public models table, filtered by the given predicates.
63
+ """
64
+ models_table: ModelsTable = []
65
+ keys = ["name", "description", "tags", "credit", "added", "url"]
66
+ display_progress("[~] Loading public models table ...", percentage, progress_bar)
67
+ for model in PUBLIC_MODELS["voice_models"]:
68
+ if all(predicate(model) for predicate in predicates):
69
+ models_table.append([model[key] for key in keys])
70
+
71
+ return models_table
72
+
73
+
74
+ def load_public_model_tags() -> list[str]:
75
+ """
76
+ Load the tags of all public voice models.
77
+
78
+ Returns
79
+ -------
80
+ list[str]
81
+ A list of all tags of public voice models.
82
+ """
83
+ return list(PUBLIC_MODELS["tags"].keys())
84
+
85
+
86
+ def filter_public_models_table(
87
+ tags: list[str],
88
+ query: str,
89
+ progress_bar: gr.Progress | None = None,
90
+ percentage: float = 0.0,
91
+ ) -> ModelsTable:
92
+ """
93
+ Filter the public models table by a set of tags and a search query.
94
+
95
+ The search query is matched against the name, description, tags, credit,
96
+ and added date of each model in the public models table.
97
+ Case insensitive search is performed.
98
+ If the search query is empty, the models table is filtered only by the tags.
99
+
100
+ Parameters
101
+ ----------
102
+ tags : list[str]
103
+ List of tags to filter the models table by.
104
+ query : str
105
+ Search query to filter the models table by.
106
+ progress_bar : gr.Progress, optional
107
+ Gradio progress bar to update.
108
+ percentage : float, default=0.0
109
+ Percentage to display in the progress bar.
110
+
111
+ Returns
112
+ -------
113
+ ModelsTable
114
+ The public models table, filtered by the given tags and the given query.
115
+ """
116
+ tags_predicate: ModelsTablePredicate = lambda model: all(
117
+ tag in model["tags"] for tag in tags
118
+ )
119
+ query_predicate: ModelsTablePredicate = lambda model: (
120
+ query.lower()
121
+ in f"{model['name']} {model['description']} {' '.join(model['tags'])} {model['credit']} {model['added']}"
122
+ .lower()
123
+ if query
124
+ else True
125
+ )
126
+
127
+ filter_fns = [tags_predicate, query_predicate]
128
+
129
+ return load_public_models_table(filter_fns, progress_bar, percentage)
130
+
131
+
132
+ def _extract_model_zip(extraction_folder: str, zip_name: str, remove_zip: bool) -> None:
133
+ """
134
+ Extract a voice model zip file to a directory.
135
+
136
+ Parameters
137
+ ----------
138
+ extraction_folder : str
139
+ The directory to extract the voice model to.
140
+ zip_name : str
141
+ The name of the zip file to extract.
142
+ remove_zip : bool
143
+ Whether to remove the zip file after extraction.
144
+
145
+ Raises
146
+ ------
147
+ PathNotFoundError
148
+ If no .pth model file is found in the extracted zip folder.
149
+ """
150
+ try:
151
+ os.makedirs(extraction_folder)
152
+ with zipfile.ZipFile(zip_name, "r") as zip_ref:
153
+ zip_ref.extractall(extraction_folder)
154
+
155
+ index_filepath, model_filepath = None, None
156
+ for root, _, files in os.walk(extraction_folder):
157
+ for name in files:
158
+ if (
159
+ name.endswith(".index")
160
+ and os.stat(os.path.join(root, name)).st_size > 1024 * 100
161
+ ):
162
+ index_filepath = os.path.join(root, name)
163
+
164
+ if (
165
+ name.endswith(".pth")
166
+ and os.stat(os.path.join(root, name)).st_size > 1024 * 1024 * 40
167
+ ):
168
+ model_filepath = os.path.join(root, name)
169
+
170
+ if not model_filepath:
171
+ raise PathNotFoundError(
172
+ "No .pth model file was found in the extracted zip folder."
173
+ )
174
+ # move model and index file to extraction folder
175
+
176
+ os.rename(
177
+ model_filepath,
178
+ os.path.join(extraction_folder, os.path.basename(model_filepath)),
179
+ )
180
+ if index_filepath:
181
+ os.rename(
182
+ index_filepath,
183
+ os.path.join(extraction_folder, os.path.basename(index_filepath)),
184
+ )
185
+
186
+ # remove any unnecessary nested folders
187
+ for filepath in os.listdir(extraction_folder):
188
+ if os.path.isdir(os.path.join(extraction_folder, filepath)):
189
+ shutil.rmtree(os.path.join(extraction_folder, filepath))
190
+
191
+ except Exception as e:
192
+ if os.path.isdir(extraction_folder):
193
+ shutil.rmtree(extraction_folder)
194
+ raise e
195
+ finally:
196
+ if remove_zip and os.path.exists(zip_name):
197
+ os.remove(zip_name)
198
+
199
+
200
+ def download_online_model(
201
+ url: str,
202
+ dir_name: str,
203
+ progress_bar: gr.Progress | None = None,
204
+ percentages: tuple[float, float] = (0.0, 0.5),
205
+ ) -> str:
206
+ """
207
+ Download a voice model from a given URL and extract it to a directory.
208
+
209
+ Parameters
210
+ ----------
211
+ url : str
212
+ The URL of the voice model to download.
213
+ dir_name : str
214
+ The name of the directory to extract the voice model to.
215
+ progress_bar : gr.Progress, optional
216
+ Gradio progress bar to update.
217
+ percentages : tuple[float, float], default=(0.0, 0.5)
218
+ Percentages to display in the progress bar.
219
+
220
+ Returns
221
+ -------
222
+ str
223
+ Success message.
224
+
225
+ Raises
226
+ ------
227
+ InputMissingError
228
+ If an URL or a voice model directory name is not given.
229
+ PathExistsError
230
+ If the voice model directory already exists.
231
+ """
232
+ if not url:
233
+ raise InputMissingError("Download link to model missing!")
234
+ if not dir_name:
235
+ raise InputMissingError("Model name missing!")
236
+ extraction_folder = os.path.join(RVC_MODELS_DIR, dir_name)
237
+ if os.path.exists(extraction_folder):
238
+ raise PathExistsError(
239
+ f'Voice model directory "{dir_name}" already exists! Choose a different'
240
+ " name for your voice model."
241
+ )
242
+ zip_name = url.split("/")[-1].split("?")[0]
243
+
244
+ # NOTE in case huggingface link is a direct link rather
245
+ # than a resolve link then convert it to a resolve link
246
+ url = re.sub(
247
+ r"https://huggingface.co/([^/]+)/([^/]+)/blob/(.*)",
248
+ r"https://huggingface.co/\1/\2/resolve/\3",
249
+ url,
250
+ )
251
+ if "pixeldrain.com" in url:
252
+ url = f"https://pixeldrain.com/api/file/{zip_name}"
253
+
254
+ display_progress(
255
+ f"[~] Downloading voice model with name '{dir_name}'...",
256
+ percentages[0],
257
+ progress_bar,
258
+ )
259
+
260
+ urllib.request.urlretrieve(url, zip_name)
261
+
262
+ display_progress("[~] Extracting zip file...", percentages[1], progress_bar)
263
+
264
+ _extract_model_zip(extraction_folder, zip_name, remove_zip=True)
265
+ return f"[+] Model with name '{dir_name}' successfully downloaded!"
266
+
267
+
268
+ def upload_local_model(
269
+ input_paths: list[str],
270
+ dir_name: str,
271
+ progress_bar: gr.Progress | None = None,
272
+ percentage: float = 0.0,
273
+ ) -> str:
274
+ """
275
+ Upload a voice model from either a local zip file or a local .pth file
276
+ and an optional index file.
277
+
278
+ Parameters
279
+ ----------
280
+ input_paths : list[str]
281
+ Paths of the local files to upload.
282
+ dir_name : str
283
+ The name of the directory to save the voice model files in.
284
+ progress_bar : gr.Progress, optional
285
+ Gradio progress bar to update.
286
+ percentage : float, default=0.0
287
+ Percentage to display in the progress bar.
288
+
289
+ Returns
290
+ -------
291
+ str
292
+ Success message.
293
+
294
+ Raises
295
+ ------
296
+ InputMissingError
297
+ If no file paths or no voice model directory name is given.
298
+ ValueError
299
+ If more than two file paths are given.
300
+ PathExistsError
301
+ If a voice model directory by the given name already exists.
302
+ FileTypeError
303
+ If a single uploaded file is not a .pth file or a .zip file.
304
+ If two uploaded files are not a .pth file and an .index file.
305
+ """
306
+ if not input_paths:
307
+ raise InputMissingError("No files selected!")
308
+ if len(input_paths) > 2:
309
+ raise ValueError("At most two files can be uploaded!")
310
+ if not dir_name:
311
+ raise InputMissingError("Model name missing!")
312
+ output_folder = os.path.join(RVC_MODELS_DIR, dir_name)
313
+ if os.path.exists(output_folder):
314
+ raise PathExistsError(
315
+ f'Voice model directory "{dir_name}" already exists! Choose a different'
316
+ " name for your voice model."
317
+ )
318
+ if len(input_paths) == 1:
319
+ input_path = input_paths[0]
320
+ if os.path.splitext(input_path)[1] == ".pth":
321
+ display_progress("[~] Copying .pth file ...", percentage, progress_bar)
322
+ copy_files_to_new_folder(input_paths, output_folder)
323
+ # NOTE a .pth file is actually itself a zip file
324
+ elif zipfile.is_zipfile(input_path):
325
+ display_progress("[~] Extracting zip file...", percentage, progress_bar)
326
+ _extract_model_zip(output_folder, input_path, remove_zip=False)
327
+ else:
328
+ raise FileTypeError(
329
+ "Only a .pth file or a .zip file can be uploaded by itself!"
330
+ )
331
+ else:
332
+ # sort two input files by extension type
333
+ input_names_sorted = sorted(input_paths, key=lambda f: os.path.splitext(f)[1])
334
+ index_name, pth_name = input_names_sorted
335
+ if (
336
+ os.path.splitext(pth_name)[1] == ".pth"
337
+ and os.path.splitext(index_name)[1] == ".index"
338
+ ):
339
+ display_progress(
340
+ "[~] Copying .pth file and index file ...", percentage, progress_bar
341
+ )
342
+ copy_files_to_new_folder(input_paths, output_folder)
343
+ else:
344
+ raise FileTypeError(
345
+ "Only a .pth file and an .index file can be uploaded together!"
346
+ )
347
+
348
+ return f"[+] Model with name '{dir_name}' successfully uploaded!"
349
+
350
+
351
+ def delete_models(
352
+ model_names: list[str],
353
+ progress_bar: gr.Progress | None = None,
354
+ percentage: float = 0.0,
355
+ ) -> str:
356
+ """
357
+ Delete one or more voice models.
358
+
359
+ Parameters
360
+ ----------
361
+ model_names : list[str]
362
+ Names of the models to delete.
363
+ progress_bar : gr.Progress, optional
364
+ Gradio progress bar to update.
365
+ percentage : float, default=0.0
366
+ Percentage to display in the progress bar.
367
+
368
+ Returns
369
+ -------
370
+ str
371
+ Success message.
372
+
373
+ Raises
374
+ ------
375
+ InputMissingError
376
+ If no model names are given.
377
+ PathNotFoundError
378
+ If a voice model directory does not exist.
379
+ """
380
+ if not model_names:
381
+ raise InputMissingError("No models selected!")
382
+ display_progress("[~] Deleting selected models ...", percentage, progress_bar)
383
+ for model_name in model_names:
384
+ model_dir = os.path.join(RVC_MODELS_DIR, model_name)
385
+ if not os.path.isdir(model_dir):
386
+ raise PathNotFoundError(
387
+ f'Voice model directory "{model_name}" does not exist!'
388
+ )
389
+ shutil.rmtree(model_dir)
390
+ models_names_formatted = [f"'{w}'" for w in model_names]
391
+ if len(model_names) == 1:
392
+ return f"[+] Model with name {models_names_formatted[0]} successfully deleted!"
393
+ else:
394
+ first_models = ", ".join(models_names_formatted[:-1])
395
+ last_model = models_names_formatted[-1]
396
+ return (
397
+ f"[+] Models with names {first_models} and {last_model} successfully"
398
+ " deleted!"
399
+ )
400
+
401
+
402
+ def delete_all_models(
403
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
404
+ ) -> str:
405
+ """
406
+ Delete all voice models.
407
+
408
+ Parameters
409
+ ----------
410
+ progress_bar : gr.Progress, optional
411
+ Gradio progress bar to update.
412
+ percentage : float, default=0.0
413
+ Percentage to display in the progress bar.
414
+
415
+ Returns
416
+ -------
417
+ str
418
+ Success message.
419
+ """
420
+ all_models = get_current_models()
421
+ display_progress("[~] Deleting all models ...", percentage, progress_bar)
422
+ for model_name in all_models:
423
+ model_dir = os.path.join(RVC_MODELS_DIR, model_name)
424
+ if os.path.isdir(model_dir):
425
+ shutil.rmtree(model_dir)
426
+ return "[+] All models successfully deleted!"
src/cli.py ADDED
@@ -0,0 +1,219 @@
1
+ import argparse
2
+
3
+ from backend.generate_song_cover import run_pipeline
4
+
5
+ if __name__ == "__main__":
6
+ parser = argparse.ArgumentParser(
7
+ description="Generate a cover song in the song_output/id directory.",
8
+ add_help=True,
9
+ )
10
+ parser.add_argument(
11
+ "-i",
12
+ "--song-input",
13
+ type=str,
14
+ required=True,
15
+ help=(
16
+ "Link to a song on YouTube, the full path of a local audio file or a cached"
17
+ " input song"
18
+ ),
19
+ )
20
+ parser.add_argument(
21
+ "-dir",
22
+ "--rvc-dirname",
23
+ type=str,
24
+ required=True,
25
+ help=(
26
+ "Name of the folder in the models/rvc directory containing the RVC model"
27
+ " file and optional index file to use"
28
+ ),
29
+ )
30
+ parser.add_argument(
31
+ "-pv",
32
+ "--pitch-change-vocals",
33
+ type=int,
34
+ required=True,
35
+ help=(
36
+ "Shift the pitch of converted vocals only. Measured in octaves. Generally,"
37
+ " use 1 for male to female and -1 for vice-versa."
38
+ ),
39
+ )
40
+ parser.add_argument(
41
+ "-pall",
42
+ "--pitch-change-all",
43
+ type=int,
44
+ default=0,
45
+ help=(
46
+ "Shift pitch of converted vocals, backup vocals and instrumentals. Measured"
47
+ " in semi-tones. Altering this slightly reduces sound quality"
48
+ ),
49
+ )
50
+ parser.add_argument(
51
+ "-ir",
52
+ "--index-rate",
53
+ type=float,
54
+ default=0.5,
55
+ help=(
56
+ "A decimal number e.g. 0.5, used to reduce/resolve the timbre leakage"
57
+ " problem. If set to 1, more biased towards the timbre quality of the"
58
+ " training dataset"
59
+ ),
60
+ )
61
+ parser.add_argument(
62
+ "-fr",
63
+ "--filter-radius",
64
+ type=int,
65
+ default=3,
66
+ help=(
67
+ "A number between 0 and 7. If >=3: apply median filtering to the harvested"
68
+ " pitch results. The value represents the filter radius and can reduce"
69
+ " breathiness."
70
+ ),
71
+ )
72
+ parser.add_argument(
73
+ "-rms",
74
+ "--rms-mix-rate",
75
+ type=float,
76
+ default=0.25,
77
+ help=(
78
+ "A decimal number e.g. 0.25. Control how much to use the loudness of the"
79
+ " input vocals (0) or a fixed loudness (1)."
80
+ ),
81
+ )
82
+ parser.add_argument(
83
+ "-pro",
84
+ "--protect",
85
+ type=float,
86
+ default=0.33,
87
+ help=(
88
+ "A decimal number e.g. 0.33. Protect voiceless consonants and breath sounds"
89
+ " to prevent artifacts such as tearing in electronic music. Set to 0.5 to"
90
+ " disable. Decrease the value to increase protection, but it may reduce"
91
+ " indexing accuracy."
92
+ ),
93
+ )
94
+ parser.add_argument(
95
+ "-palgo",
96
+ "--pitch-detection-algo",
97
+ type=str,
98
+ default="rmvpe",
99
+ help=(
100
+ "Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother"
101
+ " vocals)."
102
+ ),
103
+ )
104
+ parser.add_argument(
105
+ "-hop",
106
+ "--crepe-hop-length",
107
+ type=int,
108
+ default=128,
109
+ help=(
110
+ "If pitch detection algo is mangio-crepe, controls how often it checks for"
111
+ " pitch changes in milliseconds. The higher the value, the faster the"
112
+ " conversion and less risk of voice cracks, but there is less pitch"
113
+ " accuracy. Recommended: 128."
114
+ ),
115
+ )
116
+ parser.add_argument(
117
+ "-rsize",
118
+ "--reverb-size",
119
+ type=float,
120
+ default=0.15,
121
+ help="Reverb room size between 0 and 1",
122
+ )
123
+ parser.add_argument(
124
+ "-rwet",
125
+ "--reverb-wetness",
126
+ type=float,
127
+ default=0.2,
128
+ help="Reverb wet level between 0 and 1",
129
+ )
130
+ parser.add_argument(
131
+ "-rdry",
132
+ "--reverb-dryness",
133
+ type=float,
134
+ default=0.8,
135
+ help="Reverb dry level between 0 and 1",
136
+ )
137
+ parser.add_argument(
138
+ "-rdamp",
139
+ "--reverb-damping",
140
+ type=float,
141
+ default=0.7,
142
+ help="Reverb damping between 0 and 1",
143
+ )
144
+ parser.add_argument(
145
+ "-mv",
146
+ "--main-vol",
147
+ type=int,
148
+ default=0,
149
+ help=(
150
+ "Volume change for converted main vocals. Measured in dB. Use -3 to"
151
+ " decrease by 3 dB and 3 to increase by 3 dB"
152
+ ),
153
+ )
154
+ parser.add_argument(
155
+ "-bv",
156
+ "--backup-vol",
157
+ type=int,
158
+ default=0,
159
+ help="Volume change for backup vocals. Measured in dB",
160
+ )
161
+ parser.add_argument(
162
+ "-iv",
163
+ "--inst-vol",
164
+ type=int,
165
+ default=0,
166
+ help="Volume change for instrumentals. Measured in dB",
167
+ )
168
+ parser.add_argument(
169
+ "-osr",
170
+ "--output-sr",
171
+ type=int,
172
+ default=44100,
173
+ help="Sample rate of output audio file.",
174
+ )
175
+ parser.add_argument(
176
+ "-oformat",
177
+ "--output-format",
178
+ type=str,
179
+ default="mp3",
180
+ help="format of output audio file",
181
+ )
182
+ parser.add_argument(
183
+ "-k",
184
+ "--keep-files",
185
+ action=argparse.BooleanOptionalAction,
186
+ default=True,
187
+ help=(
188
+ "Whether to keep song directory with intermediate audio files generated"
189
+ " during song cover generation."
190
+ ),
191
+ )
192
+ args = parser.parse_args()
193
+
194
+ rvc_dirname = args.rvc_dirname
195
+
196
+ song_cover_path = run_pipeline(
197
+ song_input=args.song_input,
198
+ voice_model=rvc_dirname,
199
+ pitch_change_vocals=args.pitch_change_vocals,
200
+ pitch_change_all=args.pitch_change_all,
201
+ index_rate=args.index_rate,
202
+ filter_radius=args.filter_radius,
203
+ rms_mix_rate=args.rms_mix_rate,
204
+ protect=args.protect,
205
+ f0_method=args.pitch_detection_algo,
206
+ crepe_hop_length=args.crepe_hop_length,
207
+ reverb_rm_size=args.reverb_size,
208
+ reverb_wet=args.reverb_wetness,
209
+ reverb_dry=args.reverb_dryness,
210
+ reverb_damping=args.reverb_damping,
211
+ main_gain=args.main_vol,
212
+ backup_gain=args.backup_vol,
213
+ inst_gain=args.inst_vol,
214
+ output_sr=args.output_sr,
215
+ output_format=args.output_format,
216
+ return_files=False,
217
+ progress_bar=None,
218
+ )
219
+ print(f"[+] Cover generated at {song_cover_path}")
src/common.py ADDED
@@ -0,0 +1,10 @@
1
+ """Common variables used in the Ultimate-RVC project."""
2
+
3
+ import os
4
+
5
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
6
+ MODELS_DIR = os.path.join(BASE_DIR, "models")
7
+ RVC_MODELS_DIR = os.path.join(MODELS_DIR, "rvc")
8
+ SEPARATOR_MODELS_DIR = os.path.join(MODELS_DIR, "audio_separator")
9
+ AUDIO_DIR = os.path.join(BASE_DIR, "audio")
10
+ GRADIO_TEMP_DIR = os.path.join(AUDIO_DIR, "gradio_temp")
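
A small sketch of how the constants above resolve relative to the project root:

    from common import BASE_DIR, RVC_MODELS_DIR, GRADIO_TEMP_DIR

    print(BASE_DIR)         # <project root>
    print(RVC_MODELS_DIR)   # <project root>/models/rvc
    print(GRADIO_TEMP_DIR)  # <project root>/audio/gradio_temp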
src/frontend/common.py ADDED
@@ -0,0 +1,466 @@
1
+ """
2
+ Module containing common utility functions and classes for the frontend.
3
+ """
4
+
5
+ from typing import Any, Callable, Concatenate, Literal, Sequence
6
+ from typings.extra import (
7
+ ComponentVisibilityKwArgs,
8
+ DropdownChoices,
9
+ DropdownValue,
10
+ F0Method,
11
+ P,
12
+ T,
13
+ TextBoxArgs,
14
+ UpdateDropdownArgs,
15
+ )
16
+
17
+ from dataclasses import dataclass
18
+ from functools import partial
19
+
20
+ import gradio as gr
21
+ from gradio.components.base import Component
22
+ from gradio.events import Dependency
23
+
24
+ from backend.generate_song_cover import get_named_song_dirs, get_song_cover_name
25
+ from backend.manage_audio import get_output_audio
26
+
27
+ PROGRESS_BAR = gr.Progress()
28
+
29
+
30
+ def exception_harness(fun: Callable[P, T]) -> Callable[P, T]:
31
+ """
32
+ Wrap a function in a harness that catches exceptions
33
+ and re-raises them as instances of `gradio.Error`.
34
+
35
+ Parameters
36
+ ----------
37
+ fun : Callable[P, T]
38
+ The function to wrap.
39
+
40
+ Returns
41
+ -------
42
+ Callable[P, T]
43
+ The wrapped function.
44
+ """
45
+
46
+ def _wrapped_fun(*args: P.args, **kwargs: P.kwargs) -> T:
47
+ try:
48
+ return fun(*args, **kwargs)
49
+ except Exception as e:
50
+ raise gr.Error(str(e))
51
+
52
+ return _wrapped_fun
53
+
54
+
55
+ def confirmation_harness(fun: Callable[P, T]) -> Callable[Concatenate[bool, P], T]:
56
+ """
57
+ Wrap a function in a harness that requires a confirmation
58
+ before executing and catches exceptions,
59
+ re-raising them as instances of `gradio.Error`.
60
+
61
+ Parameters
62
+ ----------
63
+ fun : Callable[P, T]
64
+ The function to wrap.
65
+
66
+ Returns
67
+ -------
68
+ Callable[Concatenate[bool, P], T]
69
+ The wrapped function.
70
+ """
71
+
72
+ def _wrapped_fun(confirm: bool, *args: P.args, **kwargs: P.kwargs) -> T:
73
+ if confirm:
74
+ return exception_harness(fun)(*args, **kwargs)
75
+ else:
76
+ raise gr.Error("Confirmation missing!")
77
+
78
+ return _wrapped_fun
79
+
80
+
81
+ def confirm_box_js(msg: str) -> str:
82
+ """
83
+ Generate JavaScript code for a confirmation box.
84
+
85
+ Parameters
86
+ ----------
87
+ msg : str
88
+ Message to display in the confirmation box.
89
+
90
+ Returns
91
+ -------
92
+ str
93
+ JavaScript code for the confirmation box.
94
+ """
95
+ formatted_msg = f"'{msg}'"
96
+ return f"(x) => confirm({formatted_msg})"
97
+
98
+
99
+ def identity(x: T) -> T:
100
+ """
101
+ Identity function.
102
+
103
+ Parameters
104
+ ----------
105
+ x : T
106
+ Value to return.
107
+
108
+ Returns
109
+ -------
110
+ T
111
+ The value.
112
+ """
113
+ return x
114
+
115
+
116
+ def update_value(x: Any) -> dict[str, Any]:
117
+ """
118
+ Update the value of a component.
119
+
120
+ Parameters
121
+ ----------
122
+ x : Any
123
+ New value for the component.
124
+
125
+ Returns
126
+ -------
127
+ dict[str, Any]
128
+ Dictionary which updates the value of the component.
129
+ """
130
+ return gr.update(value=x)
131
+
132
+
133
+ def update_dropdowns(
134
+ fn: Callable[P, DropdownChoices],
135
+ num_components: int,
136
+ value: DropdownValue = None,
137
+ value_indices: Sequence[int] = [],
138
+ *args: P.args,
139
+ **kwargs: P.kwargs,
140
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
141
+ """
142
+ Update the choices and optionally the value of one or more dropdown components.
143
+
144
+ Parameters
145
+ ----------
146
+ fn : Callable[P, DropdownChoices]
147
+ Function to get updated choices for the dropdown components.
148
+ num_components : int
149
+ Number of dropdown components to update.
150
+ value : DropdownValue, optional
151
+ New value for dropdown components.
152
+ value_indices : Sequence[int], default=[]
153
+ Indices of dropdown components to update the value for.
154
+ args : P.args
155
+ Positional arguments to pass to the function used to update choices.
156
+ kwargs : P.kwargs
157
+ Keyword arguments to pass to the function used to update choices.
158
+
159
+ Returns
160
+ -------
161
+ gr.Dropdown|tuple[gr.Dropdown,...]
162
+ Updated dropdown component or components.
163
+
164
+ Raises
165
+ ------
166
+ ValueError
167
+ If value indices are not unique or if an index exceeds the number of components.
168
+ """
169
+ if len(value_indices) != len(set(value_indices)):
170
+ raise ValueError("Value indices must be unique.")
171
+ if value_indices and max(value_indices) >= num_components:
172
+ raise ValueError(
173
+ "Index of a component to update value for exceeds number of components."
174
+ )
175
+ updated_choices = fn(*args, **kwargs)
176
+ update_args: list[UpdateDropdownArgs] = [
177
+ {"choices": updated_choices} for _ in range(num_components)
178
+ ]
179
+ for index in value_indices:
180
+ update_args[index]["value"] = value
181
+ if len(update_args) == 1:
182
+ # NOTE This is a workaround as gradio does not support
183
+ # singleton tuples for components.
184
+ return gr.Dropdown(**update_args[0])
185
+ return tuple(gr.Dropdown(**update_arg) for update_arg in update_args)
186
+
187
+
188
+ def update_cached_input_songs(
189
+ num_components: int, value: DropdownValue = None, value_indices: Sequence[int] = []
190
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
191
+ """
192
+ Update the choices of one or more dropdown components
193
+ to the current set of cached input songs.
194
+
195
+ Optionally updates the default value of one or more of these components.
196
+
197
+ Parameters
198
+ ----------
199
+ num_components : int
200
+ Number of dropdown components to update.
201
+ value : DropdownValue, optional
202
+ New value for dropdown components.
203
+ value_indices : Sequence[int], default=[]
204
+ Indices of dropdown components to update the value for.
205
+
206
+ Returns
207
+ -------
208
+ gr.Dropdown|tuple[gr.Dropdown,...]
209
+ Updated dropdown component or components.
210
+ """
211
+ return update_dropdowns(get_named_song_dirs, num_components, value, value_indices)
212
+
213
+
214
+ def update_output_audio(
215
+ num_components: int, value: DropdownValue = None, value_indices: Sequence[int] = []
216
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
217
+ """
218
+ Updates the choices of one or more dropdown
219
+ components to the current set of output audio files.
220
+
221
+ Optionally updates the default value of one or more of these components.
222
+
223
+ Parameters
224
+ ----------
225
+ num_components : int
226
+ Number of dropdown components to update.
227
+ value : DropdownValue, optional
228
+ New value for dropdown components.
229
+ value_indices : Sequence[int], default=[]
230
+ Indices of dropdown components to update the value for.
231
+
232
+ Returns
233
+ -------
234
+ gr.Dropdown|tuple[gr.Dropdown,...]
235
+ Updated dropdown component or components.
236
+ """
237
+ return update_dropdowns(get_output_audio, num_components, value, value_indices)
238
+
239
+
240
+ def toggle_visible_component(
241
+ num_components: int, visible_index: int
242
+ ) -> dict[str, Any] | tuple[dict[str, Any], ...]:
243
+ """
244
+ Reveal a single component from a set of components.
245
+ All other components are hidden.
246
+
247
+ Parameters
248
+ ----------
249
+ num_components : int
250
+ Number of components to set visibility for.
251
+ visible_index : int
252
+ Index of the component to reveal.
253
+
254
+ Returns
255
+ -------
256
+ dict|tuple[dict,...]
257
+ A single dictionary or a tuple of dictionaries
258
+ that update the visibility of the components.
259
+ """
260
+ if visible_index >= num_components:
261
+ raise ValueError("Visible index must be less than number of components.")
262
+ update_args: list[ComponentVisibilityKwArgs] = [
263
+ {"visible": False, "value": None} for _ in range(num_components)
264
+ ]
265
+ update_args[visible_index]["visible"] = True
266
+ if num_components == 1:
267
+ return gr.update(**update_args[0])
268
+ return tuple(gr.update(**update_arg) for update_arg in update_args)
269
+
270
+
271
+ def _toggle_component_interactivity(
272
+ num_components: int, interactive: bool
273
+ ) -> dict[str, Any] | tuple[dict[str, Any], ...]:
274
+ """
275
+ Toggle interactivity of one or more components.
276
+
277
+ Parameters
278
+ ----------
279
+ num_components : int
280
+ Number of components to toggle interactivity for.
281
+ interactive : bool
282
+ Whether to make the components interactive or not.
283
+
284
+ Returns
285
+ -------
286
+ dict|tuple[dict,...]
287
+ A single dictionary or a tuple of dictionaries
288
+ that update the interactivity of the components.
289
+ """
290
+ if num_components == 1:
291
+ return gr.update(interactive=interactive)
292
+ return tuple(gr.update(interactive=interactive) for _ in range(num_components))
293
+
294
+
295
+ def show_hop_slider(pitch_detection_algo: F0Method) -> gr.Slider:
296
+ """
297
+ Show or hide a slider component based on the given pitch extraction algorithm.
298
+
299
+ Parameters
300
+ ----------
301
+ pitch_detection_algo : F0Method
302
+ Pitch detection algorithm to determine visibility of the slider.
303
+
304
+ Returns
305
+ -------
306
+ gr.Slider
307
+ Slider component with visibility set accordingly.
308
+ """
309
+ if pitch_detection_algo == "mangio-crepe":
310
+ return gr.Slider(visible=True)
311
+ else:
312
+ return gr.Slider(visible=False)
313
+
314
+
315
+ def update_song_cover_name(
316
+ mixed_vocals: str | None = None,
317
+ song_dir: str | None = None,
318
+ voice_model: str | None = None,
319
+ update_placeholder: bool = False,
320
+ ) -> gr.Textbox:
321
+ """
322
+ Update a textbox component so that it displays a suitable name for a cover of
323
+ a given song.
324
+
325
+ If the path of an existing song directory is provided, the original song
326
+ name is inferred from that directory. If a voice model is not provided
327
+ but the path of an existing song directory and the path of a mixed vocals file
328
+ in that directory are provided, then the voice model is inferred from
329
+ the mixed vocals file.
330
+
331
+
332
+ Parameters
333
+ ----------
334
+ mixed_vocals : str, optional
335
+ The path to a mixed vocals file.
336
+ song_dir : str, optional
337
+ The path to a song directory.
338
+ voice_model : str, optional
339
+ The name of a voice model.
340
+ update_placeholder : bool, default=False
341
+ Whether to update the placeholder text of the textbox component.
342
+
343
+ Returns
344
+ -------
345
+ gr.Textbox
346
+ Updated textbox component.
347
+ """
348
+ update_args: TextBoxArgs = {}
349
+ update_key = "placeholder" if update_placeholder else "value"
350
+ if mixed_vocals or song_dir or voice_model:
351
+ name = exception_harness(get_song_cover_name)(
352
+ mixed_vocals, song_dir, voice_model, progress_bar=PROGRESS_BAR
353
+ )
354
+ update_args[update_key] = name
355
+ else:
356
+ update_args[update_key] = None
357
+ return gr.Textbox(**update_args)
358
+
359
+
360
+ @dataclass
361
+ class EventArgs:
362
+ """
363
+ Data class to store arguments for setting up event listeners.
364
+
365
+ Attributes
366
+ ----------
367
+ fn : Callable[..., Any]
368
+ Function to call when an event is triggered.
369
+ inputs : Sequence[Component], optional
370
+ Components to serve as inputs to the function.
371
+ outputs : Sequence[Component], optional
372
+ Components where to store the outputs of the function.
373
+ name : Literal["click", "success", "then"], default="success"
374
+ Name of the event to listen for.
375
+ show_progress : Literal["full", "minimal", "hidden"], default="full"
376
+ Level of progress bar to show when the event is triggered.
377
+ """
378
+
379
+ fn: Callable[..., Any]
380
+ inputs: Sequence[Component] | None = None
381
+ outputs: Sequence[Component] | None = None
382
+ name: Literal["click", "success", "then"] = "success"
383
+ show_progress: Literal["full", "minimal", "hidden"] = "full"
384
+
385
+
386
+ def setup_consecutive_event_listeners(
387
+ component: Component, event_args_list: list[EventArgs]
388
+ ) -> Dependency | Component:
389
+ """
390
+ Set up a chain of event listeners on a component.
391
+
392
+ Parameters
393
+ ----------
394
+ component : Component
395
+ The component to set up event listeners on.
396
+ event_args_list : list[EventArgs]
397
+ List of event arguments to set up event listeners with.
398
+
399
+ Returns
400
+ -------
401
+ Dependency | Component
402
+ The last dependency in the chain of event listeners.
403
+ """
404
+ if len(event_args_list) == 0:
405
+ raise ValueError("Event args list must not be empty.")
406
+ dependency = component
407
+ for event_args in event_args_list:
408
+ event_listener = getattr(dependency, event_args.name)
409
+ dependency = event_listener(
410
+ event_args.fn,
411
+ inputs=event_args.inputs,
412
+ outputs=event_args.outputs,
413
+ show_progress=event_args.show_progress,
414
+ )
415
+ return dependency
416
+
417
+
418
+ def setup_consecutive_event_listeners_with_toggled_interactivity(
419
+ component: Component,
420
+ event_args_list: list[EventArgs],
421
+ toggled_components: Sequence[Component],
422
+ ) -> Dependency | Component:
423
+ """
424
+ Set up a chain of event listeners on a component
425
+ with interactivity toggled for a set of other components.
426
+
427
+ While the chain of event listeners is being executed,
428
+ the other components are made non-interactive.
429
+ When the chain of event listeners is completed,
430
+ the other components are made interactive again.
431
+
432
+ Parameters
433
+ ----------
434
+ component : Component
435
+ The component to set up event listeners on.
436
+
437
+ event_args_list : list[EventArgs]
438
+ List of event arguments to set up event listeners with.
439
+
440
+ toggled_components : Sequence[Component]
441
+ Components to toggle interactivity for.
442
+
443
+ Returns
444
+ -------
445
+ Dependency | Component
446
+ The last dependency in the chain of event listeners.
447
+ """
448
+ if len(event_args_list) == 0:
449
+ raise ValueError("Event args list must not be empty.")
450
+
451
+ disable_event_args = EventArgs(
452
+ partial(_toggle_component_interactivity, len(toggled_components), False),
453
+ outputs=toggled_components,
454
+ name="click",
455
+ show_progress="hidden",
456
+ )
457
+ enable_event_args = EventArgs(
458
+ partial(_toggle_component_interactivity, len(toggled_components), True),
459
+ outputs=toggled_components,
460
+ name="then",
461
+ show_progress="hidden",
462
+ )
463
+ event_args_list_augmented = (
464
+ [disable_event_args] + event_args_list + [enable_event_args]
465
+ )
466
+ return setup_consecutive_event_listeners(component, event_args_list_augmented)
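
A minimal usage sketch of the event-chaining helpers above; the components and handler functions are illustrative and not taken from the app:

    import gradio as gr
    from frontend.common import (
        EventArgs,
        setup_consecutive_event_listeners_with_toggled_interactivity,
    )

    with gr.Blocks() as demo:
        name_box = gr.Textbox(label="Name")
        msg_box = gr.Textbox(label="Message", interactive=False)
        btn = gr.Button("Run")

        # Two chained events: greet, then upper-case the greeting. While the
        # chain runs, the name textbox is disabled and re-enabled afterwards.
        event_args_list = [
            EventArgs(lambda name: f"Hello, {name}!", inputs=[name_box], outputs=[msg_box]),
            EventArgs(lambda msg: msg.upper(), inputs=[msg_box], outputs=[msg_box]),
        ]
        setup_consecutive_event_listeners_with_toggled_interactivity(
            btn, event_args_list, [name_box]
        )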
src/frontend/tabs/manage_audio.py ADDED
@@ -0,0 +1,216 @@
1
+ """
2
+ This module contains the code for the "Delete audio" tab.
3
+ """
4
+
5
+ from functools import partial
6
+
7
+ import gradio as gr
8
+
9
+ from backend.manage_audio import (
10
+ delete_all_audio,
11
+ delete_all_intermediate_audio,
12
+ delete_all_output_audio,
13
+ delete_intermediate_audio,
14
+ delete_output_audio,
15
+ )
16
+
17
+ from frontend.common import (
18
+ PROGRESS_BAR,
19
+ confirm_box_js,
20
+ confirmation_harness,
21
+ identity,
22
+ update_cached_input_songs,
23
+ update_output_audio,
24
+ )
25
+
26
+
27
+ def render(
28
+ dummy_deletion_checkbox: gr.Checkbox,
29
+ delete_confirmation: gr.State,
30
+ song_dir_dropdowns: list[gr.Dropdown],
31
+ cached_input_songs_dropdown_1click: gr.Dropdown,
32
+ cached_input_songs_dropdown_multi: gr.Dropdown,
33
+ intermediate_audio_to_delete: gr.Dropdown,
34
+ output_audio_to_delete: gr.Dropdown,
35
+ ) -> None:
36
+ """
37
+ Render "Delete audio" tab.
38
+
39
+ Parameters
40
+ ----------
41
+ dummy_deletion_checkbox : gr.Checkbox
42
+ Dummy component needed for deletion confirmation in the
43
+ "Delete audio" tab and the "Manage models" tab.
44
+ delete_confirmation : gr.State
45
+ Component storing deletion confirmation status in the
46
+ "Delete audio" tab and the "Manage models" tab.
47
+ song_dir_dropdowns : list[gr.Dropdown]
48
+ Dropdowns for selecting song directories in the
49
+ "Multi-step generation" tab.
50
+ cached_input_songs_dropdown_1click : gr.Dropdown
51
+ Dropdown for selecting cached input songs in the
52
+ "One-click generation" tab
53
+ cached_input_songs_dropdown_multi : gr.Dropdown
54
+ Dropdown for selecting cached input songs in the
55
+ "Multi-step generation" tab
56
+ intermediate_audio_to_delete : gr.Dropdown
57
+ Dropdown for selecting intermediate audio files to delete in the
58
+ "Delete audio" tab.
59
+ output_audio_to_delete : gr.Dropdown
60
+ Dropdown for selecting output audio files to delete in the
61
+ "Delete audio" tab.
62
+ """
63
+ with gr.Tab("Delete audio"):
64
+ with gr.Accordion("Intermediate audio", open=False):
65
+ with gr.Row():
66
+ with gr.Column():
67
+ intermediate_audio_to_delete.render()
68
+ delete_intermediate_audio_btn = gr.Button(
69
+ "Delete selected", variant="secondary"
70
+ )
71
+ delete_all_intermediate_audio_btn = gr.Button(
72
+ "Delete all", variant="primary"
73
+ )
74
+ with gr.Row():
75
+ intermediate_audio_delete_msg = gr.Textbox(
76
+ label="Output message", interactive=False
77
+ )
78
+ with gr.Accordion("Output audio", open=False):
79
+ with gr.Row():
80
+ with gr.Column():
81
+ output_audio_to_delete.render()
82
+ delete_output_audio_btn = gr.Button(
83
+ "Delete selected", variant="secondary"
84
+ )
85
+ delete_all_output_audio_btn = gr.Button(
86
+ "Delete all", variant="primary"
87
+ )
88
+ with gr.Row():
89
+ output_audio_delete_msg = gr.Textbox(
90
+ label="Output message", interactive=False
91
+ )
92
+ with gr.Accordion("All audio", open=True):
93
+ with gr.Row():
94
+ delete_all_audio_btn = gr.Button("Delete", variant="primary")
95
+ delete_all_audio_msg = gr.Textbox(
96
+ label="Output message", interactive=False
97
+ )
98
+
99
+ delete_intermediate_audio_click = delete_intermediate_audio_btn.click(
100
+ identity,
101
+ inputs=dummy_deletion_checkbox,
102
+ outputs=delete_confirmation,
103
+ js=confirm_box_js(
104
+ "Are you sure you want to delete intermediate audio files for the"
105
+ " selected songs?"
106
+ ),
107
+ show_progress="hidden",
108
+ ).then(
109
+ partial(
110
+ confirmation_harness(delete_intermediate_audio),
111
+ progress_bar=PROGRESS_BAR,
112
+ ),
113
+ inputs=[delete_confirmation, intermediate_audio_to_delete],
114
+ outputs=intermediate_audio_delete_msg,
115
+ )
116
+
117
+ delete_all_intermediate_audio_click = delete_all_intermediate_audio_btn.click(
118
+ identity,
119
+ inputs=dummy_deletion_checkbox,
120
+ outputs=delete_confirmation,
121
+ js=confirm_box_js(
122
+ "Are you sure you want to delete all intermediate audio files?"
123
+ ),
124
+ show_progress="hidden",
125
+ ).then(
126
+ partial(
127
+ confirmation_harness(delete_all_intermediate_audio),
128
+ progress_bar=PROGRESS_BAR,
129
+ ),
130
+ inputs=delete_confirmation,
131
+ outputs=intermediate_audio_delete_msg,
132
+ )
133
+
134
+ delete_output_audio_click = delete_output_audio_btn.click(
135
+ identity,
136
+ inputs=dummy_deletion_checkbox,
137
+ outputs=delete_confirmation,
138
+ js=confirm_box_js(
139
+ "Are you sure you want to delete the selected output audio files?"
140
+ ),
141
+ show_progress="hidden",
142
+ ).then(
143
+ partial(
144
+ confirmation_harness(delete_output_audio),
145
+ progress_bar=PROGRESS_BAR,
146
+ ),
147
+ inputs=[delete_confirmation, output_audio_to_delete],
148
+ outputs=output_audio_delete_msg,
149
+ )
150
+
151
+ delete_all_output_audio_click = delete_all_output_audio_btn.click(
152
+ identity,
153
+ inputs=dummy_deletion_checkbox,
154
+ outputs=delete_confirmation,
155
+ js=confirm_box_js(
156
+ "Are you sure you want to delete all output audio files?"
157
+ ),
158
+ show_progress="hidden",
159
+ ).then(
160
+ partial(
161
+ confirmation_harness(delete_all_output_audio), progress_bar=PROGRESS_BAR
162
+ ),
163
+ inputs=delete_confirmation,
164
+ outputs=output_audio_delete_msg,
165
+ )
166
+
167
+ delete_all_audio_click = delete_all_audio_btn.click(
168
+ identity,
169
+ inputs=dummy_deletion_checkbox,
170
+ outputs=delete_confirmation,
171
+ js=confirm_box_js("Are you sure you want to delete all audio files?"),
172
+ show_progress="hidden",
173
+ ).then(
174
+ partial(confirmation_harness(delete_all_audio), progress_bar=PROGRESS_BAR),
175
+ inputs=delete_confirmation,
176
+ outputs=delete_all_audio_msg,
177
+ )
178
+
179
+ for click_event in [
180
+ delete_intermediate_audio_click,
181
+ delete_all_intermediate_audio_click,
182
+ ]:
183
+ click_event.success(
184
+ partial(
185
+ update_cached_input_songs, 3 + len(song_dir_dropdowns), [], [0]
186
+ ),
187
+ outputs=[
188
+ intermediate_audio_to_delete,
189
+ cached_input_songs_dropdown_1click,
190
+ cached_input_songs_dropdown_multi,
191
+ *song_dir_dropdowns,
192
+ ],
193
+ show_progress="hidden",
194
+ )
195
+
196
+ for click_event in [delete_output_audio_click, delete_all_output_audio_click]:
197
+ click_event.success(
198
+ partial(update_output_audio, 1, [], [0]),
199
+ outputs=[output_audio_to_delete],
200
+ show_progress="hidden",
201
+ )
202
+
203
+ delete_all_audio_click.success(
204
+ partial(update_output_audio, 1, [], [0]),
205
+ outputs=[output_audio_to_delete],
206
+ show_progress="hidden",
207
+ ).then(
208
+ partial(update_cached_input_songs, 3 + len(song_dir_dropdowns), [], [0]),
209
+ outputs=[
210
+ intermediate_audio_to_delete,
211
+ cached_input_songs_dropdown_1click,
212
+ cached_input_songs_dropdown_multi,
213
+ *song_dir_dropdowns,
214
+ ],
215
+ show_progress="hidden",
216
+ )
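
The "Delete audio" tab above wires every destructive button through the same two-step pattern: the click first runs a browser confirm dialog via confirm_box_js, routes the result through identity into the delete_confirmation state, and only then calls the backend deletion function wrapped in confirmation_harness. Below is a minimal, self-contained sketch of that pattern; the bodies of confirm_box_js and confirmation_harness are assumptions made for illustration (the real implementations live in frontend/common.py and are not shown in this part of the diff), and delete_everything is a hypothetical stand-in for the backend deletion functions.

import gradio as gr


def confirm_box_js(msg: str) -> str:
    # Assumed shape: a JS function string whose return value (the result of
    # window.confirm) replaces the dummy checkbox value before the Python
    # function runs.
    return f"(x) => confirm('{msg}')"


def confirmation_harness(fn):
    # Assumed shape: run fn only if the user pressed OK in the dialog.
    def wrapped(confirmed: bool, *args):
        if not confirmed:
            raise gr.Error("Deletion was not confirmed.")
        return fn(*args)

    return wrapped


def delete_everything() -> str:
    return "All audio deleted."  # hypothetical stand-in for the backend call


with gr.Blocks() as demo:
    dummy_checkbox = gr.Checkbox(value=True, visible=False)
    confirmation = gr.State(False)
    delete_btn = gr.Button("Delete all", variant="primary")
    msg_box = gr.Textbox(label="Output message", interactive=False)

    delete_btn.click(
        lambda x: x,  # identity: moves the JS confirm result into gr.State
        inputs=dummy_checkbox,
        outputs=confirmation,
        js=confirm_box_js("Are you sure you want to delete all audio files?"),
        show_progress="hidden",
    ).then(
        confirmation_harness(delete_everything),
        inputs=confirmation,
        outputs=msg_box,
    )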
src/frontend/tabs/manage_models.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ This module contains the code for the "Manage models" tab.
3
+ """
4
+
5
+ from typings.extra import DropdownValue
6
+
7
+ from functools import partial
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+
12
+ from backend.manage_voice_models import (
13
+ delete_all_models,
14
+ delete_models,
15
+ download_online_model,
16
+ filter_public_models_table,
17
+ get_current_models,
18
+ load_public_model_tags,
19
+ load_public_models_table,
20
+ upload_local_model,
21
+ )
22
+
23
+ from frontend.common import (
24
+ PROGRESS_BAR,
25
+ confirm_box_js,
26
+ confirmation_harness,
27
+ exception_harness,
28
+ identity,
29
+ update_dropdowns,
30
+ )
31
+
32
+
33
+ def _update_model_lists(
34
+ num_components: int, value: DropdownValue = None, value_indices: list[int] = []
35
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
36
+ """
37
+ Updates the choices of one or more dropdown
38
+ components to the current set of voice models.
39
+
40
+ Optionally updates the default value of one or more of these components.
41
+
42
+ Parameters
43
+ ----------
44
+ num_components : int
45
+ Number of dropdown components to update.
46
+ value : DropdownValue, optional
47
+ New value for dropdown components.
48
+ value_indices : list[int], default=[]
49
+ Indices of dropdown components to update the value for.
50
+
51
+ Returns
52
+ -------
53
+ gr.Dropdown | tuple[gr.Dropdown, ...]
54
+ Updated dropdown component or components.
55
+ """
56
+ return update_dropdowns(get_current_models, num_components, value, value_indices)
57
+
58
+
59
+ def _filter_public_models_table_harness(
60
+ tags: list[str], query: str, progress_bar: gr.Progress
61
+ ) -> gr.Dataframe:
62
+ """
63
+ Filter the public models table based on tags and search query.
64
+
65
+ Parameters
66
+ ----------
67
+ tags : list[str]
68
+ Tags to filter the table by.
69
+ query : str
70
+ Search query to filter the table by.
71
+ progress_bar : gr.Progress
72
+ Progress bar to display progress.
73
+
74
+ Returns
75
+ -------
76
+ gr.Dataframe
77
+ The filtered public models table rendered in a Gradio dataframe.
78
+ """
79
+ models_table = filter_public_models_table(tags, query, progress_bar)
80
+ return gr.Dataframe(value=models_table)
81
+
82
+
83
+ def _pub_dl_autofill(
84
+ pub_models: pd.DataFrame, event: gr.SelectData
85
+ ) -> tuple[gr.Textbox, gr.Textbox]:
86
+ """
87
+ Autofill download link and model name based on selected row in public models table.
88
+
89
+ Parameters
90
+ ----------
91
+ pub_models : pd.DataFrame
92
+ Public models table.
93
+ event : gr.SelectData
94
+ Event containing the selected row.
95
+
96
+ Returns
97
+ -------
98
+ download_link : gr.Textbox
99
+ Autofilled download link.
100
+ model_name : gr.Textbox
101
+ Autofilled model name.
102
+ """
103
+ event_index = event.index[0]
104
+ url_str = pub_models.loc[event_index, "URL"]
105
+ model_str = pub_models.loc[event_index, "Model Name"]
106
+
107
+ return gr.Textbox(value=url_str), gr.Textbox(value=model_str)
108
+
109
+
110
+ def render(
111
+ dummy_deletion_checkbox: gr.Checkbox,
112
+ delete_confirmation: gr.State,
113
+ rvc_models_to_delete: gr.Dropdown,
114
+ rvc_model_1click: gr.Dropdown,
115
+ rvc_model_multi: gr.Dropdown,
116
+ ) -> None:
117
+ """
118
+ Render "Manage models" tab.
119
+
120
+ Parameters
121
+ ----------
122
+ dummy_deletion_checkbox : gr.Checkbox
123
+ Dummy component needed for deletion confirmation in the
124
+ "Manage audio" tab and the "Manage models" tab.
125
+ delete_confirmation : gr.State
126
+ Component storing deletion confirmation status in the
127
+ "Manage audio" tab and the "Manage models" tab.
128
+ rvc_models_to_delete : gr.Dropdown
129
+ Dropdown for selecting models to delete in the
130
+ "Manage models" tab.
131
+ rvc_model_1click : gr.Dropdown
132
+ Dropdown for selecting models in the "One-click generation" tab.
133
+ rvc_model_multi : gr.Dropdown
134
+ Dropdown for selecting models in the "Multi-step generation" tab.
135
+ """
136
+
137
+ # Download tab
138
+ with gr.Tab("Download model"):
139
+
140
+ with gr.Accordion("View public models table", open=False):
141
+
142
+ gr.Markdown("")
143
+ gr.Markdown("HOW TO USE")
144
+ gr.Markdown("- Filter models using tags or search bar")
145
+ gr.Markdown("- Select a row to autofill the download link and model name")
146
+
147
+ filter_tags = gr.CheckboxGroup(
148
+ value=[],
149
+ label="Show voice models with tags",
150
+ choices=load_public_model_tags(),
151
+ )
152
+ search_query = gr.Textbox(label="Search")
153
+
154
+ public_models_table = gr.DataFrame(
155
+ value=load_public_models_table([]),
156
+ headers=["Model Name", "Description", "Tags", "Credit", "Added", "URL"],
157
+ label="Available Public Models",
158
+ interactive=False,
159
+ )
160
+
161
+ with gr.Row():
162
+ model_zip_link = gr.Textbox(
163
+ label="Download link to model",
164
+ info=(
165
+ "Should point to a zip file containing a .pth model file and an"
166
+ " optional .index file."
167
+ ),
168
+ )
169
+ model_name = gr.Textbox(
170
+ label="Model name", info="Enter a unique name for the model."
171
+ )
172
+
173
+ with gr.Row():
174
+ download_btn = gr.Button("Download 🌐", variant="primary", scale=19)
175
+ dl_output_message = gr.Textbox(
176
+ label="Output message", interactive=False, scale=20
177
+ )
178
+
179
+ download_button_click = download_btn.click(
180
+ partial(
181
+ exception_harness(download_online_model), progress_bar=PROGRESS_BAR
182
+ ),
183
+ inputs=[model_zip_link, model_name],
184
+ outputs=dl_output_message,
185
+ )
186
+
187
+ public_models_table.select(
188
+ _pub_dl_autofill,
189
+ inputs=public_models_table,
190
+ outputs=[model_zip_link, model_name],
191
+ show_progress="hidden",
192
+ )
193
+ search_query.change(
194
+ partial(
195
+ exception_harness(_filter_public_models_table_harness),
196
+ progress_bar=PROGRESS_BAR,
197
+ ),
198
+ inputs=[filter_tags, search_query],
199
+ outputs=public_models_table,
200
+ show_progress="hidden",
201
+ )
202
+ filter_tags.select(
203
+ partial(
204
+ exception_harness(_filter_public_models_table_harness),
205
+ progress_bar=PROGRESS_BAR,
206
+ ),
207
+ inputs=[filter_tags, search_query],
208
+ outputs=public_models_table,
209
+ show_progress="hidden",
210
+ )
211
+
212
+ # Upload tab
213
+ with gr.Tab("Upload model"):
214
+ with gr.Accordion("HOW TO USE"):
215
+ gr.Markdown(
216
+ "- Find locally trained RVC v2 model file (weights folder) and optional"
217
+ " index file (logs/[name] folder)"
218
+ )
219
+ gr.Markdown(
220
+ "- Upload model file and optional index file directly or compress into"
221
+ " a zip file and upload that"
222
+ )
223
+ gr.Markdown("- Enter a unique name for the model")
224
+ gr.Markdown("- Click 'Upload model'")
225
+
226
+ with gr.Row():
227
+ with gr.Column():
228
+ model_files = gr.File(label="Files", file_count="multiple")
229
+
230
+ local_model_name = gr.Textbox(label="Model name")
231
+
232
+ with gr.Row():
233
+ model_upload_button = gr.Button("Upload model", variant="primary", scale=19)
234
+ local_upload_output_message = gr.Textbox(
235
+ label="Output message", interactive=False, scale=20
236
+ )
237
+ model_upload_button_click = model_upload_button.click(
238
+ partial(
239
+ exception_harness(upload_local_model), progress_bar=PROGRESS_BAR
240
+ ),
241
+ inputs=[model_files, local_model_name],
242
+ outputs=local_upload_output_message,
243
+ )
244
+
245
+ with gr.Tab("Delete models"):
246
+ with gr.Row():
247
+ with gr.Column():
248
+ rvc_models_to_delete.render()
249
+ with gr.Column():
250
+ rvc_models_deleted_message = gr.Textbox(
251
+ label="Output message", interactive=False
252
+ )
253
+
254
+ with gr.Row():
255
+ with gr.Column():
256
+ delete_models_button = gr.Button(
257
+ "Delete selected models", variant="secondary"
258
+ )
259
+ delete_all_models_button = gr.Button(
260
+ "Delete all models", variant="primary"
261
+ )
262
+ with gr.Column():
263
+ pass
264
+ delete_models_button_click = delete_models_button.click(
265
+ # NOTE not sure why, but in order for subsequent event listener
266
+ # to trigger, changes coming from the js code
267
+ # have to be routed through an identity function which takes as
268
+ # input some dummy component of type bool.
269
+ identity,
270
+ inputs=dummy_deletion_checkbox,
271
+ outputs=delete_confirmation,
272
+ js=confirm_box_js("Are you sure you want to delete the selected models?"),
273
+ show_progress="hidden",
274
+ ).then(
275
+ partial(confirmation_harness(delete_models), progress_bar=PROGRESS_BAR),
276
+ inputs=[delete_confirmation, rvc_models_to_delete],
277
+ outputs=rvc_models_deleted_message,
278
+ )
279
+
280
+ delete_all_models_btn_click = delete_all_models_button.click(
281
+ identity,
282
+ inputs=dummy_deletion_checkbox,
283
+ outputs=delete_confirmation,
284
+ js=confirm_box_js("Are you sure you want to delete all models?"),
285
+ show_progress="hidden",
286
+ ).then(
287
+ partial(confirmation_harness(delete_all_models), progress_bar=PROGRESS_BAR),
288
+ inputs=delete_confirmation,
289
+ outputs=rvc_models_deleted_message,
290
+ )
291
+
292
+ for click_event in [
293
+ download_button_click,
294
+ model_upload_button_click,
295
+ delete_models_button_click,
296
+ delete_all_models_btn_click,
297
+ ]:
298
+ click_event.success(
299
+ partial(_update_model_lists, 3, [], [2]),
300
+ outputs=[rvc_model_1click, rvc_model_multi, rvc_models_to_delete],
301
+ show_progress="hidden",
302
+ )
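
Both tab modules shown so far receive their shared dropdowns as arguments and only place them in the layout with .render(). A short sketch of the underlying Gradio pattern follows, with made-up component and function names; the actual wiring is presumably done in src/app.py, which is part of this commit but not shown in this excerpt.

import gradio as gr


def render_tab(shared_models_dropdown: gr.Dropdown) -> None:
    # Place the shared, pre-built component inside this tab's layout.
    with gr.Tab("Example tab"):
        shared_models_dropdown.render()


with gr.Blocks() as app:
    # Built with render=False so several tabs and event listeners can share it.
    models_dropdown = gr.Dropdown(
        choices=["model A", "model B"], label="Voice model", render=False
    )
    render_tab(models_dropdown)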
src/frontend/tabs/multi_step_generation.py ADDED
@@ -0,0 +1,991 @@
1
+ """
2
+ This module contains the code for the "Multi-step generation" tab.
3
+ """
4
+
5
+ from typings.extra import TransferUpdateArgs
6
+
7
+ from functools import partial
8
+
9
+ import gradio as gr
10
+
11
+ from backend.generate_song_cover import (
12
+ convert_vocals,
13
+ dereverb_vocals,
14
+ mix_song_cover,
15
+ pitch_shift_background,
16
+ postprocess_vocals,
17
+ retrieve_song,
18
+ separate_main_vocals,
19
+ separate_vocals,
20
+ )
21
+
22
+ from frontend.common import (
23
+ PROGRESS_BAR,
24
+ EventArgs,
25
+ exception_harness,
26
+ setup_consecutive_event_listeners_with_toggled_interactivity,
27
+ show_hop_slider,
28
+ toggle_visible_component,
29
+ update_cached_input_songs,
30
+ update_output_audio,
31
+ update_song_cover_name,
32
+ update_value,
33
+ )
34
+
35
+
36
+ def _update_audio(
37
+ num_components: int, output_indices: list[int], file_path: str
38
+ ) -> gr.Audio | tuple[gr.Audio, ...]:
39
+ """
40
+ Update the value of a subset of `Audio` components to the given audio file path.
41
+
42
+ Parameters
43
+ ----------
44
+ num_components : int
45
+ The total number of `Audio` components under consideration.
46
+ output_indices : list[int]
47
+ Indices of `Audio` components to update the value for.
48
+ file_path : str
49
+ Path pointing to an audio track to update the indexed `Audio` components with.
50
+
51
+ Returns
52
+ -------
53
+ gr.Audio | tuple[gr.Audio, ...]
54
+ Each `Audio` component under consideration
55
+ with indexed components updated to the given audio file path.
56
+ """
57
+ update_args: list[TransferUpdateArgs] = [{} for _ in range(num_components)]
58
+ for index in output_indices:
59
+ update_args[index]["value"] = file_path
60
+ if num_components == 1:
61
+ return gr.Audio(**update_args[0])
62
+ return tuple(gr.Audio(**update_arg) for update_arg in update_args)
63
+
64
+
65
+ def render(
66
+ generate_buttons: list[gr.Button],
67
+ song_dir_dropdowns: list[gr.Dropdown],
68
+ cached_input_songs_dropdown_1click: gr.Dropdown,
69
+ cached_input_songs_dropdown_multi: gr.Dropdown,
70
+ rvc_model: gr.Dropdown,
71
+ intermediate_audio_to_delete: gr.Dropdown,
72
+ output_audio_to_remove: gr.Dropdown,
73
+ ) -> None:
74
+ """
75
+ Render "Multi-step generation" tab.
76
+
77
+ Parameters
78
+ ----------
79
+ generate_buttons : list[gr.Button]
80
+ Buttons used for audio generation in the
81
+ "One-click generation" tab and the "Multi-step generation" tab.
82
+ song_dir_dropdowns : list[gr.Dropdown]
83
+ Dropdowns for selecting song directories in the
84
+ "Multi-step generation" tab.
85
+ cached_input_songs_dropdown_1click : gr.Dropdown
86
+ Dropdown for selecting cached input songs in the
87
+ "One-click generation" tab.
88
+ cached_input_songs_dropdown_multi : gr.Dropdown
89
+ Dropdown for selecting cached input songs in the
90
+ "Multi-step generation" tab.
91
+ rvc_model : gr.Dropdown
92
+ Dropdown for selecting voice models in the
93
+ "Multi-step generation" tab.
94
+ intermediate_audio_to_delete : gr.Dropdown
95
+ Dropdown for selecting intermediate audio files to delete in the
96
+ "Delete audio" tab.
97
+ output_audio_to_remove : gr.Dropdown
98
+ Dropdown for selecting output audio files to delete in the
99
+ "Delete audio" tab.
100
+ """
101
+ with gr.Tab("Multi-step generation"):
102
+ (
103
+ retrieve_song_btn,
104
+ separate_vocals_btn,
105
+ separate_main_vocals_btn,
106
+ dereverb_vocals_btn,
107
+ convert_vocals_btn,
108
+ postprocess_vocals_btn,
109
+ pitch_shift_background_btn,
110
+ mix_btn,
111
+ _,
112
+ ) = generate_buttons
113
+ (
114
+ separate_vocals_dir,
115
+ separate_main_vocals_dir,
116
+ dereverb_vocals_dir,
117
+ convert_vocals_dir,
118
+ postprocess_vocals_dir,
119
+ pitch_shift_background_dir,
120
+ mix_dir,
121
+ ) = song_dir_dropdowns
122
+ current_song_dir = gr.State(None)
123
+
124
+ (
125
+ original_track_output,
126
+ vocals_track_output,
127
+ instrumentals_track_output,
128
+ main_vocals_track_output,
129
+ backup_vocals_track_output,
130
+ dereverbed_vocals_track_output,
131
+ reverb_track_output,
132
+ converted_vocals_track_output,
133
+ postprocessed_vocals_track_output,
134
+ shifted_instrumentals_track_output,
135
+ shifted_backup_vocals_track_output,
136
+ song_cover_track,
137
+ ) = [
138
+ gr.Audio(label=label, type="filepath", interactive=False, render=False)
139
+ for label in [
140
+ "Input song",
141
+ "Vocals",
142
+ "Instrumentals",
143
+ "Main vocals",
144
+ "Backup vocals",
145
+ "De-reverbed vocals",
146
+ "Reverb",
147
+ "Converted vocals",
148
+ "Post-processed vocals",
149
+ "Pitch-shifted instrumentals",
150
+ "Pitch-shifted backup vocals",
151
+ "Song cover",
152
+ ]
153
+ ]
154
+ input_tracks = [
155
+ gr.Audio(label=label, type="filepath", render=False)
156
+ for label in [
157
+ "Input song",
158
+ "Vocals",
159
+ "Vocals",
160
+ "Vocals",
161
+ "Vocals",
162
+ "Instrumentals",
163
+ "Backup vocals",
164
+ "Main vocals",
165
+ "Instrumentals",
166
+ "Backup vocals",
167
+ ]
168
+ ]
169
+ (
170
+ original_track_input,
171
+ vocals_track_input,
172
+ main_vocals_track_input,
173
+ dereverbed_vocals_track_input,
174
+ converted_vocals_track_input,
175
+ instrumentals_track_input,
176
+ backup_vocals_track_input,
177
+ postprocessed_vocals_track_input,
178
+ shifted_instrumentals_track_input,
179
+ shifted_backup_vocals_track_input,
180
+ ) = input_tracks
181
+
182
+ transfer_defaults = [
183
+ ["Step 1: input song"],
184
+ ["Step 2: vocals"],
185
+ ["Step 6: instrumentals"],
186
+ ["Step 3: vocals"],
187
+ ["Step 6: backup vocals"],
188
+ ["Step 4: vocals"],
189
+ [],
190
+ ["Step 5: vocals"],
191
+ ["Step 7: main vocals"],
192
+ ["Step 7: instrumentals"],
193
+ ["Step 7: backup vocals"],
194
+ [],
195
+ ]
196
+
197
+ (
198
+ original_track_transfer_default,
199
+ vocals_track_transfer_default,
200
+ instrumentals_track_transfer_default,
201
+ main_vocals_track_transfer_default,
202
+ backup_vocals_track_transfer_default,
203
+ dereverbed_vocals_track_transfer_default,
204
+ reverb_track_transfer_default,
205
+ converted_vocals_track_transfer_default,
206
+ postprocessed_vocals_track_transfer_default,
207
+ shifted_instrumentals_track_transfer_default,
208
+ shifted_backup_vocals_track_transfer_default,
209
+ song_cover_track_transfer_default,
210
+ ) = transfer_defaults
211
+
212
+ transfer_output_track_dropdowns = [
213
+ gr.Dropdown(
214
+ [
215
+ "Step 1: input song",
216
+ "Step 2: vocals",
217
+ "Step 3: vocals",
218
+ "Step 4: vocals",
219
+ "Step 5: vocals",
220
+ "Step 6: instrumentals",
221
+ "Step 6: backup vocals",
222
+ "Step 7: main vocals",
223
+ "Step 7: instrumentals",
224
+ "Step 7: backup vocals",
225
+ ],
226
+ label="Transfer to",
227
+ info=(
228
+ "Select the input track(s) to transfer the output track to once"
229
+ " generation completes."
230
+ ),
231
+ render=False,
232
+ type="index",
233
+ multiselect=True,
234
+ value=value,
235
+ )
236
+ for value in transfer_defaults
237
+ ]
238
+
239
+ (
240
+ original_track_transfer_dropdown,
241
+ vocals_track_transfer_dropdown,
242
+ instrumentals_track_transfer_dropdown,
243
+ main_vocals_track_transfer_dropdown,
244
+ backup_vocals_track_transfer_dropdown,
245
+ dereverbed_vocals_track_transfer_dropdown,
246
+ reverb_track_transfer_dropdown,
247
+ converted_vocals_track_transfer_dropdown,
248
+ postprocessed_vocals_track_transfer_dropdown,
249
+ shifted_instrumentals_track_transfer_dropdown,
250
+ shifted_backup_vocals_track_transfer_dropdown,
251
+ song_cover_track_transfer_dropdown,
252
+ ) = transfer_output_track_dropdowns
253
+
254
+ clear_btns = [gr.Button(value="Reset settings", render=False) for _ in range(8)]
255
+ (
256
+ retrieve_song_clear_btn,
257
+ separate_vocals_clear_btn,
258
+ separate_main_vocals_clear_btn,
259
+ dereverb_vocals_clear_btn,
260
+ convert_vocals_clear_btn,
261
+ postprocess_vocals_clear_btn,
262
+ pitch_shift_background_clear_btn,
263
+ mix_clear_btn,
264
+ ) = clear_btns
265
+
266
+ with gr.Accordion("Step 0: song retrieval", open=True):
267
+ gr.Markdown("")
268
+ gr.Markdown("**Inputs**")
269
+ with gr.Row():
270
+ with gr.Column():
271
+ song_input_type_dropdown = gr.Dropdown(
272
+ [
273
+ "YouTube link/local path",
274
+ "Local file/microphone",
275
+ "Cached song",
276
+ ],
277
+ value="YouTube link/local path",
278
+ label="Song input type",
279
+ type="index",
280
+ )
281
+ with gr.Column():
282
+ song_input = gr.Textbox(
283
+ label="Song input",
284
+ info=(
285
+ "Link to a song on YouTube or the full path of a local"
286
+ " audio file."
287
+ ),
288
+ )
289
+ local_file = gr.Audio(
290
+ label="Song input", type="filepath", visible=False
291
+ )
292
+ cached_input_songs_dropdown_multi.render()
293
+
294
+ song_input_type_dropdown.input(
295
+ partial(toggle_visible_component, 3),
296
+ inputs=song_input_type_dropdown,
297
+ outputs=[song_input, local_file, cached_input_songs_dropdown_multi],
298
+ show_progress="hidden",
299
+ )
300
+
301
+ local_file.change(
302
+ update_value,
303
+ inputs=local_file,
304
+ outputs=song_input,
305
+ show_progress="hidden",
306
+ )
307
+ cached_input_songs_dropdown_multi.input(
308
+ update_value,
309
+ inputs=cached_input_songs_dropdown_multi,
310
+ outputs=song_input,
311
+ show_progress="hidden",
312
+ )
313
+ gr.Markdown("**Outputs**")
314
+ original_track_output.render()
315
+ original_track_transfer_dropdown.render()
316
+ retrieve_song_clear_btn.render()
317
+ retrieve_song_clear_btn.click(
318
+ lambda: gr.Dropdown(value=original_track_transfer_default),
319
+ outputs=[original_track_transfer_dropdown],
320
+ show_progress="hidden",
321
+ )
322
+
323
+ retrieve_song_btn.render()
324
+
325
+ retrieve_song_event_args_list = [
326
+ EventArgs(
327
+ partial(
328
+ exception_harness(retrieve_song), progress_bar=PROGRESS_BAR
329
+ ),
330
+ inputs=[song_input],
331
+ outputs=[original_track_output, current_song_dir],
332
+ ),
333
+ EventArgs(
334
+ partial(
335
+ update_cached_input_songs,
336
+ len(song_dir_dropdowns) + 2,
337
+ value_indices=range(len(song_dir_dropdowns) + 1),
338
+ ),
339
+ inputs=[current_song_dir],
340
+ outputs=(
341
+ song_dir_dropdowns
342
+ + [
343
+ cached_input_songs_dropdown_multi,
344
+ cached_input_songs_dropdown_1click,
345
+ ]
346
+ ),
347
+ name="then",
348
+ show_progress="hidden",
349
+ ),
350
+ EventArgs(
351
+ partial(update_cached_input_songs, 1, [], [0]),
352
+ outputs=[intermediate_audio_to_delete],
353
+ name="then",
354
+ show_progress="hidden",
355
+ ),
356
+ EventArgs(
357
+ partial(_update_audio, len(input_tracks)),
358
+ inputs=[original_track_transfer_dropdown, original_track_output],
359
+ outputs=input_tracks,
360
+ name="then",
361
+ show_progress="hidden",
362
+ ),
363
+ ]
364
+ setup_consecutive_event_listeners_with_toggled_interactivity(
365
+ retrieve_song_btn,
366
+ retrieve_song_event_args_list,
367
+ generate_buttons,
368
+ )
369
+ with gr.Accordion("Step 1: vocals/instrumentals separation", open=False):
370
+ gr.Markdown("")
371
+ gr.Markdown("**Inputs**")
372
+ original_track_input.render()
373
+ separate_vocals_dir.render()
374
+ gr.Markdown("**Outputs**")
375
+ with gr.Row():
376
+ with gr.Column():
377
+ vocals_track_output.render()
378
+ vocals_track_transfer_dropdown.render()
379
+
380
+ with gr.Column():
381
+ instrumentals_track_output.render()
382
+ instrumentals_track_transfer_dropdown.render()
383
+
384
+ separate_vocals_clear_btn.render()
385
+ separate_vocals_clear_btn.click(
386
+ lambda: tuple(
387
+ gr.Dropdown(value=value)
388
+ for value in [
389
+ vocals_track_transfer_default,
390
+ instrumentals_track_transfer_default,
391
+ ]
392
+ ),
393
+ outputs=[
394
+ vocals_track_transfer_dropdown,
395
+ instrumentals_track_transfer_dropdown,
396
+ ],
397
+ show_progress="hidden",
398
+ )
399
+ separate_vocals_btn.render()
400
+
401
+ separate_vocals_event_args_list = [
402
+ EventArgs(
403
+ partial(
404
+ exception_harness(separate_vocals), progress_bar=PROGRESS_BAR
405
+ ),
406
+ inputs=[original_track_input, separate_vocals_dir],
407
+ outputs=[vocals_track_output, instrumentals_track_output],
408
+ )
409
+ ] + [
410
+ EventArgs(
411
+ partial(_update_audio, len(input_tracks)),
412
+ inputs=[transfer_dropdown, output_track],
413
+ outputs=input_tracks,
414
+ name="then",
415
+ show_progress="hidden",
416
+ )
417
+ for transfer_dropdown, output_track in zip(
418
+ [
419
+ vocals_track_transfer_dropdown,
420
+ instrumentals_track_transfer_dropdown,
421
+ ],
422
+ [vocals_track_output, instrumentals_track_output],
423
+ )
424
+ ]
425
+ setup_consecutive_event_listeners_with_toggled_interactivity(
426
+ separate_vocals_btn,
427
+ separate_vocals_event_args_list,
428
+ generate_buttons,
429
+ )
430
+
431
+ with gr.Accordion("Step 2: main vocals/backup vocals separation", open=False):
432
+ gr.Markdown("")
433
+ gr.Markdown("**Inputs**")
434
+ vocals_track_input.render()
435
+ separate_main_vocals_dir.render()
436
+ gr.Markdown("**Outputs**")
437
+ with gr.Row():
438
+ with gr.Column():
439
+ main_vocals_track_output.render()
440
+ main_vocals_track_transfer_dropdown.render()
441
+ with gr.Column():
442
+ backup_vocals_track_output.render()
443
+ backup_vocals_track_transfer_dropdown.render()
444
+
445
+ separate_main_vocals_clear_btn.render()
446
+ separate_main_vocals_clear_btn.click(
447
+ lambda: tuple(
448
+ gr.Dropdown(value=value)
449
+ for value in [
450
+ main_vocals_track_transfer_default,
451
+ backup_vocals_track_transfer_default,
452
+ ]
453
+ ),
454
+ outputs=[
455
+ main_vocals_track_transfer_dropdown,
456
+ backup_vocals_track_transfer_dropdown,
457
+ ],
458
+ show_progress="hidden",
459
+ )
460
+ separate_main_vocals_btn.render()
461
+
462
+ separate_main_vocals_event_args_list = [
463
+ EventArgs(
464
+ partial(
465
+ exception_harness(separate_main_vocals),
466
+ progress_bar=PROGRESS_BAR,
467
+ ),
468
+ inputs=[vocals_track_input, separate_main_vocals_dir],
469
+ outputs=[main_vocals_track_output, backup_vocals_track_output],
470
+ )
471
+ ] + [
472
+ EventArgs(
473
+ partial(_update_audio, len(input_tracks)),
474
+ inputs=[transfer_dropdown, output_track],
475
+ outputs=input_tracks,
476
+ name="then",
477
+ show_progress="hidden",
478
+ )
479
+ for transfer_dropdown, output_track in zip(
480
+ [
481
+ main_vocals_track_transfer_dropdown,
482
+ backup_vocals_track_transfer_dropdown,
483
+ ],
484
+ [main_vocals_track_output, backup_vocals_track_output],
485
+ )
486
+ ]
487
+
488
+ setup_consecutive_event_listeners_with_toggled_interactivity(
489
+ separate_main_vocals_btn,
490
+ separate_main_vocals_event_args_list,
491
+ generate_buttons,
492
+ )
493
+
494
+ with gr.Accordion("Step 3: vocal cleanup", open=False):
495
+ gr.Markdown("")
496
+ gr.Markdown("**Inputs**")
497
+ main_vocals_track_input.render()
498
+ dereverb_vocals_dir.render()
499
+ gr.Markdown("**Outputs**")
500
+ with gr.Row():
501
+ with gr.Column():
502
+ dereverbed_vocals_track_output.render()
503
+ dereverbed_vocals_track_transfer_dropdown.render()
504
+ with gr.Column():
505
+ reverb_track_output.render()
506
+ reverb_track_transfer_dropdown.render()
507
+
508
+ dereverb_vocals_clear_btn.render()
509
+ dereverb_vocals_clear_btn.click(
510
+ lambda: tuple(
511
+ gr.Dropdown(value=value)
512
+ for value in [
513
+ dereverbed_vocals_track_transfer_default,
514
+ reverb_track_transfer_default,
515
+ ]
516
+ ),
517
+ outputs=[
518
+ dereverbed_vocals_track_transfer_dropdown,
519
+ reverb_track_transfer_dropdown,
520
+ ],
521
+ show_progress="hidden",
522
+ )
523
+ dereverb_vocals_btn.render()
524
+ dereverb_vocals_event_args_list = [
525
+ EventArgs(
526
+ partial(
527
+ exception_harness(dereverb_vocals), progress_bar=PROGRESS_BAR
528
+ ),
529
+ inputs=[main_vocals_track_input, dereverb_vocals_dir],
530
+ outputs=[dereverbed_vocals_track_output, reverb_track_output],
531
+ )
532
+ ] + [
533
+ EventArgs(
534
+ partial(_update_audio, len(input_tracks)),
535
+ inputs=[transfer_dropdown, output_track],
536
+ outputs=input_tracks,
537
+ name="then",
538
+ show_progress="hidden",
539
+ )
540
+ for transfer_dropdown, output_track in zip(
541
+ [
542
+ dereverbed_vocals_track_transfer_dropdown,
543
+ reverb_track_transfer_dropdown,
544
+ ],
545
+ [dereverbed_vocals_track_output, reverb_track_output],
546
+ )
547
+ ]
548
+
549
+ setup_consecutive_event_listeners_with_toggled_interactivity(
550
+ dereverb_vocals_btn, dereverb_vocals_event_args_list, generate_buttons
551
+ )
552
+ with gr.Accordion("Step 4: vocal conversion", open=False):
553
+ gr.Markdown("")
554
+ gr.Markdown("**Inputs**")
555
+ dereverbed_vocals_track_input.render()
556
+ convert_vocals_dir.render()
557
+ with gr.Row():
558
+ rvc_model.render()
559
+ pitch_change_octaves = gr.Slider(
560
+ -3,
561
+ 3,
562
+ value=0,
563
+ step=1,
564
+ label="Pitch shift (octaves)",
565
+ info=(
566
+ "Shift pitch of converted vocals by number of octaves."
567
+ " Generally, use 1 for male-to-female conversions and -1 for"
568
+ " vice-versa."
569
+ ),
570
+ )
571
+ pitch_change_semitones = gr.Slider(
572
+ -12,
573
+ 12,
574
+ value=0,
575
+ step=1,
576
+ label="Pitch shift (semi-tones)",
577
+ info=(
578
+ "Shift pitch of converted vocals by number of semi-tones."
579
+ " Altering this slightly reduces sound quality."
580
+ ),
581
+ )
582
+ with gr.Row():
583
+ index_rate = gr.Slider(
584
+ 0,
585
+ 1,
586
+ value=0.5,
587
+ label="Index rate",
588
+ info=(
589
+ "Controls how much of the accent in the voice model to keep in"
590
+ " the converted vocals"
591
+ ),
592
+ )
593
+ filter_radius = gr.Slider(
594
+ 0,
595
+ 7,
596
+ value=3,
597
+ step=1,
598
+ label="Filter radius",
599
+ info=(
600
+ "If >=3: apply median filtering to the harvested pitch results."
601
+ " Can reduce breathiness"
602
+ ),
603
+ )
604
+ rms_mix_rate = gr.Slider(
605
+ 0,
606
+ 1,
607
+ value=0.25,
608
+ label="RMS mix rate",
609
+ info=(
610
+ "Control how much to mimic the loudness (0) of the input vocals"
611
+ " or a fixed loudness (1)"
612
+ ),
613
+ )
614
+ protect = gr.Slider(
615
+ 0,
616
+ 0.5,
617
+ value=0.33,
618
+ label="Protect rate",
619
+ info=(
620
+ "Protect voiceless consonants and breath sounds. Set to 0.5 to"
621
+ " disable."
622
+ ),
623
+ )
624
+ with gr.Column():
625
+ f0_method = gr.Dropdown(
626
+ ["rmvpe", "mangio-crepe"],
627
+ value="rmvpe",
628
+ label="Pitch detection algorithm",
629
+ info=(
630
+ "Best option is rmvpe (clarity in vocals), then"
631
+ " mangio-crepe (smoother vocals)"
632
+ ),
633
+ )
634
+ crepe_hop_length = gr.Slider(
635
+ 32,
636
+ 320,
637
+ value=128,
638
+ step=1,
639
+ visible=False,
640
+ label="Crepe hop length",
641
+ info=(
642
+ "Lower values lead to longer conversions and higher risk"
643
+ " of voice cracks, but better pitch accuracy."
644
+ ),
645
+ )
646
+ f0_method.change(
647
+ show_hop_slider,
648
+ inputs=f0_method,
649
+ outputs=crepe_hop_length,
650
+ show_progress="hidden",
651
+ )
652
+
653
+ gr.Markdown("**Outputs**")
654
+ converted_vocals_track_output.render()
655
+ converted_vocals_track_transfer_dropdown.render()
656
+ convert_vocals_clear_btn.render()
657
+ convert_vocals_clear_btn.click(
658
+ lambda: [
659
+ 0,
660
+ 0,
661
+ 0.5,
662
+ 3,
663
+ 0.25,
664
+ 0.33,
665
+ "rmvpe",
666
+ 128,
667
+ gr.Dropdown(value=converted_vocals_track_transfer_default),
668
+ ],
669
+ outputs=[
670
+ pitch_change_octaves,
671
+ pitch_change_semitones,
672
+ index_rate,
673
+ filter_radius,
674
+ rms_mix_rate,
675
+ protect,
676
+ f0_method,
677
+ crepe_hop_length,
678
+ converted_vocals_track_transfer_dropdown,
679
+ ],
680
+ show_progress="hidden",
681
+ )
682
+ convert_vocals_btn.render()
683
+ convert_vocals_event_args_list = [
684
+ EventArgs(
685
+ partial(
686
+ exception_harness(convert_vocals), progress_bar=PROGRESS_BAR
687
+ ),
688
+ inputs=[
689
+ dereverbed_vocals_track_input,
690
+ convert_vocals_dir,
691
+ rvc_model,
692
+ pitch_change_octaves,
693
+ pitch_change_semitones,
694
+ index_rate,
695
+ filter_radius,
696
+ rms_mix_rate,
697
+ protect,
698
+ f0_method,
699
+ crepe_hop_length,
700
+ ],
701
+ outputs=[converted_vocals_track_output],
702
+ ),
703
+ EventArgs(
704
+ partial(_update_audio, len(input_tracks)),
705
+ inputs=[
706
+ converted_vocals_track_transfer_dropdown,
707
+ converted_vocals_track_output,
708
+ ],
709
+ outputs=input_tracks,
710
+ name="then",
711
+ show_progress="hidden",
712
+ ),
713
+ ]
714
+ setup_consecutive_event_listeners_with_toggled_interactivity(
715
+ convert_vocals_btn, convert_vocals_event_args_list, generate_buttons
716
+ )
717
+ with gr.Accordion("Step 5: post-processing of vocals", open=False):
718
+ gr.Markdown("")
719
+ gr.Markdown("**Inputs**")
720
+ converted_vocals_track_input.render()
721
+ postprocess_vocals_dir.render()
722
+ with gr.Row():
723
+ reverb_rm_size = gr.Slider(
724
+ 0,
725
+ 1,
726
+ value=0.15,
727
+ label="Room size",
728
+ info="The larger the room, the longer the reverb time",
729
+ )
730
+ reverb_wet = gr.Slider(
731
+ 0,
732
+ 1,
733
+ value=0.2,
734
+ label="Wetness level",
735
+ info="Loudness level of converted vocals with reverb",
736
+ )
737
+ reverb_dry = gr.Slider(
738
+ 0,
739
+ 1,
740
+ value=0.8,
741
+ label="Dryness level",
742
+ info="Loudness level of converted vocals without reverb",
743
+ )
744
+ reverb_damping = gr.Slider(
745
+ 0,
746
+ 1,
747
+ value=0.7,
748
+ label="Damping level",
749
+ info="Absorption of high frequencies in the reverb",
750
+ )
751
+ gr.Markdown("**Outputs**")
752
+
753
+ postprocessed_vocals_track_output.render()
754
+ postprocessed_vocals_track_transfer_dropdown.render()
755
+
756
+ postprocess_vocals_clear_btn.render()
757
+ postprocess_vocals_clear_btn.click(
758
+ lambda: [
759
+ 0.15,
760
+ 0.2,
761
+ 0.8,
762
+ 0.7,
763
+ gr.Dropdown(value=postprocessed_vocals_track_transfer_default),
764
+ ],
765
+ outputs=[
766
+ reverb_rm_size,
767
+ reverb_wet,
768
+ reverb_dry,
769
+ reverb_damping,
770
+ postprocessed_vocals_track_transfer_dropdown,
771
+ ],
772
+ show_progress="hidden",
773
+ )
774
+ postprocess_vocals_btn.render()
775
+ postprocess_vocals_event_args_list = [
776
+ EventArgs(
777
+ partial(
778
+ exception_harness(postprocess_vocals),
779
+ progress_bar=PROGRESS_BAR,
780
+ ),
781
+ inputs=[
782
+ converted_vocals_track_input,
783
+ postprocess_vocals_dir,
784
+ reverb_rm_size,
785
+ reverb_wet,
786
+ reverb_dry,
787
+ reverb_damping,
788
+ ],
789
+ outputs=[postprocessed_vocals_track_output],
790
+ ),
791
+ EventArgs(
792
+ partial(_update_audio, len(input_tracks)),
793
+ inputs=[
794
+ postprocessed_vocals_track_transfer_dropdown,
795
+ postprocessed_vocals_track_output,
796
+ ],
797
+ outputs=input_tracks,
798
+ name="then",
799
+ show_progress="hidden",
800
+ ),
801
+ ]
802
+ setup_consecutive_event_listeners_with_toggled_interactivity(
803
+ postprocess_vocals_btn,
804
+ postprocess_vocals_event_args_list,
805
+ generate_buttons,
806
+ )
807
+ with gr.Accordion("Step 6: pitch shift of background tracks", open=False):
808
+ gr.Markdown("")
809
+ gr.Markdown("**Inputs**")
810
+ with gr.Row():
811
+ instrumentals_track_input.render()
812
+ backup_vocals_track_input.render()
813
+ pitch_shift_background_dir.render()
814
+ pitch_change_semitones_background = gr.Slider(
815
+ -12,
816
+ 12,
817
+ value=0,
818
+ step=1,
819
+ label="Pitch shift",
820
+ info=(
821
+ "Shift pitch of instrumentals and backup vocals. Measured in"
822
+ " semi-tones."
823
+ ),
824
+ )
825
+ gr.Markdown("**Outputs**")
826
+ with gr.Row():
827
+ with gr.Column():
828
+ shifted_instrumentals_track_output.render()
829
+ shifted_instrumentals_track_transfer_dropdown.render()
830
+ with gr.Column():
831
+ shifted_backup_vocals_track_output.render()
832
+ shifted_backup_vocals_track_transfer_dropdown.render()
833
+
834
+ pitch_shift_background_clear_btn.render()
835
+ pitch_shift_background_clear_btn.click(
836
+ lambda: [
837
+ 0,
838
+ gr.Dropdown(value=shifted_instrumentals_track_transfer_default),
839
+ gr.Dropdown(value=shifted_backup_vocals_track_transfer_default),
840
+ ],
841
+ outputs=[
842
+ pitch_change_semitones_background,
843
+ shifted_instrumentals_track_transfer_dropdown,
844
+ shifted_backup_vocals_track_transfer_dropdown,
845
+ ],
846
+ show_progress="hidden",
847
+ )
848
+ pitch_shift_background_btn.render()
849
+ pitch_shift_background_event_args_list = [
850
+ EventArgs(
851
+ partial(
852
+ exception_harness(pitch_shift_background),
853
+ progress_bar=PROGRESS_BAR,
854
+ ),
855
+ inputs=[
856
+ instrumentals_track_input,
857
+ backup_vocals_track_input,
858
+ pitch_shift_background_dir,
859
+ pitch_change_semitones_background,
860
+ ],
861
+ outputs=[
862
+ shifted_instrumentals_track_output,
863
+ shifted_backup_vocals_track_output,
864
+ ],
865
+ )
866
+ ] + [
867
+ EventArgs(
868
+ partial(_update_audio, len(input_tracks)),
869
+ inputs=[dropdown, output_track],
870
+ outputs=input_tracks,
871
+ name="then",
872
+ show_progress="hidden",
873
+ )
874
+ for dropdown, output_track in zip(
875
+ [
876
+ shifted_instrumentals_track_transfer_dropdown,
877
+ shifted_backup_vocals_track_transfer_dropdown,
878
+ ],
879
+ [
880
+ shifted_instrumentals_track_output,
881
+ shifted_backup_vocals_track_output,
882
+ ],
883
+ )
884
+ ]
885
+
886
+ setup_consecutive_event_listeners_with_toggled_interactivity(
887
+ pitch_shift_background_btn,
888
+ pitch_shift_background_event_args_list,
889
+ generate_buttons,
890
+ )
891
+ with gr.Accordion("Step 7: song mixing", open=False):
892
+ gr.Markdown("")
893
+ gr.Markdown("**Inputs**")
894
+ with gr.Row():
895
+ postprocessed_vocals_track_input.render()
896
+ shifted_instrumentals_track_input.render()
897
+ shifted_backup_vocals_track_input.render()
898
+ mix_dir.render()
899
+ with gr.Row():
900
+ main_gain = gr.Slider(-20, 20, value=0, step=1, label="Main vocals")
901
+ inst_gain = gr.Slider(-20, 20, value=0, step=1, label="Instrumentals")
902
+ backup_gain = gr.Slider(-20, 20, value=0, step=1, label="Backup vocals")
903
+ with gr.Row():
904
+ output_name = gr.Textbox(
905
+ label="Output file name",
906
+ placeholder="Ultimate RVC song cover",
907
+ )
908
+ output_sr = gr.Dropdown(
909
+ choices=[16000, 44100, 48000, 96000, 192000],
910
+ value=44100,
911
+ label="Output sample rate",
912
+ )
913
+ output_format = gr.Dropdown(
914
+ ["mp3", "wav", "flac", "aac", "m4a", "ogg"],
915
+ value="mp3",
916
+ label="Output file format",
917
+ )
918
+ postprocessed_vocals_track_input.change(
919
+ update_song_cover_name,
920
+ inputs=[postprocessed_vocals_track_input, mix_dir],
921
+ outputs=output_name,
922
+ show_progress="hidden",
923
+ )
924
+ mix_dir.change(
925
+ update_song_cover_name,
926
+ inputs=[postprocessed_vocals_track_input, mix_dir],
927
+ outputs=output_name,
928
+ show_progress="hidden",
929
+ )
930
+
931
+ gr.Markdown("**Outputs**")
932
+ song_cover_track.render()
933
+ song_cover_track_transfer_dropdown.render()
934
+ mix_clear_btn.render()
935
+ mix_clear_btn.click(
936
+ lambda: [
937
+ 0,
938
+ 0,
939
+ 0,
940
+ 44100,
941
+ "mp3",
942
+ gr.Dropdown(value=song_cover_track_transfer_default),
943
+ ],
944
+ outputs=[
945
+ main_gain,
946
+ inst_gain,
947
+ backup_gain,
948
+ output_sr,
949
+ output_format,
950
+ song_cover_track_transfer_dropdown,
951
+ ],
952
+ show_progress="hidden",
953
+ )
954
+ mix_btn.render()
955
+ mix_btn_event_args_list = [
956
+ EventArgs(
957
+ partial(
958
+ exception_harness(mix_song_cover), progress_bar=PROGRESS_BAR
959
+ ),
960
+ inputs=[
961
+ postprocessed_vocals_track_input,
962
+ shifted_instrumentals_track_input,
963
+ shifted_backup_vocals_track_input,
964
+ mix_dir,
965
+ main_gain,
966
+ inst_gain,
967
+ backup_gain,
968
+ output_sr,
969
+ output_format,
970
+ output_name,
971
+ ],
972
+ outputs=[song_cover_track],
973
+ ),
974
+ EventArgs(
975
+ partial(update_output_audio, 1, [], [0]),
976
+ outputs=[output_audio_to_remove],
977
+ name="then",
978
+ show_progress="hidden",
979
+ ),
980
+ EventArgs(
981
+ partial(_update_audio, len(input_tracks)),
982
+ inputs=[song_cover_track_transfer_dropdown, song_cover_track],
983
+ outputs=input_tracks,
984
+ name="then",
985
+ show_progress="hidden",
986
+ ),
987
+ ]
988
+
989
+ setup_consecutive_event_listeners_with_toggled_interactivity(
990
+ mix_btn, mix_btn_event_args_list, generate_buttons
991
+ )
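
All transfer dropdowns in this tab funnel into _update_audio, which builds one update per input track and fills in a value only for the selected indices. The same logic is restated in isolation below; gr.update is used here instead of constructing gr.Audio so the snippet runs without needing real audio files on disk.

import gradio as gr


def update_audio(num_components: int, output_indices: list[int], file_path: str):
    # One (possibly empty) update per Audio component; only the indexed
    # components receive the new file path.
    update_args = [{} for _ in range(num_components)]
    for index in output_indices:
        update_args[index]["value"] = file_path
    return tuple(gr.update(**kwargs) for kwargs in update_args)


# Transfer a newly generated file to input tracks 0 and 2 out of 3:
first, second, third = update_audio(3, [0, 2], "song_cover.mp3")
# first and third carry value="song_cover.mp3"; second is an empty update, so
# the corresponding input track keeps whatever the user already loaded.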
src/frontend/tabs/one_click_generation.py ADDED
@@ -0,0 +1,573 @@
1
+ """
2
+ This module contains the code for the "One-click generation" tab.
3
+ """
4
+
5
+ from typings.extra import RunPipelineHarnessArgs
6
+
7
+ from functools import partial
8
+
9
+ import gradio as gr
10
+
11
+ from backend.generate_song_cover import run_pipeline
12
+
13
+ from frontend.common import (
14
+ PROGRESS_BAR,
15
+ EventArgs,
16
+ exception_harness,
17
+ setup_consecutive_event_listeners_with_toggled_interactivity,
18
+ show_hop_slider,
19
+ toggle_visible_component,
20
+ update_cached_input_songs,
21
+ update_output_audio,
22
+ update_song_cover_name,
23
+ update_value,
24
+ )
25
+
26
+
27
+ def _run_pipeline_harness(*args: *RunPipelineHarnessArgs) -> tuple[str | None, ...]:
28
+ """
29
+ Run the song cover generation pipeline in a harness
30
+ which displays a progress bar, re-raises exceptions as Gradio errors,
31
+ and returns the output of the pipeline.
32
+
33
+ If the pipeline outputs only a single path,
34
+ then that output is extended with a None value for each intermediate audio file.
35
+
36
+ Parameters
37
+ ----------
38
+ *args : *RunPipelineHarnessArgs
39
+ Arguments to forward to the pipeline.
40
+
41
+ Returns
42
+ -------
43
+ tuple[str | None, ...]
44
+ The output of the pipeline, potentially extended with None values.
45
+ """
46
+
47
+ res = exception_harness(run_pipeline)(*args, progress_bar=PROGRESS_BAR)
48
+ if isinstance(res, tuple):
49
+ return res
50
+ else:
51
+ return (None,) * 11 + (res,)
52
+
53
+
54
+ def _toggle_intermediate_files_accordion(
55
+ visible: bool,
56
+ ) -> list[gr.Accordion | gr.Audio]:
57
+ """
58
+ Toggle the visibility of intermediate audio file accordions
59
+ and their associated audio components.
60
+
61
+ Parameters
62
+ ----------
63
+ visible : bool
64
+ Visibility status of the accordions and audio components.
65
+
66
+ Returns
67
+ -------
68
+ list[gr.Accordion | gr.Audio]
69
+ The accordions and audio components with updated visibility.
70
+ """
71
+ audio_components = [gr.Audio(value=None) for _ in range(11)]
72
+ accordions = [gr.Accordion(open=False) for _ in range(7)]
73
+ return [gr.Accordion(visible=visible, open=False)] + accordions + audio_components
74
+
75
+
76
+ def render(
77
+ generate_buttons: list[gr.Button],
78
+ song_dir_dropdowns: list[gr.Dropdown],
79
+ cached_input_songs_dropdown_1click: gr.Dropdown,
80
+ cached_input_songs_dropdown_multi: gr.Dropdown,
81
+ rvc_model: gr.Dropdown,
82
+ intermediate_audio_to_delete: gr.Dropdown,
83
+ output_audio_to_delete: gr.Dropdown,
84
+ ) -> None:
85
+ """
86
+ Render "One-click generation" tab.
87
+
88
+ Parameters
89
+ ----------
90
+ generate_buttons : list[gr.Button]
91
+ Buttons used for audio generation in the
92
+ "One-click generation" tab and the "Multi-step generation" tab.
93
+ song_dir_dropdowns : list[gr.Dropdown]
94
+ Dropdowns for selecting song directories in the
95
+ "Multi-step generation" tab.
96
+ cached_input_songs_dropdown_1click : gr.Dropdown
97
+ Dropdown for selecting cached input songs in the
98
+ "One-click generation" tab
99
+ cached_input_songs_dropdown_multi : gr.Dropdown
100
+ Dropdown for selecting cached input songs in the
101
+ "Multi-step generation" tab
102
+ rvc_model : gr.Dropdown
103
+ Dropdown for selecting RVC model in the
104
+ "One-click generation" tab.
105
+ intermediate_audio_to_delete : gr.Dropdown
106
+ Dropdown for selecting intermediate audio files to delete in the
107
+ "Manage audio" tab.
108
+ output_audio_to_delete : gr.Dropdown
109
+ Dropdown for selecting output audio files to delete in the
110
+ "Manage audio" tab.
111
+ """
112
+
113
+ with gr.Tab("One-click generation"):
114
+ (
115
+ _,
116
+ _,
117
+ _,
118
+ _,
119
+ _,
120
+ _,
121
+ _,
122
+ _,
123
+ generate_btn,
124
+ ) = generate_buttons
125
+
126
+ with gr.Accordion("Main options"):
127
+ with gr.Row():
128
+ with gr.Column():
129
+ song_input_type_dropdown = gr.Dropdown(
130
+ ["YouTube link/local path", "Local file", "Cached song"],
131
+ value="YouTube link/local path",
132
+ label="Song input type",
133
+ type="index",
134
+ )
135
+ song_input = gr.Textbox(
136
+ label="Song input",
137
+ info=(
138
+ "Link to a song on YouTube or the full path of a local"
139
+ " audio file."
140
+ ),
141
+ )
142
+ local_file = gr.Audio(
143
+ label="Song input", type="filepath", visible=False
144
+ )
145
+ cached_input_songs_dropdown_1click.render()
146
+ song_input_type_dropdown.input(
147
+ partial(toggle_visible_component, 3),
148
+ inputs=song_input_type_dropdown,
149
+ outputs=[
150
+ song_input,
151
+ local_file,
152
+ cached_input_songs_dropdown_1click,
153
+ ],
154
+ show_progress="hidden",
155
+ )
156
+
157
+ local_file.change(
158
+ update_value,
159
+ inputs=local_file,
160
+ outputs=song_input,
161
+ show_progress="hidden",
162
+ )
163
+ cached_input_songs_dropdown_1click.input(
164
+ update_value,
165
+ inputs=cached_input_songs_dropdown_1click,
166
+ outputs=song_input,
167
+ show_progress="hidden",
168
+ )
169
+
170
+ with gr.Column():
171
+ rvc_model.render()
172
+
173
+ with gr.Column():
174
+ pitch_change_vocals = gr.Slider(
175
+ -3,
176
+ 3,
177
+ value=0,
178
+ step=1,
179
+ label="Pitch shift of vocals",
180
+ info=(
181
+ "Shift pitch of converted vocals. Measured in octaves."
182
+ " Generally, use 1 for male-to-female conversions and -1"
183
+ " for vice-versa."
184
+ ),
185
+ )
186
+ pitch_change_all = gr.Slider(
187
+ -12,
188
+ 12,
189
+ value=0,
190
+ step=1,
191
+ label="Overall pitch shift",
192
+ info=(
193
+ "Shift pitch of converted vocals, backup vocals and"
194
+ " instrumentals. Measured in semi-tones. Altering this"
195
+ " slightly reduces sound quality."
196
+ ),
197
+ )
198
+
199
+ with gr.Accordion("Vocal conversion options", open=False):
200
+ with gr.Row():
201
+ index_rate = gr.Slider(
202
+ 0,
203
+ 1,
204
+ value=0.5,
205
+ label="Index rate",
206
+ info=(
207
+ "Controls how much of the accent in the voice model to keep in"
208
+ " the converted vocals"
209
+ ),
210
+ )
211
+ filter_radius = gr.Slider(
212
+ 0,
213
+ 7,
214
+ value=3,
215
+ step=1,
216
+ label="Filter radius",
217
+ info=(
218
+ "If >=3: apply median filtering to the harvested pitch results."
219
+ " Can reduce breathiness"
220
+ ),
221
+ )
222
+ rms_mix_rate = gr.Slider(
223
+ 0,
224
+ 1,
225
+ value=0.25,
226
+ label="RMS mix rate",
227
+ info=(
228
+ "Control how much to mimic the loudness (0) of the input vocals"
229
+ " or a fixed loudness (1)"
230
+ ),
231
+ )
232
+ protect = gr.Slider(
233
+ 0,
234
+ 0.5,
235
+ value=0.33,
236
+ label="Protect rate",
237
+ info=(
238
+ "Protect voiceless consonants and breath sounds. Set to 0.5 to"
239
+ " disable."
240
+ ),
241
+ )
242
+ with gr.Column():
243
+ f0_method = gr.Dropdown(
244
+ ["rmvpe", "mangio-crepe"],
245
+ value="rmvpe",
246
+ label="Pitch detection algorithm",
247
+ info=(
248
+ "Best option is rmvpe (clarity in vocals), then"
249
+ " mangio-crepe (smoother vocals)"
250
+ ),
251
+ )
252
+ crepe_hop_length = gr.Slider(
253
+ 32,
254
+ 320,
255
+ value=128,
256
+ step=1,
257
+ visible=False,
258
+ label="Crepe hop length",
259
+ info=(
260
+ "Lower values lead to longer conversions and higher risk"
261
+ " of voice cracks, but better pitch accuracy."
262
+ ),
263
+ )
264
+ f0_method.change(
265
+ show_hop_slider,
266
+ inputs=f0_method,
267
+ outputs=crepe_hop_length,
268
+ show_progress="hidden",
269
+ )
270
+ with gr.Accordion("Audio mixing options", open=False):
271
+ gr.Markdown("")
272
+ gr.Markdown("### Reverb control on converted vocals")
273
+ with gr.Row():
274
+ reverb_rm_size = gr.Slider(
275
+ 0,
276
+ 1,
277
+ value=0.15,
278
+ label="Room size",
279
+ info="The larger the room, the longer the reverb time",
280
+ )
281
+ reverb_wet = gr.Slider(
282
+ 0,
283
+ 1,
284
+ value=0.2,
285
+ label="Wetness level",
286
+ info="Loudness level of converted vocals with reverb",
287
+ )
288
+ reverb_dry = gr.Slider(
289
+ 0,
290
+ 1,
291
+ value=0.8,
292
+ label="Dryness level",
293
+ info="Loudness level of converted vocals without reverb",
294
+ )
295
+ reverb_damping = gr.Slider(
296
+ 0,
297
+ 1,
298
+ value=0.7,
299
+ label="Damping level",
300
+ info="Absorption of high frequencies in the reverb",
301
+ )
302
+
303
+ gr.Markdown("")
304
+ gr.Markdown("### Volume controls (dB)")
305
+ with gr.Row():
306
+ main_gain = gr.Slider(-20, 20, value=0, step=1, label="Main vocals")
307
+ inst_gain = gr.Slider(-20, 20, value=0, step=1, label="Instrumentals")
308
+ backup_gain = gr.Slider(-20, 20, value=0, step=1, label="Backup vocals")
309
+ with gr.Accordion("Audio output options", open=False):
310
+ with gr.Row():
311
+ output_name = gr.Textbox(
312
+ label="Output file name",
313
+ info=(
314
+ "If no name is provided, a suitable name will be generated"
315
+ " automatically."
316
+ ),
317
+ placeholder="Ultimate RVC song cover",
318
+ )
319
+ output_sr = gr.Dropdown(
320
+ choices=[16000, 44100, 48000, 96000, 192000],
321
+ value=44100,
322
+ label="Output sample rate",
323
+ )
324
+ output_format = gr.Dropdown(
325
+ ["mp3", "wav", "flac", "aac", "m4a", "ogg"],
326
+ value="mp3",
327
+ label="Output file format",
328
+ )
329
+ with gr.Row():
330
+ show_intermediate_files = gr.Checkbox(
331
+ label="Show intermediate audio files",
332
+ value=False,
333
+ info=(
334
+ "Show generated intermediate audio files when song cover"
335
+ " generation completes. Leave unchecked to optimize"
336
+ " performance."
337
+ ),
338
+ )
339
+ rvc_model.change(
340
+ partial(update_song_cover_name, None, update_placeholder=True),
341
+ inputs=[cached_input_songs_dropdown_1click, rvc_model],
342
+ outputs=output_name,
343
+ show_progress="hidden",
344
+ )
345
+ cached_input_songs_dropdown_1click.change(
346
+ partial(update_song_cover_name, None, update_placeholder=True),
347
+ inputs=[cached_input_songs_dropdown_1click, rvc_model],
348
+ outputs=output_name,
349
+ show_progress="hidden",
350
+ )
351
+
352
+ intermediate_audio_accordions = [
353
+ gr.Accordion(label, open=False, render=False)
354
+ for label in [
355
+ "Step 0: song retrieval",
356
+ "Step 1: vocals/instrumentals separation",
357
+ "Step 2: main vocals/backup vocals separation",
358
+ "Step 3: main vocals cleanup",
359
+ "Step 4: conversion of main vocals",
360
+ "Step 5: post-processing of converted vocals",
361
+ "Step 6: pitch shift of background tracks",
362
+ ]
363
+ ]
364
+ (
365
+ song_retrieval_accordion,
366
+ vocals_separation_accordion,
367
+ main_vocals_separation_accordion,
368
+ vocal_cleanup_accordion,
369
+ vocal_conversion_accordion,
370
+ vocals_postprocessing_accordion,
371
+ pitch_shift_accordion,
372
+ ) = intermediate_audio_accordions
373
+ (
374
+ original_track,
375
+ vocals_track,
376
+ instrumentals_track,
377
+ main_vocals_track,
378
+ backup_vocals_track,
379
+ main_vocals_dereverbed_track,
380
+ main_vocals_reverb_track,
381
+ converted_vocals_track,
382
+ postprocessed_vocals_track,
383
+ instrumentals_shifted_track,
384
+ backup_vocals_shifted_track,
385
+ ) = [
386
+ gr.Audio(label=label, type="filepath", interactive=False, render=False)
387
+ for label in [
388
+ "Input song",
389
+ "Vocals",
390
+ "Instrumentals",
391
+ "Main vocals",
392
+ "Backup vocals",
393
+ "De-reverbed main vocals",
394
+ "Main vocals reverb",
395
+ "Converted vocals",
396
+ "Post-processed vocals",
397
+ "Pitch-shifted instrumentals",
398
+ "Pitch-shifted backup vocals",
399
+ ]
400
+ ]
401
+ with gr.Accordion(
402
+ "Access intermediate audio files", open=False, visible=False
403
+ ) as intermediate_files_accordion:
404
+ song_retrieval_accordion.render()
405
+ with song_retrieval_accordion:
406
+ original_track.render()
407
+ vocals_separation_accordion.render()
408
+ with vocals_separation_accordion:
409
+ with gr.Row():
410
+ vocals_track.render()
411
+ instrumentals_track.render()
412
+ main_vocals_separation_accordion.render()
413
+ with main_vocals_separation_accordion:
414
+ with gr.Row():
415
+ main_vocals_track.render()
416
+ backup_vocals_track.render()
417
+
418
+ vocal_cleanup_accordion.render()
419
+ with vocal_cleanup_accordion:
420
+ with gr.Row():
421
+ main_vocals_dereverbed_track.render()
422
+ main_vocals_reverb_track.render()
423
+ vocal_conversion_accordion.render()
424
+ with vocal_conversion_accordion:
425
+ converted_vocals_track.render()
426
+ vocals_postprocessing_accordion.render()
427
+ with vocals_postprocessing_accordion:
428
+ postprocessed_vocals_track.render()
429
+ pitch_shift_accordion.render()
430
+ with pitch_shift_accordion:
431
+ with gr.Row():
432
+ instrumentals_shifted_track.render()
433
+ backup_vocals_shifted_track.render()
434
+
435
+ with gr.Row():
436
+ clear_btn = gr.Button(value="Reset settings", scale=2)
437
+ generate_btn.render()
438
+ song_cover_track = gr.Audio(label="Song cover", scale=3)
439
+ show_intermediate_files.change(
440
+ _toggle_intermediate_files_accordion,
441
+ inputs=show_intermediate_files,
442
+ outputs=[
443
+ intermediate_files_accordion,
444
+ song_retrieval_accordion,
445
+ vocals_separation_accordion,
446
+ main_vocals_separation_accordion,
447
+ vocal_cleanup_accordion,
448
+ vocal_conversion_accordion,
449
+ vocals_postprocessing_accordion,
450
+ pitch_shift_accordion,
451
+ original_track,
452
+ vocals_track,
453
+ instrumentals_track,
454
+ main_vocals_track,
455
+ backup_vocals_track,
456
+ main_vocals_dereverbed_track,
457
+ main_vocals_reverb_track,
458
+ converted_vocals_track,
459
+ postprocessed_vocals_track,
460
+ instrumentals_shifted_track,
461
+ backup_vocals_shifted_track,
462
+ ],
463
+ show_progress="hidden",
464
+ )
465
+ generate_event_args_list = [
466
+ EventArgs(
467
+ _run_pipeline_harness,
468
+ inputs=[
469
+ song_input,
470
+ rvc_model,
471
+ pitch_change_vocals,
472
+ pitch_change_all,
473
+ index_rate,
474
+ filter_radius,
475
+ rms_mix_rate,
476
+ protect,
477
+ f0_method,
478
+ crepe_hop_length,
479
+ reverb_rm_size,
480
+ reverb_wet,
481
+ reverb_dry,
482
+ reverb_damping,
483
+ main_gain,
484
+ inst_gain,
485
+ backup_gain,
486
+ output_sr,
487
+ output_format,
488
+ output_name,
489
+ show_intermediate_files,
490
+ ],
491
+ outputs=[
492
+ original_track,
493
+ vocals_track,
494
+ instrumentals_track,
495
+ main_vocals_track,
496
+ backup_vocals_track,
497
+ main_vocals_dereverbed_track,
498
+ main_vocals_reverb_track,
499
+ converted_vocals_track,
500
+ postprocessed_vocals_track,
501
+ instrumentals_shifted_track,
502
+ backup_vocals_shifted_track,
503
+ song_cover_track,
504
+ ],
505
+ ),
506
+ EventArgs(
507
+ partial(
508
+ update_cached_input_songs, 3 + len(song_dir_dropdowns), [], [1]
509
+ ),
510
+ outputs=[
511
+ cached_input_songs_dropdown_1click,
512
+ intermediate_audio_to_delete,
513
+ cached_input_songs_dropdown_multi,
514
+ ]
515
+ + song_dir_dropdowns,
516
+ name="then",
517
+ show_progress="hidden",
518
+ ),
519
+ EventArgs(
520
+ partial(update_output_audio, 1, [], [0]),
521
+ outputs=[output_audio_to_delete],
522
+ name="then",
523
+ show_progress="hidden",
524
+ ),
525
+ ]
526
+ setup_consecutive_event_listeners_with_toggled_interactivity(
527
+ generate_btn,
528
+ generate_event_args_list,
529
+ generate_buttons + [show_intermediate_files],
530
+ )
531
+ clear_btn.click(
532
+ lambda: [
533
+ 0,
534
+ 0,
535
+ 0.5,
536
+ 3,
537
+ 0.25,
538
+ 0.33,
539
+ "rmvpe",
540
+ 128,
541
+ 0.15,
542
+ 0.2,
543
+ 0.8,
544
+ 0.7,
545
+ 0,
546
+ 0,
547
+ 0,
548
+ 44100,
549
+ "mp3",
550
+ False,
551
+ ],
552
+ outputs=[
553
+ pitch_change_vocals,
554
+ pitch_change_all,
555
+ index_rate,
556
+ filter_radius,
557
+ rms_mix_rate,
558
+ protect,
559
+ f0_method,
560
+ crepe_hop_length,
561
+ reverb_rm_size,
562
+ reverb_wet,
563
+ reverb_dry,
564
+ reverb_damping,
565
+ main_gain,
566
+ inst_gain,
567
+ backup_gain,
568
+ output_sr,
569
+ output_format,
570
+ show_intermediate_files,
571
+ ],
572
+ show_progress="hidden",
573
+ )
src/init.py ADDED
@@ -0,0 +1,41 @@
1
+ """
2
+ This script downloads the models required for running the Ultimate RVC app.
3
+ """
4
+
5
+ import os
6
+
7
+ import requests
8
+
9
+ from common import RVC_MODELS_DIR
10
+
11
+ RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"
12
+
13
+
14
+ def dl_model(link: str, model_name: str, dir_name: str) -> None:
15
+ """
16
+ Download a model from a link and save it to a directory.
17
+
18
+ Parameters
19
+ ----------
20
+ link : str
21
+ The link to the site where the model is hosted.
22
+ model_name : str
23
+ The name of the model to download.
24
+ dir_name : str
25
+ The directory to save the model to.
26
+ """
27
+ with requests.get(f"{link}{model_name}") as r:
28
+ r.raise_for_status()
29
+ with open(os.path.join(dir_name, model_name), "wb") as f:
30
+ for chunk in r.iter_content(chunk_size=8192):
31
+ f.write(chunk)
32
+
33
+
34
+ if __name__ == "__main__":
35
+
36
+ rvc_model_names = ["hubert_base.pt", "rmvpe.pt"]
37
+ for model in rvc_model_names:
38
+ print(f"Downloading {model}...")
39
+ dl_model(RVC_DOWNLOAD_LINK, model, RVC_MODELS_DIR)
40
+
41
+ print("All models downloaded!")
src/typings/audio_separator/separator/__init__.pyi ADDED
@@ -0,0 +1,78 @@
1
+ from typing import TypedDict
2
+
3
+ import logging
4
+
5
+ class MDXParams(TypedDict):
6
+ hop_length: int
7
+ segment_size: int
8
+ overlap: float
9
+ batch_size: int
10
+ enable_denoise: bool
11
+
12
+ class VRParams(TypedDict):
13
+ batch_size: int
14
+ window_size: int
15
+ aggression: int
16
+ enable_tta: bool
17
+ enable_post_process: bool
18
+ post_process_threshold: float
19
+ high_end_process: bool
20
+
21
+ class DemucsParams(TypedDict):
22
+ segment_size: str
23
+ shifts: int
24
+ overlap: float
25
+ segments_enabled: bool
26
+
27
+ class MDXCParams(TypedDict):
28
+ segment_size: int
29
+ batch_size: int
30
+ overlap: int
31
+
32
+ class ArchSpecificParams(TypedDict):
33
+ MDX: MDXParams
34
+ VR: VRParams
35
+ Demucs: DemucsParams
36
+ MDXC: MDXCParams
37
+
38
+ class Separator:
39
+ arch_specific_params: ArchSpecificParams
40
+ def __init__(
41
+ self,
42
+ log_level: int = logging.INFO,
43
+ log_formatter: logging.Formatter | None = None,
44
+ model_file_dir: str = "/tmp/audio-separator-models/",
45
+ output_dir: str | None = None,
46
+ output_format: str = "WAV",
47
+ normalization_threshold: float = 0.9,
48
+ output_single_stem: str | None = None,
49
+ invert_using_spec: bool = False,
50
+ sample_rate: int = 44100,
51
+ mdx_params: MDXParams = {
52
+ "hop_length": 1024,
53
+ "segment_size": 256,
54
+ "overlap": 0.25,
55
+ "batch_size": 1,
56
+ "enable_denoise": False,
57
+ },
58
+ vr_params: VRParams = {
59
+ "batch_size": 16,
60
+ "window_size": 512,
61
+ "aggression": 5,
62
+ "enable_tta": False,
63
+ "enable_post_process": False,
64
+ "post_process_threshold": 0.2,
65
+ "high_end_process": False,
66
+ },
67
+ demucs_params: DemucsParams = {
68
+ "segment_size": "Default",
69
+ "shifts": 2,
70
+ "overlap": 0.25,
71
+ "segments_enabled": True,
72
+ },
73
+ mdxc_params: MDXCParams = {"segment_size": 256, "batch_size": 1, "overlap": 8},
74
+ ) -> None: ...
75
+ def load_model(
76
+ self, model_filename: str = "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt"
77
+ ) -> None: ...
78
+ def separate(self, audio_file_path: str) -> list[str]: ...
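
Note: a minimal sketch of how the Separator API stubbed above is typically driven; the output directory and model filename here are illustrative assumptions, not values taken from this commit.

from audio_separator.separator import Separator

# Split a song into stems; separate() returns the paths of the written files.
separator = Separator(output_dir="audio/intermediate", output_format="WAV")
separator.load_model("UVR-MDX-NET-Voc_FT.onnx")  # assumed model filename
stems = separator.separate("audio/input_song.wav")
print(stems)
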
src/typings/extra.py ADDED
@@ -0,0 +1,71 @@
1
+ from typing import Any, Callable, Literal, ParamSpec, Sequence, TypedDict, TypeVar
2
+
3
+ from os import PathLike
4
+
5
+ P = ParamSpec("P")
6
+ T = TypeVar("T")
7
+
8
+ StrOrBytesPath = str | bytes | PathLike[str] | PathLike[bytes]
9
+
10
+ DropdownChoices = Sequence[str | int | float | tuple[str, str | int | float]] | None
11
+
12
+ DropdownValue = (
13
+ str | int | float | Sequence[str | int | float] | Callable[..., Any] | None
14
+ )
15
+
16
+ InputType = Literal["yt", "local"]
17
+
18
+ F0Method = Literal["rmvpe", "mangio-crepe"]
19
+
20
+ InputAudioExt = Literal["mp3", "wav", "flac", "aac", "m4a", "ogg"]
21
+
22
+ OutputAudioExt = Literal["mp3", "wav", "flac", "adts", "ipod", "ogg"]
23
+
24
+
25
+ ModelsTable = list[list[str]]
26
+
27
+ ModelsTablePredicate = Callable[[dict[str, str | list[str]]], bool]
28
+
29
+
30
+ class ComponentVisibilityKwArgs(TypedDict):
31
+ visible: bool
32
+ value: Any
33
+
34
+
35
+ class UpdateDropdownArgs(TypedDict, total=False):
36
+ choices: DropdownChoices | None
37
+ value: DropdownValue | None
38
+
39
+
40
+ class TextBoxArgs(TypedDict, total=False):
41
+ value: str | None
42
+ placeholder: str | None
43
+
44
+
45
+ class TransferUpdateArgs(TypedDict, total=False):
46
+ value: str | None
47
+
48
+
49
+ RunPipelineHarnessArgs = tuple[
50
+ str, # song_input
51
+ str, # voice_model
52
+ int, # pitch_change_vocals
53
+ int, # pitch_change_all
54
+ float, # index_rate
55
+ int, # filter_radius
56
+ float, # rms_mix_rate
57
+ float, # protect
58
+ F0Method, # f0_method
59
+ int, # crepe_hop_length
60
+ float, # reverb_rm_size
61
+ float, # reverb_wet
62
+ float, # reverb_dry
63
+ float, # reverb_damping
64
+ int, # main_gain
65
+ int, # inst_gain
66
+ int, # backup_gain
67
+ int, # output_sr
68
+ InputAudioExt, # output_format
69
+ str, # output_name
70
+ bool, # return_files
71
+ ]
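
Note: the P and T aliases defined at the top of this module are the usual building blocks for typing decorators that preserve a wrapped function's signature. A small self-contained sketch for illustration (the decorator itself is hypothetical, not part of this commit):

from functools import wraps
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")

def log_call(fn: Callable[P, T]) -> Callable[P, T]:
    # The wrapper keeps the exact parameter list and return type of fn.
    @wraps(fn)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
        print(f"calling {fn.__name__}")
        return fn(*args, **kwargs)

    return wrapper
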
src/typings/gradio/__init__.pyi ADDED
@@ -0,0 +1,238 @@
1
+ from gradio import (
2
+ _simple_templates,
3
+ components,
4
+ layouts,
5
+ processing_utils,
6
+ templates,
7
+ themes,
8
+ )
9
+ from gradio.blocks import Blocks
10
+ from gradio.chat_interface import ChatInterface
11
+ from gradio.components import (
12
+ HTML,
13
+ JSON,
14
+ AnnotatedImage,
15
+ Annotatedimage,
16
+ Audio,
17
+ BarPlot,
18
+ Button,
19
+ Chatbot,
20
+ ChatMessage,
21
+ Checkbox,
22
+ CheckboxGroup,
23
+ Checkboxgroup,
24
+ ClearButton,
25
+ Code,
26
+ ColorPicker,
27
+ DataFrame,
28
+ Dataframe,
29
+ Dataset,
30
+ DateTime,
31
+ DownloadButton,
32
+ Dropdown,
33
+ DuplicateButton,
34
+ File,
35
+ FileExplorer,
36
+ Gallery,
37
+ Highlight,
38
+ HighlightedText,
39
+ Highlightedtext,
40
+ Image,
41
+ ImageEditor,
42
+ Json,
43
+ Label,
44
+ LinePlot,
45
+ LoginButton,
46
+ LogoutButton,
47
+ Markdown,
48
+ MessageDict,
49
+ Model3D,
50
+ MultimodalTextbox,
51
+ Number,
52
+ ParamViewer,
53
+ Plot,
54
+ Radio,
55
+ ScatterPlot,
56
+ Slider,
57
+ State,
58
+ Text,
59
+ Textbox,
60
+ Timer,
61
+ UploadButton,
62
+ Video,
63
+ component,
64
+ )
65
+ from gradio.components.audio import WaveformOptions
66
+ from gradio.components.image_editor import Brush, Eraser
67
+ from gradio.data_classes import FileData
68
+ from gradio.events import (
69
+ DeletedFileData,
70
+ EventData,
71
+ KeyUpData,
72
+ LikeData,
73
+ SelectData,
74
+ on,
75
+ )
76
+ from gradio.exceptions import Error
77
+ from gradio.external import load
78
+ from gradio.flagging import (
79
+ CSVLogger,
80
+ FlaggingCallback,
81
+ HuggingFaceDatasetSaver,
82
+ SimpleCSVLogger,
83
+ )
84
+ from gradio.helpers import (
85
+ Info,
86
+ Progress,
87
+ Warning,
88
+ make_waveform,
89
+ skip,
90
+ update,
91
+ )
92
+ from gradio.helpers import create_examples as Examples # noqa: N812
93
+ from gradio.interface import Interface, TabbedInterface, close_all
94
+ from gradio.layouts import Accordion, Column, Group, Row, Tab, TabItem, Tabs
95
+ from gradio.oauth import OAuthProfile, OAuthToken
96
+ from gradio.renderable import render
97
+ from gradio.routes import Request, mount_gradio_app
98
+ from gradio.templates import (
99
+ Files,
100
+ ImageMask,
101
+ List,
102
+ Matrix,
103
+ Mic,
104
+ Microphone,
105
+ Numpy,
106
+ Paint,
107
+ PlayableVideo,
108
+ Sketchpad,
109
+ TextArea,
110
+ )
111
+ from gradio.themes import Base as Theme
112
+ from gradio.utils import NO_RELOAD, FileSize, get_package_version, set_static_paths
113
+ from gradio.wasm_utils import IS_WASM
114
+
115
+ if not IS_WASM:
116
+ from gradio.cli import deploy
117
+ from gradio.ipython_ext import load_ipython_extension
118
+
119
+ __version__ = ...
120
+ __all__ = [
121
+ "_simple_templates",
122
+ "templates",
123
+ "processing_utils",
124
+ "components",
125
+ "layouts",
126
+ "themes",
127
+ "Blocks",
128
+ "ChatInterface",
129
+ "HTML",
130
+ "JSON",
131
+ "AnnotatedImage",
132
+ "Annotatedimage",
133
+ "Audio",
134
+ "BarPlot",
135
+ "Button",
136
+ "Chatbot",
137
+ "ChatMessage",
138
+ "Checkbox",
139
+ "CheckboxGroup",
140
+ "Checkboxgroup",
141
+ "ClearButton",
142
+ "Code",
143
+ "ColorPicker",
144
+ "DataFrame",
145
+ "Dataframe",
146
+ "Dataset",
147
+ "DateTime",
148
+ "DownloadButton",
149
+ "Dropdown",
150
+ "DuplicateButton",
151
+ "File",
152
+ "FileExplorer",
153
+ "Gallery",
154
+ "Highlight",
155
+ "HighlightedText",
156
+ "Highlightedtext",
157
+ "Image",
158
+ "ImageEditor",
159
+ "Json",
160
+ "Label",
161
+ "LinePlot",
162
+ "LoginButton",
163
+ "LogoutButton",
164
+ "Markdown",
165
+ "MessageDict",
166
+ "Model3D",
167
+ "MultimodalTextbox",
168
+ "Number",
169
+ "ParamViewer",
170
+ "Plot",
171
+ "Radio",
172
+ "ScatterPlot",
173
+ "Slider",
174
+ "State",
175
+ "Text",
176
+ "Textbox",
177
+ "Timer",
178
+ "UploadButton",
179
+ "Video",
180
+ "component",
181
+ "WaveformOptions",
182
+ "Brush",
183
+ "Eraser",
184
+ "FileData",
185
+ "DeletedFileData",
186
+ "EventData",
187
+ "KeyUpData",
188
+ "LikeData",
189
+ "SelectData",
190
+ "on",
191
+ "Error",
192
+ "load",
193
+ "CSVLogger",
194
+ "FlaggingCallback",
195
+ "HuggingFaceDatasetSaver",
196
+ "SimpleCSVLogger",
197
+ "Info",
198
+ "Progress",
199
+ "Warning",
200
+ "make_waveform",
201
+ "skip",
202
+ "update",
203
+ "Examples",
204
+ "Interface",
205
+ "TabbedInterface",
206
+ "close_all",
207
+ "Accordion",
208
+ "Column",
209
+ "Group",
210
+ "Row",
211
+ "Tab",
212
+ "TabItem",
213
+ "Tabs",
214
+ "OAuthProfile",
215
+ "OAuthToken",
216
+ "render",
217
+ "Request",
218
+ "mount_gradio_app",
219
+ "Files",
220
+ "ImageMask",
221
+ "List",
222
+ "Matrix",
223
+ "Mic",
224
+ "Microphone",
225
+ "Numpy",
226
+ "Paint",
227
+ "PlayableVideo",
228
+ "Sketchpad",
229
+ "TextArea",
230
+ "Theme",
231
+ "NO_RELOAD",
232
+ "FileSize",
233
+ "get_package_version",
234
+ "set_static_paths",
235
+ "IS_WASM",
236
+ "deploy",
237
+ "load_ipython_extension",
238
+ ]
src/typings/gradio/events.pyi ADDED
@@ -0,0 +1,374 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import (
4
+ TYPE_CHECKING,
5
+ AbstractSet,
6
+ Any,
7
+ Callable,
8
+ Dict,
9
+ List,
10
+ Literal,
11
+ Self,
12
+ Sequence,
13
+ Union,
14
+ )
15
+
16
+ import dataclasses
17
+
18
+ from gradio.data_classes import FileData, FileDataDict
19
+
20
+ if TYPE_CHECKING:
21
+ from gradio.blocks import Block, BlockContext, Component
22
+ from gradio.components import Timer
23
+
24
+ def set_cancel_events(
25
+ triggers: Sequence[EventListenerMethod],
26
+ cancels: None | dict[str, Any] | list[dict[str, Any]],
27
+ ) -> None: ...
28
+
29
+ class Dependency(dict[Any, Any]):
30
+
31
+ fn: Callable[..., Any]
32
+ associated_timer: Timer | None
33
+ then: Callable[..., Any]
34
+ success: Callable[..., Any]
35
+
36
+ def __init__(
37
+ self,
38
+ trigger: Any,
39
+ key_vals: Any,
40
+ dep_index: int,
41
+ fn: Callable[..., Any],
42
+ associated_timer: Timer | None = ...,
43
+ ) -> None:
44
+ """
45
+ The Dependency object is usually not created directly but is returned when an event listener is set up. It contains the configuration
46
+ data for the event listener, and can be used to set up additional event listeners that depend on the completion of the current event
47
+ listener using .then() and .success().
48
+
49
+ Demos: chatbot_consecutive, blocks_chained_events
50
+ """
51
+ ...
52
+
53
+ def __call__(self, *args: Any, **kwargs: Any) -> Any: ...
54
+
55
+ class EventData:
56
+ """
57
+ When gr.EventData or one of its subclasses is added as a type hint to an argument of a prediction function, a gr.EventData object will automatically be passed as the value of that argument.
58
+ The attributes of this object contain information about the event that triggered the listener. The gr.EventData object itself contains a `.target` attribute that refers to the component
59
+ that triggered the event, while subclasses of gr.EventData contain additional attributes that are different for each class.
60
+
61
+ Example:
62
+ import gradio as gr
63
+ with gr.Blocks() as demo:
64
+ table = gr.Dataframe([[1, 2, 3], [4, 5, 6]])
65
+ gallery = gr.Gallery([("cat.jpg", "Cat"), ("dog.jpg", "Dog")])
66
+ textbox = gr.Textbox("Hello World!")
67
+ statement = gr.Textbox()
68
+ def on_select(value, evt: gr.EventData):
69
+ return f"The {evt.target} component was selected, and its value was {value}."
70
+ table.select(on_select, table, statement)
71
+ gallery.select(on_select, gallery, statement)
72
+ textbox.select(on_select, textbox, statement)
73
+ demo.launch()
74
+ Demos: gallery_selections, tictactoe
75
+ """
76
+
77
+ target: Block | None
78
+ _data: Any
79
+
80
+ def __init__(self, target: Block | None, _data: Any) -> None:
81
+ """
82
+ Parameters:
83
+ target: The component object that triggered the event. Can be used to distinguish multiple components bound to the same listener.
84
+ """
85
+ ...
86
+
87
+ class SelectData(EventData):
88
+ """
89
+ The gr.SelectData class is a subclass of gr.EventData that specifically carries information about the `.select()` event. When gr.SelectData
90
+ is added as a type hint to an argument of an event listener method, a gr.SelectData object will automatically be passed as the value of that argument.
91
+ The attributes of this object contain information about the event that triggered the listener.
92
+
93
+ Example:
94
+ import gradio as gr
95
+ with gr.Blocks() as demo:
96
+ table = gr.Dataframe([[1, 2, 3], [4, 5, 6]])
97
+ gallery = gr.Gallery([("cat.jpg", "Cat"), ("dog.jpg", "Dog")])
98
+ textbox = gr.Textbox("Hello World!")
99
+ statement = gr.Textbox()
100
+ def on_select(evt: gr.SelectData):
101
+ return f"You selected {evt.value} at {evt.index} from {evt.target}"
102
+ table.select(on_select, table, statement)
103
+ gallery.select(on_select, gallery, statement)
104
+ textbox.select(on_select, textbox, statement)
105
+ demo.launch()
106
+ Demos: gallery_selections, tictactoe
107
+ """
108
+
109
+ index: int | tuple[int, int]
110
+ value: Any
111
+ row_value: list[Any] | None
112
+ col_value: list[Any] | None
113
+ selected: bool
114
+
115
+ def __init__(self, target: Block | None, data: Any) -> None: ...
116
+
117
+ class KeyUpData(EventData):
118
+ """
119
+ The gr.KeyUpData class is a subclass of gr.EventData that specifically carries information about the `.key_up()` event. When gr.KeyUpData
120
+ is added as a type hint to an argument of an event listener method, a gr.KeyUpData object will automatically be passed as the value of that argument.
121
+ The attributes of this object contain information about the event that triggered the listener.
122
+
123
+ Example:
124
+ import gradio as gr
125
+ def test(value, key_up_data: gr.KeyUpData):
126
+ return {
127
+ "component value": value,
128
+ "input value": key_up_data.input_value,
129
+ "key": key_up_data.key
130
+ }
131
+ with gr.Blocks() as demo:
132
+ d = gr.Dropdown(["abc", "def"], allow_custom_value=True)
133
+ t = gr.JSON()
134
+ d.key_up(test, d, t)
135
+ demo.launch()
136
+ Demos: dropdown_key_up
137
+ """
138
+
139
+ key: str
140
+ input_value: str
141
+
142
+ def __init__(self, target: Block | None, data: Any) -> None: ...
143
+
144
+ class DeletedFileData(EventData):
145
+ """
146
+ The gr.DeletedFileData class is a subclass of gr.EventData that specifically carries information about the `.delete()` event. When gr.DeletedFileData
147
+ is added as a type hint to an argument of an event listener method, a gr.DeletedFileData object will automatically be passed as the value of that argument.
148
+ The attributes of this object contain information about the event that triggered the listener.
149
+ Example:
150
+ import gradio as gr
151
+ def test(delete_data: gr.DeletedFileData):
152
+ return delete_data.file.path
153
+ with gr.Blocks() as demo:
154
+ files = gr.File(file_count="multiple")
155
+ deleted_file = gr.File()
156
+ files.delete(test, None, deleted_file)
157
+ demo.launch()
158
+ Demos: file_component_events
159
+ """
160
+
161
+ file: FileData
162
+
163
+ def __init__(self, target: Block | None, data: FileDataDict) -> None: ...
164
+
165
+ class LikeData(EventData):
166
+ """
167
+ The gr.LikeData class is a subclass of gr.EventData that specifically carries information about the `.like()` event. When gr.LikeData
168
+ is added as a type hint to an argument of an event listener method, a gr.LikeData object will automatically be passed as the value of that argument.
169
+ The attributes of this object contain information about the event that triggered the listener.
170
+ Example:
171
+ import gradio as gr
172
+ def test(value, like_data: gr.LikeData):
173
+ return {
174
+ "chatbot_value": value,
175
+ "liked_message": like_data.value,
176
+ "liked_index": like_data.index,
177
+ "liked_or_disliked_as_bool": like_data.liked
178
+ }
179
+ with gr.Blocks() as demo:
180
+ c = gr.Chatbot([("abc", "def")])
181
+ t = gr.JSON()
182
+ c.like(test, c, t)
183
+ demo.launch()
184
+ Demos: chatbot_core_components_simple
185
+ """
186
+
187
+ index: int | tuple[int, int]
188
+ value: Any
189
+ liked: bool
190
+
191
+ def __init__(self, target: Block | None, data: Any) -> None: ...
192
+
193
+ @dataclasses.dataclass
194
+ class EventListenerMethod:
195
+ block: Block | None
196
+ event_name: str
197
+
198
+ if TYPE_CHECKING:
199
+ EventListenerCallable = Callable[
200
+ [
201
+ Union[Callable[..., Any], None],
202
+ Union[Component, Sequence[Component], None],
203
+ Union[Block, Sequence[Block], Sequence[Component], Component, None],
204
+ Union[str, None, Literal[False]],
205
+ bool,
206
+ Literal["full", "minimal", "hidden"],
207
+ Union[bool, None],
208
+ bool,
209
+ int,
210
+ bool,
211
+ bool,
212
+ Union[Dict[str, Any], List[Dict[str, Any]], None],
213
+ Union[float, None],
214
+ Union[Literal["once", "multiple", "always_last"], None],
215
+ Union[str, None],
216
+ Union[int, None, Literal["default"]],
217
+ Union[str, None],
218
+ bool,
219
+ ],
220
+ Dependency,
221
+ ]
222
+
223
+ class EventListener(str):
224
+ has_trigger: bool
225
+ config_data: Callable[..., dict[str, Any]]
226
+ event_name: str
227
+ show_progress: Literal["full", "minimal", "hidden"]
228
+ trigger_after: int | None
229
+ trigger_only_on_success: bool
230
+ callback: Callable[..., Any] | None
231
+ doc: str
232
+ listener: Callable[..., Dependency]
233
+
234
+ def __new__(cls, event_name: str, *_args: Any, **_kwargs: Any) -> Self: ...
235
+ def __init__(
236
+ self,
237
+ event_name: str,
238
+ has_trigger: bool = ...,
239
+ config_data: Callable[..., dict[str, Any]] = ...,
240
+ show_progress: Literal["full", "minimal", "hidden"] = ...,
241
+ callback: Callable[..., Any] | None = ...,
242
+ trigger_after: int | None = ...,
243
+ trigger_only_on_success: bool = ...,
244
+ doc: str = ...,
245
+ ) -> None: ...
246
+ def set_doc(self, component: str) -> None: ...
247
+ def copy(self) -> EventListener: ...
248
+ @staticmethod
249
+ def _setup(
250
+ _event_name: str,
251
+ _has_trigger: bool,
252
+ _show_progress: Literal["full", "minimal", "hidden"],
253
+ _callback: Callable[..., Any] | None,
254
+ _trigger_after: int | None,
255
+ _trigger_only_on_success: bool,
256
+ ) -> Callable[..., Dependency]: ...
257
+
258
+ def on(
259
+ triggers: Sequence[EventListenerCallable] | EventListenerCallable | None = ...,
260
+ fn: Callable[..., Any] | None | Literal["decorator"] = ...,
261
+ inputs: (
262
+ Component
263
+ | BlockContext
264
+ | Sequence[Component | BlockContext]
265
+ | AbstractSet[Component | BlockContext]
266
+ | None
267
+ ) = ...,
268
+ outputs: (
269
+ Component
270
+ | BlockContext
271
+ | Sequence[Component | BlockContext]
272
+ | AbstractSet[Component | BlockContext]
273
+ | None
274
+ ) = ...,
275
+ *,
276
+ api_name: str | None | Literal[False] = ...,
277
+ scroll_to_output: bool = ...,
278
+ show_progress: Literal["full", "minimal", "hidden"] = ...,
279
+ queue: bool = ...,
280
+ batch: bool = ...,
281
+ max_batch_size: int = ...,
282
+ preprocess: bool = ...,
283
+ postprocess: bool = ...,
284
+ cancels: dict[str, Any] | list[dict[str, Any]] | None = ...,
285
+ trigger_mode: Literal["once", "multiple", "always_last"] | None = ...,
286
+ every: float | None = ...,
287
+ js: str | None = ...,
288
+ concurrency_limit: int | None | Literal["default"] = ...,
289
+ concurrency_id: str | None = ...,
290
+ show_api: bool = ...,
291
+ ) -> Dependency:
292
+ """
293
+ Sets up an event listener that triggers a function when the specified event(s) occur. This is especially
294
+ useful when the same function should be triggered by multiple events. Only a single API endpoint is generated
295
+ for all events in the triggers list.
296
+
297
+ Parameters:
298
+ triggers: List of triggers to listen to, e.g. [btn.click, number.change]. If None, will listen to changes to any inputs.
299
+ fn: the function to call when this event is triggered. Often a machine learning model's prediction function. Each parameter of the function corresponds to one input component, and the function should return a single value or a tuple of values, with each element in the tuple corresponding to one output component.
300
+ inputs: List of gradio.components to use as inputs. If the function takes no inputs, this should be an empty list.
301
+ outputs: List of gradio.components to use as outputs. If the function returns no outputs, this should be an empty list.
302
+ api_name: Defines how the endpoint appears in the API docs. Can be a string, None, or False. If False, the endpoint will not be exposed in the api docs. If set to None, the endpoint will be exposed in the api docs as an unnamed endpoint, although this behavior will be changed in Gradio 4.0. If set to a string, the endpoint will be exposed in the api docs with the given name.
303
+ scroll_to_output: If True, will scroll to output component on completion
304
+ show_progress: how to show the progress animation while event is running: "full" shows a spinner which covers the output component area as well as a runtime display in the upper right corner, "minimal" only shows the runtime display, "hidden" shows no progress animation at all
305
+ queue: If True, will place the request on the queue, if the queue has been enabled. If False, will not put this event on the queue, even if the queue has been enabled. If None, will use the queue setting of the gradio app.
306
+ batch: If True, then the function should process a batch of inputs, meaning that it should accept a list of input values for each parameter. The lists should be of equal length (and be up to length `max_batch_size`). The function is then *required* to return a tuple of lists (even if there is only 1 output component), with each list in the tuple corresponding to one output component.
307
+ max_batch_size: Maximum number of inputs to batch together if this is called from the queue (only relevant if batch=True)
308
+ preprocess: If False, will not run preprocessing of component data before running 'fn' (e.g. leaving it as a base64 string if this method is called with the `Image` component).
309
+ postprocess: If False, will not run postprocessing of component data before returning 'fn' output to the browser.
310
+ cancels: A list of other events to cancel when this listener is triggered. For example, setting cancels=[click_event] will cancel the click_event, where click_event is the return value of another components .click method. Functions that have not yet run (or generators that are iterating) will be cancelled, but functions that are currently running will be allowed to finish.
311
+ trigger_mode: If "once" (default for all events except `.change()`) would not allow any submissions while an event is pending. If set to "multiple", unlimited submissions are allowed while pending, and "always_last" (default for `.change()` and `.key_up()` events) would allow a second submission after the pending event is complete.
312
+ every: Will be deprecated in favor of gr.Timer. Run this event 'every' number of seconds while the client connection is open. Interpreted in seconds.
313
+ js: Optional frontend js method to run before running 'fn'. Input arguments for js method are values of 'inputs', return should be a list of values for output components.
314
+ concurrency_limit: If set, this is the maximum number of this event that can be running simultaneously. Can be set to None to mean no concurrency_limit (any number of this event can be running simultaneously). Set to "default" to use the default concurrency limit (defined by the `default_concurrency_limit` parameter in `Blocks.queue()`, which itself is 1 by default).
315
+ concurrency_id: If set, this is the id of the concurrency group. Events with the same concurrency_id will be limited by the lowest set concurrency_limit.
316
+ show_api: whether to show this event in the "view API" page of the Gradio app, or in the ".view_api()" method of the Gradio clients. Unlike setting api_name to False, setting show_api to False will still allow downstream apps as well as the Clients to use this event. If fn is None, show_api will automatically be set to False.
317
+ Example:
318
+ import gradio as gr
319
+ with gr.Blocks() as demo:
320
+ with gr.Row():
321
+ input = gr.Textbox()
322
+ button = gr.Button("Submit")
323
+ output = gr.Textbox()
324
+ gr.on(
325
+ triggers=[button.click, input.submit],
326
+ fn=lambda x: x,
327
+ inputs=[input],
328
+ outputs=[output]
329
+ )
330
+ demo.launch()
331
+ """
332
+ ...
333
+
334
+ class Events:
335
+ change: EventListener
336
+ input: EventListener
337
+ click: EventListener
338
+ double_click: EventListener
339
+ submit: EventListener
340
+ edit: EventListener
341
+ clear: EventListener
342
+ play: EventListener
343
+ pause: EventListener
344
+ stop: EventListener
345
+ end: EventListener
346
+ start_recording: EventListener
347
+ pause_recording: EventListener
348
+ stop_recording: EventListener
349
+ focus: EventListener
350
+ blur: EventListener
351
+ upload: EventListener
352
+ release: EventListener
353
+ select: EventListener
354
+ stream: EventListener
355
+ like: EventListener
356
+ load: EventListener
357
+ key_up: EventListener
358
+ apply: EventListener
359
+ delete: EventListener
360
+ tick: EventListener
361
+
362
+ __all__ = [
363
+ "set_cancel_events",
364
+ "Dependency",
365
+ "EventData",
366
+ "SelectData",
367
+ "KeyUpData",
368
+ "DeletedFileData",
369
+ "LikeData",
370
+ "EventListenerMethod",
371
+ "EventListener",
372
+ "on",
373
+ "Events",
374
+ ]
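
Note: the Dependency object documented above is what makes chained listeners possible. A minimal sketch of the .then() pattern (labels and functions are illustrative, not taken from this commit):

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    step_1 = gr.Textbox(label="step 1")
    step_2 = gr.Textbox(label="step 2")
    # click() returns a Dependency; .then() queues a second step that only
    # starts once the first one has finished.
    btn.click(lambda: "vocals separated", outputs=step_1).then(
        lambda: "vocals converted", outputs=step_2
    )

# demo.launch()
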
src/typings/pedalboard_native/io/__init__.pyi ADDED
@@ -0,0 +1,39 @@
1
+ from typing import Literal, overload
2
+
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+
6
+ class AudioFile:
7
+ @staticmethod
8
+ @overload
9
+ def __new__(
10
+ cls: object, filename: str, mode: Literal["r"] = "r"
11
+ ) -> ReadableAudioFile: ...
12
+ @staticmethod
13
+ @overload
14
+ def __new__(
15
+ cls: object,
16
+ filename: str,
17
+ mode: Literal["w"],
18
+ samplerate: float | None = None,
19
+ num_channels: int = 1,
20
+ bit_depth: int = 16,
21
+ quality: str | float | None = None,
22
+ ) -> WriteableAudioFile: ...
23
+
24
+ class ReadableAudioFile(AudioFile):
25
+ def __enter__(self) -> ReadableAudioFile: ...
26
+ def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ...
27
+ def read(self, num_frames: float | int = 0) -> NDArray[np.float32]: ...
28
+ def tell(self) -> int: ...
29
+ @property
30
+ def frames(self) -> int: ...
31
+ @property
32
+ def num_channels(self) -> int: ...
33
+ @property
34
+ def samplerate(self) -> float | int: ...
35
+
36
+ class WriteableAudioFile(AudioFile):
37
+ def __enter__(self) -> WriteableAudioFile: ...
38
+ def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ...
39
+ def write(self, samples: NDArray[...]) -> None: ...
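
Note: a short sketch of how the pedalboard AudioFile API stubbed above reads and writes audio (file names are illustrative assumptions):

from pedalboard.io import AudioFile

# Read the whole file as a (num_channels, num_frames) float32 array.
with AudioFile("vocals.wav") as f:
    audio = f.read(f.frames)
    sr = f.samplerate

# Write it back out with the same channel count and sample rate.
with AudioFile("vocals_copy.wav", "w", samplerate=sr, num_channels=audio.shape[0]) as out:
    out.write(audio)
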
src/typings/soundfile/__init__.pyi ADDED
@@ -0,0 +1,34 @@
1
+ from typing import Literal
2
+
3
+ from os import PathLike
4
+
5
+ import numpy as np
6
+ from numpy.typing import NDArray
7
+
8
+ DEFAULT_NDARRAY = NDArray[np.float64 | np.float32 | np.int32 | np.int16]
9
+
10
+ def read(
11
+ file: int | str | PathLike[str] | PathLike[bytes],
12
+ frames: int = -1,
13
+ start: int = 0,
14
+ stop: int | None = None,
15
+ dtype: Literal["float64", "float32", "int32", "int16"] = "float64",
16
+ always_2d: bool = False,
17
+ fill_value: float | None = None,
18
+ out: DEFAULT_NDARRAY | None = None,
19
+ samplerate: int | None = None,
20
+ channels: int | None = None,
21
+ format: str | None = None,
22
+ subtype: str | None = None,
23
+ endian: Literal["FILE", "LITTLE", "BIG", "CPU"] | None = None,
24
+ closefd: bool | None = True,
25
+ ) -> tuple[DEFAULT_NDARRAY, int]: ...
26
+ def write(
27
+ file: int | str | PathLike[str] | PathLike[bytes],
28
+ data: DEFAULT_NDARRAY,
29
+ samplerate: int,
30
+ subtype: str | None = None,
31
+ endian: Literal["FILE", "LITTLE", "BIG", "CPU"] | None = None,
32
+ format: str | None = None,
33
+ closefd: bool | None = True,
34
+ ) -> None: ...
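
Note: the equivalent round trip with the soundfile functions stubbed above (file names are illustrative):

import soundfile as sf

# read() returns (frames, channels)-shaped data plus the sample rate.
data, sr = sf.read("input_song.wav", dtype="float32")
sf.write("input_song_copy.wav", data, sr)
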
src/typings/sox/__init__.pyi ADDED
@@ -0,0 +1,15 @@
1
+ from typing import Self
2
+
3
+ from pathlib import Path
4
+
5
+ from numpy.typing import NDArray
6
+
7
+ class Transformer:
8
+ def pitch(self, n_semitones: float, quick: bool = False) -> Self: ...
9
+ def build_array(
10
+ self,
11
+ input_filepath: str | Path | None = None,
12
+ input_array: NDArray[...] | None = None,
13
+ sample_rate_in: float | None = None,
14
+ extra_args: list[str] | None = None,
15
+ ) -> NDArray[...]: ...
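
Note: a minimal sketch of the sox.Transformer surface stubbed above, roughly how a backup-vocals array could be pitch-shifted (file name and shift amount are illustrative assumptions):

import soundfile as sf
import sox

audio, sr = sf.read("backup_vocals.wav", dtype="float32")

# Shift up by two semitones and render the result as an in-memory array.
tfm = sox.Transformer()
tfm.pitch(2.0)
shifted = tfm.build_array(input_array=audio, sample_rate_in=sr)
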
src/typings/yt_dlp/__init__.pyi ADDED
@@ -0,0 +1,25 @@
1
+ from typing import Any, Self
2
+
3
+ class YoutubeDL:
4
+ def __init__(
5
+ self, params: dict[str, Any] | None = None, auto_init: bool = True
6
+ ) -> None: ...
7
+ def extract_info(
8
+ self,
9
+ url: str,
10
+ download: bool = True,
11
+ ie_key: str | None = None,
12
+ extra_info: dict[str, Any] | None = None,
13
+ process: bool = True,
14
+ force_generic_extractor: bool = False,
15
+ ) -> dict[str, Any]: ...
16
+ def prepare_filename(
17
+ self,
18
+ info_dict: dict[str, Any],
19
+ dir_type: str = "",
20
+ *,
21
+ outtmpl: str | None = None,
22
+ warn: bool = False,
23
+ ) -> str: ...
24
+ def __enter__(self) -> Self: ...
25
+ def __exit__(self, *args: Any) -> None: ...
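
Note: a minimal sketch of the yt_dlp surface stubbed above (URL and options are illustrative assumptions):

from yt_dlp import YoutubeDL

ydl_opts = {"format": "bestaudio", "outtmpl": "%(title)s.%(ext)s"}
with YoutubeDL(ydl_opts) as ydl:
    # download=False only resolves metadata; pass True to actually fetch audio.
    info = ydl.extract_info("https://www.youtube.com/watch?v=dQw4w9WgXcQ", download=False)
    print(ydl.prepare_filename(info))
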
src/vc/configs/32k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/32k_v2.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,8,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [20,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/40k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/48k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/48k_v2.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [12,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [24,20,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
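
Note: a quick consistency check that holds for all five configs above: the product of the generator's upsample_rates equals the STFT hop_length, so sampling_rate / hop_length is the model's frame rate. A small sketch to verify (the path assumes the repository layout shown in this commit):

import json
from math import prod

with open("src/vc/configs/48k_v2.json") as f:
    cfg = json.load(f)

hop = cfg["data"]["hop_length"]         # 480
rates = cfg["model"]["upsample_rates"]  # [12, 10, 2, 2] -> product 480
assert prod(rates) == hop
print(cfg["data"]["sampling_rate"] / hop, "frames per second")  # 100.0
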
src/vc/infer_pack/attentions.py ADDED
@@ -0,0 +1,417 @@
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack import modules
10
+ from vc.infer_pack.modules import LayerNorm
11
+
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ hidden_channels,
17
+ filter_channels,
18
+ n_heads,
19
+ n_layers,
20
+ kernel_size=1,
21
+ p_dropout=0.0,
22
+ window_size=10,
23
+ **kwargs
24
+ ):
25
+ super().__init__()
26
+ self.hidden_channels = hidden_channels
27
+ self.filter_channels = filter_channels
28
+ self.n_heads = n_heads
29
+ self.n_layers = n_layers
30
+ self.kernel_size = kernel_size
31
+ self.p_dropout = p_dropout
32
+ self.window_size = window_size
33
+
34
+ self.drop = nn.Dropout(p_dropout)
35
+ self.attn_layers = nn.ModuleList()
36
+ self.norm_layers_1 = nn.ModuleList()
37
+ self.ffn_layers = nn.ModuleList()
38
+ self.norm_layers_2 = nn.ModuleList()
39
+ for i in range(self.n_layers):
40
+ self.attn_layers.append(
41
+ MultiHeadAttention(
42
+ hidden_channels,
43
+ hidden_channels,
44
+ n_heads,
45
+ p_dropout=p_dropout,
46
+ window_size=window_size,
47
+ )
48
+ )
49
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
50
+ self.ffn_layers.append(
51
+ FFN(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ filter_channels,
55
+ kernel_size,
56
+ p_dropout=p_dropout,
57
+ )
58
+ )
59
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
60
+
61
+ def forward(self, x, x_mask):
62
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63
+ x = x * x_mask
64
+ for i in range(self.n_layers):
65
+ y = self.attn_layers[i](x, x, attn_mask)
66
+ y = self.drop(y)
67
+ x = self.norm_layers_1[i](x + y)
68
+
69
+ y = self.ffn_layers[i](x, x_mask)
70
+ y = self.drop(y)
71
+ x = self.norm_layers_2[i](x + y)
72
+ x = x * x_mask
73
+ return x
74
+
75
+
76
+ class Decoder(nn.Module):
77
+ def __init__(
78
+ self,
79
+ hidden_channels,
80
+ filter_channels,
81
+ n_heads,
82
+ n_layers,
83
+ kernel_size=1,
84
+ p_dropout=0.0,
85
+ proximal_bias=False,
86
+ proximal_init=True,
87
+ **kwargs
88
+ ):
89
+ super().__init__()
90
+ self.hidden_channels = hidden_channels
91
+ self.filter_channels = filter_channels
92
+ self.n_heads = n_heads
93
+ self.n_layers = n_layers
94
+ self.kernel_size = kernel_size
95
+ self.p_dropout = p_dropout
96
+ self.proximal_bias = proximal_bias
97
+ self.proximal_init = proximal_init
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.self_attn_layers = nn.ModuleList()
101
+ self.norm_layers_0 = nn.ModuleList()
102
+ self.encdec_attn_layers = nn.ModuleList()
103
+ self.norm_layers_1 = nn.ModuleList()
104
+ self.ffn_layers = nn.ModuleList()
105
+ self.norm_layers_2 = nn.ModuleList()
106
+ for i in range(self.n_layers):
107
+ self.self_attn_layers.append(
108
+ MultiHeadAttention(
109
+ hidden_channels,
110
+ hidden_channels,
111
+ n_heads,
112
+ p_dropout=p_dropout,
113
+ proximal_bias=proximal_bias,
114
+ proximal_init=proximal_init,
115
+ )
116
+ )
117
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
118
+ self.encdec_attn_layers.append(
119
+ MultiHeadAttention(
120
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
+ )
122
+ )
123
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
124
+ self.ffn_layers.append(
125
+ FFN(
126
+ hidden_channels,
127
+ hidden_channels,
128
+ filter_channels,
129
+ kernel_size,
130
+ p_dropout=p_dropout,
131
+ causal=True,
132
+ )
133
+ )
134
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
135
+
136
+ def forward(self, x, x_mask, h, h_mask):
137
+ """
138
+ x: decoder input
139
+ h: encoder output
140
+ """
141
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142
+ device=x.device, dtype=x.dtype
143
+ )
144
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145
+ x = x * x_mask
146
+ for i in range(self.n_layers):
147
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
148
+ y = self.drop(y)
149
+ x = self.norm_layers_0[i](x + y)
150
+
151
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152
+ y = self.drop(y)
153
+ x = self.norm_layers_1[i](x + y)
154
+
155
+ y = self.ffn_layers[i](x, x_mask)
156
+ y = self.drop(y)
157
+ x = self.norm_layers_2[i](x + y)
158
+ x = x * x_mask
159
+ return x
160
+
161
+
162
+ class MultiHeadAttention(nn.Module):
163
+ def __init__(
164
+ self,
165
+ channels,
166
+ out_channels,
167
+ n_heads,
168
+ p_dropout=0.0,
169
+ window_size=None,
170
+ heads_share=True,
171
+ block_length=None,
172
+ proximal_bias=False,
173
+ proximal_init=False,
174
+ ):
175
+ super().__init__()
176
+ assert channels % n_heads == 0
177
+
178
+ self.channels = channels
179
+ self.out_channels = out_channels
180
+ self.n_heads = n_heads
181
+ self.p_dropout = p_dropout
182
+ self.window_size = window_size
183
+ self.heads_share = heads_share
184
+ self.block_length = block_length
185
+ self.proximal_bias = proximal_bias
186
+ self.proximal_init = proximal_init
187
+ self.attn = None
188
+
189
+ self.k_channels = channels // n_heads
190
+ self.conv_q = nn.Conv1d(channels, channels, 1)
191
+ self.conv_k = nn.Conv1d(channels, channels, 1)
192
+ self.conv_v = nn.Conv1d(channels, channels, 1)
193
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
194
+ self.drop = nn.Dropout(p_dropout)
195
+
196
+ if window_size is not None:
197
+ n_heads_rel = 1 if heads_share else n_heads
198
+ rel_stddev = self.k_channels**-0.5
199
+ self.emb_rel_k = nn.Parameter(
200
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201
+ * rel_stddev
202
+ )
203
+ self.emb_rel_v = nn.Parameter(
204
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
+ * rel_stddev
206
+ )
207
+
208
+ nn.init.xavier_uniform_(self.conv_q.weight)
209
+ nn.init.xavier_uniform_(self.conv_k.weight)
210
+ nn.init.xavier_uniform_(self.conv_v.weight)
211
+ if proximal_init:
212
+ with torch.no_grad():
213
+ self.conv_k.weight.copy_(self.conv_q.weight)
214
+ self.conv_k.bias.copy_(self.conv_q.bias)
215
+
216
+ def forward(self, x, c, attn_mask=None):
217
+ q = self.conv_q(x)
218
+ k = self.conv_k(c)
219
+ v = self.conv_v(c)
220
+
221
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
222
+
223
+ x = self.conv_o(x)
224
+ return x
225
+
226
+ def attention(self, query, key, value, mask=None):
227
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
228
+ b, d, t_s, t_t = (*key.size(), query.size(2))
229
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232
+
233
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234
+ if self.window_size is not None:
235
+ assert (
236
+ t_s == t_t
237
+ ), "Relative attention is only available for self-attention."
238
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239
+ rel_logits = self._matmul_with_relative_keys(
240
+ query / math.sqrt(self.k_channels), key_relative_embeddings
241
+ )
242
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
243
+ scores = scores + scores_local
244
+ if self.proximal_bias:
245
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
246
+ scores = scores + self._attention_bias_proximal(t_s).to(
247
+ device=scores.device, dtype=scores.dtype
248
+ )
249
+ if mask is not None:
250
+ scores = scores.masked_fill(mask == 0, -1e4)
251
+ if self.block_length is not None:
252
+ assert (
253
+ t_s == t_t
254
+ ), "Local attention is only available for self-attention."
255
+ block_mask = (
256
+ torch.ones_like(scores)
257
+ .triu(-self.block_length)
258
+ .tril(self.block_length)
259
+ )
260
+ scores = scores.masked_fill(block_mask == 0, -1e4)
261
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262
+ p_attn = self.drop(p_attn)
263
+ output = torch.matmul(p_attn, value)
264
+ if self.window_size is not None:
265
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
266
+ value_relative_embeddings = self._get_relative_embeddings(
267
+ self.emb_rel_v, t_s
268
+ )
269
+ output = output + self._matmul_with_relative_values(
270
+ relative_weights, value_relative_embeddings
271
+ )
272
+ output = (
273
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
274
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275
+ return output, p_attn
276
+
277
+ def _matmul_with_relative_values(self, x, y):
278
+ """
279
+ x: [b, h, l, m]
280
+ y: [h or 1, m, d]
281
+ ret: [b, h, l, d]
282
+ """
283
+ ret = torch.matmul(x, y.unsqueeze(0))
284
+ return ret
285
+
286
+ def _matmul_with_relative_keys(self, x, y):
287
+ """
288
+ x: [b, h, l, d]
289
+ y: [h or 1, m, d]
290
+ ret: [b, h, l, m]
291
+ """
292
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293
+ return ret
294
+
295
+ def _get_relative_embeddings(self, relative_embeddings, length):
296
+ max_relative_position = 2 * self.window_size + 1
297
+ # Pad first before slice to avoid using cond ops.
298
+ pad_length = max(length - (self.window_size + 1), 0)
299
+ slice_start_position = max((self.window_size + 1) - length, 0)
300
+ slice_end_position = slice_start_position + 2 * length - 1
301
+ if pad_length > 0:
302
+ padded_relative_embeddings = F.pad(
303
+ relative_embeddings,
304
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305
+ )
306
+ else:
307
+ padded_relative_embeddings = relative_embeddings
308
+ used_relative_embeddings = padded_relative_embeddings[
309
+ :, slice_start_position:slice_end_position
310
+ ]
311
+ return used_relative_embeddings
312
+
313
+ def _relative_position_to_absolute_position(self, x):
314
+ """
315
+ x: [b, h, l, 2*l-1]
316
+ ret: [b, h, l, l]
317
+ """
318
+ batch, heads, length, _ = x.size()
319
+ # Concat columns of pad to shift from relative to absolute indexing.
320
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321
+
322
+ # Concat extra elements so as to add up to shape (len+1, 2*len-1).
323
+ x_flat = x.view([batch, heads, length * 2 * length])
324
+ x_flat = F.pad(
325
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326
+ )
327
+
328
+ # Reshape and slice out the padded elements.
329
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330
+ :, :, :length, length - 1 :
331
+ ]
332
+ return x_final
333
+
334
+ def _absolute_position_to_relative_position(self, x):
335
+ """
336
+ x: [b, h, l, l]
337
+ ret: [b, h, l, 2*l-1]
338
+ """
339
+ batch, heads, length, _ = x.size()
340
+ # pad along column
341
+ x = F.pad(
342
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343
+ )
344
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345
+ # add 0's in the beginning that will skew the elements after reshape
346
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348
+ return x_final
349
+
350
+ def _attention_bias_proximal(self, length):
351
+ """Bias for self-attention to encourage attention to close positions.
352
+ Args:
353
+ length: an integer scalar.
354
+ Returns:
355
+ a Tensor with shape [1, 1, length, length]
356
+ """
357
+ r = torch.arange(length, dtype=torch.float32)
358
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
+
361
+
362
+ class FFN(nn.Module):
363
+ def __init__(
364
+ self,
365
+ in_channels,
366
+ out_channels,
367
+ filter_channels,
368
+ kernel_size,
369
+ p_dropout=0.0,
370
+ activation=None,
371
+ causal=False,
372
+ ):
373
+ super().__init__()
374
+ self.in_channels = in_channels
375
+ self.out_channels = out_channels
376
+ self.filter_channels = filter_channels
377
+ self.kernel_size = kernel_size
378
+ self.p_dropout = p_dropout
379
+ self.activation = activation
380
+ self.causal = causal
381
+
382
+ if causal:
383
+ self.padding = self._causal_padding
384
+ else:
385
+ self.padding = self._same_padding
386
+
387
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389
+ self.drop = nn.Dropout(p_dropout)
390
+
391
+ def forward(self, x, x_mask):
392
+ x = self.conv_1(self.padding(x * x_mask))
393
+ if self.activation == "gelu":
394
+ x = x * torch.sigmoid(1.702 * x)
395
+ else:
396
+ x = torch.relu(x)
397
+ x = self.drop(x)
398
+ x = self.conv_2(self.padding(x * x_mask))
399
+ return x * x_mask
400
+
401
+ def _causal_padding(self, x):
402
+ if self.kernel_size == 1:
403
+ return x
404
+ pad_l = self.kernel_size - 1
405
+ pad_r = 0
406
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407
+ x = F.pad(x, commons.convert_pad_shape(padding))
408
+ return x
409
+
410
+ def _same_padding(self, x):
411
+ if self.kernel_size == 1:
412
+ return x
413
+ pad_l = (self.kernel_size - 1) // 2
414
+ pad_r = self.kernel_size // 2
415
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416
+ x = F.pad(x, commons.convert_pad_shape(padding))
417
+ return x
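
Note: a small smoke test for the Encoder defined above (hyperparameters mirror the configs in this commit; batch size and sequence length are arbitrary):

import torch

from vc.infer_pack.attentions import Encoder

enc = Encoder(
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.0,
)
# Inputs are [batch, hidden_channels, time]; the mask is [batch, 1, time].
x = torch.randn(1, 192, 50)
x_mask = torch.ones(1, 1, 50)
out = enc(x, x_mask)
print(out.shape)  # torch.Size([1, 192, 50])
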
src/vc/infer_pack/commons.py ADDED
@@ -0,0 +1,166 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def init_weights(m, mean=0.0, std=0.01):
9
+ classname = m.__class__.__name__
10
+ if classname.find("Conv") != -1:
11
+ m.weight.data.normal_(mean, std)
12
+
13
+
14
+ def get_padding(kernel_size, dilation=1):
15
+ return int((kernel_size * dilation - dilation) / 2)
16
+
17
+
18
+ def convert_pad_shape(pad_shape):
19
+ l = pad_shape[::-1]
20
+ pad_shape = [item for sublist in l for item in sublist]
21
+ return pad_shape
22
+
23
+
24
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
25
+ """KL(P||Q)"""
26
+ kl = (logs_q - logs_p) - 0.5
27
+ kl += (
28
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29
+ )
30
+ return kl
31
+
32
+
33
+ def rand_gumbel(shape):
34
+ """Sample from the Gumbel distribution, protect from overflows."""
35
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36
+ return -torch.log(-torch.log(uniform_samples))
37
+
38
+
39
+ def rand_gumbel_like(x):
40
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41
+ return g
42
+
43
+
44
+ def slice_segments(x, ids_str, segment_size=4):
45
+ ret = torch.zeros_like(x[:, :, :segment_size])
46
+ for i in range(x.size(0)):
47
+ idx_str = ids_str[i]
48
+ idx_end = idx_str + segment_size
49
+ ret[i] = x[i, :, idx_str:idx_end]
50
+ return ret
51
+
52
+
53
+ def slice_segments2(x, ids_str, segment_size=4):
54
+ ret = torch.zeros_like(x[:, :segment_size])
55
+ for i in range(x.size(0)):
56
+ idx_str = ids_str[i]
57
+ idx_end = idx_str + segment_size
58
+ ret[i] = x[i, idx_str:idx_end]
59
+ return ret
60
+
61
+
62
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
63
+ b, d, t = x.size()
64
+ if x_lengths is None:
65
+ x_lengths = t
66
+ ids_str_max = x_lengths - segment_size + 1
67
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
68
+ ret = slice_segments(x, ids_str, segment_size)
69
+ return ret, ids_str
70
+
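# A minimal usage sketch (shapes are illustrative): rand_slice_segments picks one random
# window of segment_size frames per batch item, which is how training crops latents before decoding.
import torch
x = torch.randn(2, 192, 100)                       # [batch, channels, frames]
lengths = torch.tensor([100, 80])
seg, ids_str = rand_slice_segments(x, lengths, segment_size=32)
# seg.shape == (2, 192, 32); ids_str holds the random start frame chosen for each item.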
71
+
72
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
73
+ position = torch.arange(length, dtype=torch.float)
74
+ num_timescales = channels // 2
75
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
76
+ num_timescales - 1
77
+ )
78
+ inv_timescales = min_timescale * torch.exp(
79
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
80
+ )
81
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
82
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
83
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
84
+ signal = signal.view(1, channels, length)
85
+ return signal
86
+
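# A quick sketch of the timing signal above: it is the usual sinusoidal positional encoding,
# sines in the first channels // 2 rows and cosines in the rest, shaped [1, channels, length].
signal = get_timing_signal_1d(50, 192)             # shape (1, 192, 50)
# add_timing_signal_1d(x) adds this to a [b, 192, 50] tensor x;
# cat_timing_signal_1d(x) concatenates it along the channel axis instead.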
87
+
88
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
89
+ b, channels, length = x.size()
90
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
+ return x + signal.to(dtype=x.dtype, device=x.device)
92
+
93
+
94
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
95
+ b, channels, length = x.size()
96
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
97
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
98
+
99
+
100
+ def subsequent_mask(length):
101
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
102
+ return mask
103
+
104
+
105
+ @torch.jit.script
106
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
107
+ n_channels_int = n_channels[0]
108
+ in_act = input_a + input_b
109
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
110
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
111
+ acts = t_act * s_act
112
+ return acts
113
+
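# A minimal sketch of the gated activation above (WaveNet-style); note that n_channels is
# passed as a one-element IntTensor because the function is compiled with torch.jit.script.
import torch
a = torch.randn(1, 2 * 64, 100)
b = torch.randn(1, 2 * 64, 100)
acts = fused_add_tanh_sigmoid_multiply(a, b, torch.IntTensor([64]))
# acts.shape == (1, 64, 100): tanh of the first 64 channels of (a + b) times sigmoid of the rest.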
114
+
115
+ def convert_pad_shape(pad_shape):
116
+ l = pad_shape[::-1]
117
+ pad_shape = [item for sublist in l for item in sublist]
118
+ return pad_shape
119
+
120
+
121
+ def shift_1d(x):
122
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
123
+ return x
124
+
125
+
126
+ def sequence_mask(length, max_length=None):
127
+ if max_length is None:
128
+ max_length = length.max()
129
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
130
+ return x.unsqueeze(0) < length.unsqueeze(1)
131
+
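# A small worked example of sequence_mask: per-item lengths become a boolean [batch, max_len] mask.
import torch
mask = sequence_mask(torch.tensor([3, 5]), max_length=5)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True]])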
132
+
133
+ def generate_path(duration, mask):
134
+ """
135
+ duration: [b, 1, t_x]
136
+ mask: [b, 1, t_y, t_x]
137
+ """
138
+ device = duration.device
139
+
140
+ b, _, t_y, t_x = mask.shape
141
+ cum_duration = torch.cumsum(duration, -1)
142
+
143
+ cum_duration_flat = cum_duration.view(b * t_x)
144
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145
+ path = path.view(b, t_x, t_y)
146
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147
+ path = path.unsqueeze(1).transpose(2, 3) * mask
148
+ return path
149
+
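# A minimal sketch of generate_path with two text steps of duration 2 and 3 frames: the result
# is a hard monotonic alignment of shape [b, 1, t_y, t_x] in which each text step owns its frames.
import torch
duration = torch.tensor([[[2.0, 3.0]]])            # [b=1, 1, t_x=2]
mask = torch.ones(1, 1, 5, 2)                      # [b, 1, t_y=5, t_x=2]
path = generate_path(duration, mask)
# path[0, 0, :, 0] == tensor([1., 1., 0., 0., 0.])   # frames 0-1 -> text step 0
# path[0, 0, :, 1] == tensor([0., 0., 1., 1., 1.])   # frames 2-4 -> text step 1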
150
+
151
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
152
+ if isinstance(parameters, torch.Tensor):
153
+ parameters = [parameters]
154
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
155
+ norm_type = float(norm_type)
156
+ if clip_value is not None:
157
+ clip_value = float(clip_value)
158
+
159
+ total_norm = 0
160
+ for p in parameters:
161
+ param_norm = p.grad.data.norm(norm_type)
162
+ total_norm += param_norm.item() ** norm_type
163
+ if clip_value is not None:
164
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
165
+ total_norm = total_norm ** (1.0 / norm_type)
166
+ return total_norm
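# A minimal usage sketch, assuming `net_g` and `optim` are a training model and its optimizer:
# clip_grad_value_ clamps each gradient element to [-clip_value, clip_value] and returns the
# total gradient norm, so with clip_value=None it can also be used purely for norm logging.
grad_norm = clip_grad_value_(net_g.parameters(), clip_value=1.0)
optim.step()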
src/vc/infer_pack/models.py ADDED
@@ -0,0 +1,1128 @@
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from vc.infer_pack import modules
7
+ from vc.infer_pack import attentions
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vc.infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from vc.infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (
337
+ f0_buf / self.sampling_rate
338
+ ) % 1 # the % 1 means the n_har harmonic products cannot be optimized away afterwards
339
+ rand_ini = torch.rand(
340
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
341
+ )
342
+ rand_ini[:, 0] = 0
343
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
344
+ tmp_over_one = torch.cumsum(
345
+ rad_values, 1
346
+ ) # a % 1 here would keep the following cumsum from being optimized further
347
+ tmp_over_one *= upp
348
+ tmp_over_one = F.interpolate(
349
+ tmp_over_one.transpose(2, 1),
350
+ scale_factor=upp,
351
+ mode="linear",
352
+ align_corners=True,
353
+ ).transpose(2, 1)
354
+ rad_values = F.interpolate(
355
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
356
+ ).transpose(
357
+ 2, 1
358
+ ) #######
359
+ tmp_over_one %= 1
360
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
361
+ cumsum_shift = torch.zeros_like(rad_values)
362
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
363
+ sine_waves = torch.sin(
364
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
365
+ )
366
+ sine_waves = sine_waves * self.sine_amp
367
+ uv = self._f02uv(f0)
368
+ uv = F.interpolate(
369
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
370
+ ).transpose(2, 1)
371
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
372
+ noise = noise_amp * torch.randn_like(sine_waves)
373
+ sine_waves = sine_waves * uv + noise
374
+ return sine_waves, uv, noise
375
+
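# A minimal sketch of the idea in SineGen.forward above: the sine is generated by accumulating
# the instantaneous phase f0 / sampling_rate frame by frame and taking sin of the running sum
# (the harmonics just multiply f0 by 2, 3, ... before the same accumulation).
import math
import torch
sr = 40000
f0 = torch.full((1, 100, 1), 220.0)                # 100 frames of a 220 Hz tone
phase = torch.cumsum(f0 / sr, dim=1)               # corresponds to cumsum(rad_values) above
sine = 0.1 * torch.sin(2 * math.pi * phase)        # sine_amp defaults to 0.1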
376
+
377
+ class SourceModuleHnNSF(torch.nn.Module):
378
+ """SourceModule for hn-nsf
379
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
380
+ add_noise_std=0.003, voiced_threshod=0)
381
+ sampling_rate: sampling_rate in Hz
382
+ harmonic_num: number of harmonic above F0 (default: 0)
383
+ sine_amp: amplitude of sine source signal (default: 0.1)
384
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
385
+ note that amplitude of noise in unvoiced is decided
386
+ by sine_amp
387
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
388
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
389
+ F0_sampled (batchsize, length, 1)
390
+ Sine_source (batchsize, length, 1)
391
+ noise_source (batchsize, length, 1)
392
+ uv (batchsize, length, 1)
393
+ """
394
+
395
+ def __init__(
396
+ self,
397
+ sampling_rate,
398
+ harmonic_num=0,
399
+ sine_amp=0.1,
400
+ add_noise_std=0.003,
401
+ voiced_threshod=0,
402
+ is_half=True,
403
+ ):
404
+ super(SourceModuleHnNSF, self).__init__()
405
+
406
+ self.sine_amp = sine_amp
407
+ self.noise_std = add_noise_std
408
+ self.is_half = is_half
409
+ # to produce sine waveforms
410
+ self.l_sin_gen = SineGen(
411
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
412
+ )
413
+
414
+ # to merge source harmonics into a single excitation
415
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
416
+ self.l_tanh = torch.nn.Tanh()
417
+
418
+ def forward(self, x, upp=None):
419
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
420
+ if self.is_half:
421
+ sine_wavs = sine_wavs.half()
422
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
423
+ return sine_merge, None, None # noise, uv
424
+
425
+
426
+ class GeneratorNSF(torch.nn.Module):
427
+ def __init__(
428
+ self,
429
+ initial_channel,
430
+ resblock,
431
+ resblock_kernel_sizes,
432
+ resblock_dilation_sizes,
433
+ upsample_rates,
434
+ upsample_initial_channel,
435
+ upsample_kernel_sizes,
436
+ gin_channels,
437
+ sr,
438
+ is_half=False,
439
+ ):
440
+ super(GeneratorNSF, self).__init__()
441
+ self.num_kernels = len(resblock_kernel_sizes)
442
+ self.num_upsamples = len(upsample_rates)
443
+
444
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
445
+ self.m_source = SourceModuleHnNSF(
446
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
447
+ )
448
+ self.noise_convs = nn.ModuleList()
449
+ self.conv_pre = Conv1d(
450
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
451
+ )
452
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
453
+
454
+ self.ups = nn.ModuleList()
455
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
456
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
457
+ self.ups.append(
458
+ weight_norm(
459
+ ConvTranspose1d(
460
+ upsample_initial_channel // (2**i),
461
+ upsample_initial_channel // (2 ** (i + 1)),
462
+ k,
463
+ u,
464
+ padding=(k - u) // 2,
465
+ )
466
+ )
467
+ )
468
+ if i + 1 < len(upsample_rates):
469
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
470
+ self.noise_convs.append(
471
+ Conv1d(
472
+ 1,
473
+ c_cur,
474
+ kernel_size=stride_f0 * 2,
475
+ stride=stride_f0,
476
+ padding=stride_f0 // 2,
477
+ )
478
+ )
479
+ else:
480
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
481
+
482
+ self.resblocks = nn.ModuleList()
483
+ for i in range(len(self.ups)):
484
+ ch = upsample_initial_channel // (2 ** (i + 1))
485
+ for j, (k, d) in enumerate(
486
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
487
+ ):
488
+ self.resblocks.append(resblock(ch, k, d))
489
+
490
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
491
+ self.ups.apply(init_weights)
492
+
493
+ if gin_channels != 0:
494
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
495
+
496
+ self.upp = np.prod(upsample_rates)
497
+
498
+ def forward(self, x, f0, g=None):
499
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
500
+ har_source = har_source.transpose(1, 2)
501
+ x = self.conv_pre(x)
502
+ if g is not None:
503
+ x = x + self.cond(g)
504
+
505
+ for i in range(self.num_upsamples):
506
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
507
+ x = self.ups[i](x)
508
+ x_source = self.noise_convs[i](har_source)
509
+ x = x + x_source
510
+ xs = None
511
+ for j in range(self.num_kernels):
512
+ if xs is None:
513
+ xs = self.resblocks[i * self.num_kernels + j](x)
514
+ else:
515
+ xs += self.resblocks[i * self.num_kernels + j](x)
516
+ x = xs / self.num_kernels
517
+ x = F.leaky_relu(x)
518
+ x = self.conv_post(x)
519
+ x = torch.tanh(x)
520
+ return x
521
+
522
+ def remove_weight_norm(self):
523
+ for l in self.ups:
524
+ remove_weight_norm(l)
525
+ for l in self.resblocks:
526
+ l.remove_weight_norm()
527
+
528
+
529
+ sr2sr = {
530
+ "32k": 32000,
531
+ "40k": 40000,
532
+ "48k": 48000,
533
+ }
534
+
535
+
536
+ class SynthesizerTrnMs256NSFsid(nn.Module):
537
+ def __init__(
538
+ self,
539
+ spec_channels,
540
+ segment_size,
541
+ inter_channels,
542
+ hidden_channels,
543
+ filter_channels,
544
+ n_heads,
545
+ n_layers,
546
+ kernel_size,
547
+ p_dropout,
548
+ resblock,
549
+ resblock_kernel_sizes,
550
+ resblock_dilation_sizes,
551
+ upsample_rates,
552
+ upsample_initial_channel,
553
+ upsample_kernel_sizes,
554
+ spk_embed_dim,
555
+ gin_channels,
556
+ sr,
557
+ **kwargs
558
+ ):
559
+ super().__init__()
560
+ if type(sr) == type("strr"):
561
+ sr = sr2sr[sr]
562
+ self.spec_channels = spec_channels
563
+ self.inter_channels = inter_channels
564
+ self.hidden_channels = hidden_channels
565
+ self.filter_channels = filter_channels
566
+ self.n_heads = n_heads
567
+ self.n_layers = n_layers
568
+ self.kernel_size = kernel_size
569
+ self.p_dropout = p_dropout
570
+ self.resblock = resblock
571
+ self.resblock_kernel_sizes = resblock_kernel_sizes
572
+ self.resblock_dilation_sizes = resblock_dilation_sizes
573
+ self.upsample_rates = upsample_rates
574
+ self.upsample_initial_channel = upsample_initial_channel
575
+ self.upsample_kernel_sizes = upsample_kernel_sizes
576
+ self.segment_size = segment_size
577
+ self.gin_channels = gin_channels
578
+ # self.hop_length = hop_length#
579
+ self.spk_embed_dim = spk_embed_dim
580
+ self.enc_p = TextEncoder256(
581
+ inter_channels,
582
+ hidden_channels,
583
+ filter_channels,
584
+ n_heads,
585
+ n_layers,
586
+ kernel_size,
587
+ p_dropout,
588
+ )
589
+ self.dec = GeneratorNSF(
590
+ inter_channels,
591
+ resblock,
592
+ resblock_kernel_sizes,
593
+ resblock_dilation_sizes,
594
+ upsample_rates,
595
+ upsample_initial_channel,
596
+ upsample_kernel_sizes,
597
+ gin_channels=gin_channels,
598
+ sr=sr,
599
+ is_half=kwargs["is_half"],
600
+ )
601
+ self.enc_q = PosteriorEncoder(
602
+ spec_channels,
603
+ inter_channels,
604
+ hidden_channels,
605
+ 5,
606
+ 1,
607
+ 16,
608
+ gin_channels=gin_channels,
609
+ )
610
+ self.flow = ResidualCouplingBlock(
611
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
612
+ )
613
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
614
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
615
+
616
+ def remove_weight_norm(self):
617
+ self.dec.remove_weight_norm()
618
+ self.flow.remove_weight_norm()
619
+ self.enc_q.remove_weight_norm()
620
+
621
+ def forward(
622
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
623
+ ): # ds is the speaker id, shape [bs, 1]
624
+ # print(1,pitch.shape)#[bs,t]
625
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
626
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
627
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
628
+ z_p = self.flow(z, y_mask, g=g)
629
+ z_slice, ids_slice = commons.rand_slice_segments(
630
+ z, y_lengths, self.segment_size
631
+ )
632
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
633
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
634
+ # print(-2,pitchf.shape,z_slice.shape)
635
+ o = self.dec(z_slice, pitchf, g=g)
636
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
637
+
638
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
639
+ g = self.emb_g(sid).unsqueeze(-1)
640
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
641
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
642
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
643
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
644
+ return o, x_mask, (z, z_p, m_p, logs_p)
645
+
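# A minimal usage sketch with hypothetical tensor names and shapes: at inference the prior is
# sampled at a 0.66666 temperature, run backwards through the flow and decoded by the NSF vocoder.
# phone: [1, T, 256] content features, pitch: [1, T] coarse pitch ids, nsff0: [1, T] f0 in Hz,
# sid: [1] speaker id, phone_lengths: [1].
audio, _, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)
# `net_g` is an instance of SynthesizerTrnMs256NSFsid; audio has shape
# [1, 1, T * prod(upsample_rates)].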
646
+
647
+ class SynthesizerTrnMs768NSFsid(nn.Module):
648
+ def __init__(
649
+ self,
650
+ spec_channels,
651
+ segment_size,
652
+ inter_channels,
653
+ hidden_channels,
654
+ filter_channels,
655
+ n_heads,
656
+ n_layers,
657
+ kernel_size,
658
+ p_dropout,
659
+ resblock,
660
+ resblock_kernel_sizes,
661
+ resblock_dilation_sizes,
662
+ upsample_rates,
663
+ upsample_initial_channel,
664
+ upsample_kernel_sizes,
665
+ spk_embed_dim,
666
+ gin_channels,
667
+ sr,
668
+ **kwargs
669
+ ):
670
+ super().__init__()
671
+ if type(sr) == type("strr"):
672
+ sr = sr2sr[sr]
673
+ self.spec_channels = spec_channels
674
+ self.inter_channels = inter_channels
675
+ self.hidden_channels = hidden_channels
676
+ self.filter_channels = filter_channels
677
+ self.n_heads = n_heads
678
+ self.n_layers = n_layers
679
+ self.kernel_size = kernel_size
680
+ self.p_dropout = p_dropout
681
+ self.resblock = resblock
682
+ self.resblock_kernel_sizes = resblock_kernel_sizes
683
+ self.resblock_dilation_sizes = resblock_dilation_sizes
684
+ self.upsample_rates = upsample_rates
685
+ self.upsample_initial_channel = upsample_initial_channel
686
+ self.upsample_kernel_sizes = upsample_kernel_sizes
687
+ self.segment_size = segment_size
688
+ self.gin_channels = gin_channels
689
+ # self.hop_length = hop_length#
690
+ self.spk_embed_dim = spk_embed_dim
691
+ self.enc_p = TextEncoder768(
692
+ inter_channels,
693
+ hidden_channels,
694
+ filter_channels,
695
+ n_heads,
696
+ n_layers,
697
+ kernel_size,
698
+ p_dropout,
699
+ )
700
+ self.dec = GeneratorNSF(
701
+ inter_channels,
702
+ resblock,
703
+ resblock_kernel_sizes,
704
+ resblock_dilation_sizes,
705
+ upsample_rates,
706
+ upsample_initial_channel,
707
+ upsample_kernel_sizes,
708
+ gin_channels=gin_channels,
709
+ sr=sr,
710
+ is_half=kwargs["is_half"],
711
+ )
712
+ self.enc_q = PosteriorEncoder(
713
+ spec_channels,
714
+ inter_channels,
715
+ hidden_channels,
716
+ 5,
717
+ 1,
718
+ 16,
719
+ gin_channels=gin_channels,
720
+ )
721
+ self.flow = ResidualCouplingBlock(
722
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
723
+ )
724
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
725
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
726
+
727
+ def remove_weight_norm(self):
728
+ self.dec.remove_weight_norm()
729
+ self.flow.remove_weight_norm()
730
+ self.enc_q.remove_weight_norm()
731
+
732
+ def forward(
733
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
734
+ ): # ds is the speaker id, shape [bs, 1]
735
+ # print(1,pitch.shape)#[bs,t]
736
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
737
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
738
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
739
+ z_p = self.flow(z, y_mask, g=g)
740
+ z_slice, ids_slice = commons.rand_slice_segments(
741
+ z, y_lengths, self.segment_size
742
+ )
743
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
744
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
745
+ # print(-2,pitchf.shape,z_slice.shape)
746
+ o = self.dec(z_slice, pitchf, g=g)
747
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
748
+
749
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
750
+ g = self.emb_g(sid).unsqueeze(-1)
751
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
752
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
753
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
754
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
755
+ return o, x_mask, (z, z_p, m_p, logs_p)
756
+
757
+
758
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
759
+ def __init__(
760
+ self,
761
+ spec_channels,
762
+ segment_size,
763
+ inter_channels,
764
+ hidden_channels,
765
+ filter_channels,
766
+ n_heads,
767
+ n_layers,
768
+ kernel_size,
769
+ p_dropout,
770
+ resblock,
771
+ resblock_kernel_sizes,
772
+ resblock_dilation_sizes,
773
+ upsample_rates,
774
+ upsample_initial_channel,
775
+ upsample_kernel_sizes,
776
+ spk_embed_dim,
777
+ gin_channels,
778
+ sr=None,
779
+ **kwargs
780
+ ):
781
+ super().__init__()
782
+ self.spec_channels = spec_channels
783
+ self.inter_channels = inter_channels
784
+ self.hidden_channels = hidden_channels
785
+ self.filter_channels = filter_channels
786
+ self.n_heads = n_heads
787
+ self.n_layers = n_layers
788
+ self.kernel_size = kernel_size
789
+ self.p_dropout = p_dropout
790
+ self.resblock = resblock
791
+ self.resblock_kernel_sizes = resblock_kernel_sizes
792
+ self.resblock_dilation_sizes = resblock_dilation_sizes
793
+ self.upsample_rates = upsample_rates
794
+ self.upsample_initial_channel = upsample_initial_channel
795
+ self.upsample_kernel_sizes = upsample_kernel_sizes
796
+ self.segment_size = segment_size
797
+ self.gin_channels = gin_channels
798
+ # self.hop_length = hop_length#
799
+ self.spk_embed_dim = spk_embed_dim
800
+ self.enc_p = TextEncoder256(
801
+ inter_channels,
802
+ hidden_channels,
803
+ filter_channels,
804
+ n_heads,
805
+ n_layers,
806
+ kernel_size,
807
+ p_dropout,
808
+ f0=False,
809
+ )
810
+ self.dec = Generator(
811
+ inter_channels,
812
+ resblock,
813
+ resblock_kernel_sizes,
814
+ resblock_dilation_sizes,
815
+ upsample_rates,
816
+ upsample_initial_channel,
817
+ upsample_kernel_sizes,
818
+ gin_channels=gin_channels,
819
+ )
820
+ self.enc_q = PosteriorEncoder(
821
+ spec_channels,
822
+ inter_channels,
823
+ hidden_channels,
824
+ 5,
825
+ 1,
826
+ 16,
827
+ gin_channels=gin_channels,
828
+ )
829
+ self.flow = ResidualCouplingBlock(
830
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
831
+ )
832
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
833
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
834
+
835
+ def remove_weight_norm(self):
836
+ self.dec.remove_weight_norm()
837
+ self.flow.remove_weight_norm()
838
+ self.enc_q.remove_weight_norm()
839
+
840
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # ds is the speaker id, shape [bs, 1]
841
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
842
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
843
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
844
+ z_p = self.flow(z, y_mask, g=g)
845
+ z_slice, ids_slice = commons.rand_slice_segments(
846
+ z, y_lengths, self.segment_size
847
+ )
848
+ o = self.dec(z_slice, g=g)
849
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
850
+
851
+ def infer(self, phone, phone_lengths, sid, max_len=None):
852
+ g = self.emb_g(sid).unsqueeze(-1)
853
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
854
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
855
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
856
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
857
+ return o, x_mask, (z, z_p, m_p, logs_p)
858
+
859
+
860
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
861
+ def __init__(
862
+ self,
863
+ spec_channels,
864
+ segment_size,
865
+ inter_channels,
866
+ hidden_channels,
867
+ filter_channels,
868
+ n_heads,
869
+ n_layers,
870
+ kernel_size,
871
+ p_dropout,
872
+ resblock,
873
+ resblock_kernel_sizes,
874
+ resblock_dilation_sizes,
875
+ upsample_rates,
876
+ upsample_initial_channel,
877
+ upsample_kernel_sizes,
878
+ spk_embed_dim,
879
+ gin_channels,
880
+ sr=None,
881
+ **kwargs
882
+ ):
883
+ super().__init__()
884
+ self.spec_channels = spec_channels
885
+ self.inter_channels = inter_channels
886
+ self.hidden_channels = hidden_channels
887
+ self.filter_channels = filter_channels
888
+ self.n_heads = n_heads
889
+ self.n_layers = n_layers
890
+ self.kernel_size = kernel_size
891
+ self.p_dropout = p_dropout
892
+ self.resblock = resblock
893
+ self.resblock_kernel_sizes = resblock_kernel_sizes
894
+ self.resblock_dilation_sizes = resblock_dilation_sizes
895
+ self.upsample_rates = upsample_rates
896
+ self.upsample_initial_channel = upsample_initial_channel
897
+ self.upsample_kernel_sizes = upsample_kernel_sizes
898
+ self.segment_size = segment_size
899
+ self.gin_channels = gin_channels
900
+ # self.hop_length = hop_length#
901
+ self.spk_embed_dim = spk_embed_dim
902
+ self.enc_p = TextEncoder768(
903
+ inter_channels,
904
+ hidden_channels,
905
+ filter_channels,
906
+ n_heads,
907
+ n_layers,
908
+ kernel_size,
909
+ p_dropout,
910
+ f0=False,
911
+ )
912
+ self.dec = Generator(
913
+ inter_channels,
914
+ resblock,
915
+ resblock_kernel_sizes,
916
+ resblock_dilation_sizes,
917
+ upsample_rates,
918
+ upsample_initial_channel,
919
+ upsample_kernel_sizes,
920
+ gin_channels=gin_channels,
921
+ )
922
+ self.enc_q = PosteriorEncoder(
923
+ spec_channels,
924
+ inter_channels,
925
+ hidden_channels,
926
+ 5,
927
+ 1,
928
+ 16,
929
+ gin_channels=gin_channels,
930
+ )
931
+ self.flow = ResidualCouplingBlock(
932
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
933
+ )
934
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
935
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
936
+
937
+ def remove_weight_norm(self):
938
+ self.dec.remove_weight_norm()
939
+ self.flow.remove_weight_norm()
940
+ self.enc_q.remove_weight_norm()
941
+
942
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # ds is the speaker id, shape [bs, 1]
943
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
944
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
945
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
946
+ z_p = self.flow(z, y_mask, g=g)
947
+ z_slice, ids_slice = commons.rand_slice_segments(
948
+ z, y_lengths, self.segment_size
949
+ )
950
+ o = self.dec(z_slice, g=g)
951
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
952
+
953
+ def infer(self, phone, phone_lengths, sid, max_len=None):
954
+ g = self.emb_g(sid).unsqueeze(-1)
955
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
956
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
957
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
958
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
959
+ return o, x_mask, (z, z_p, m_p, logs_p)
960
+
961
+
962
+ class MultiPeriodDiscriminator(torch.nn.Module):
963
+ def __init__(self, use_spectral_norm=False):
964
+ super(MultiPeriodDiscriminator, self).__init__()
965
+ periods = [2, 3, 5, 7, 11, 17]
966
+ # periods = [3, 5, 7, 11, 17, 23, 37]
967
+
968
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
969
+ discs = discs + [
970
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
971
+ ]
972
+ self.discriminators = nn.ModuleList(discs)
973
+
974
+ def forward(self, y, y_hat):
975
+ y_d_rs = [] #
976
+ y_d_gs = []
977
+ fmap_rs = []
978
+ fmap_gs = []
979
+ for i, d in enumerate(self.discriminators):
980
+ y_d_r, fmap_r = d(y)
981
+ y_d_g, fmap_g = d(y_hat)
982
+ # for j in range(len(fmap_r)):
983
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
984
+ y_d_rs.append(y_d_r)
985
+ y_d_gs.append(y_d_g)
986
+ fmap_rs.append(fmap_r)
987
+ fmap_gs.append(fmap_g)
988
+
989
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
990
+
991
+
992
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
993
+ def __init__(self, use_spectral_norm=False):
994
+ super(MultiPeriodDiscriminatorV2, self).__init__()
995
+ # periods = [2, 3, 5, 7, 11, 17]
996
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
997
+
998
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
999
+ discs = discs + [
1000
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1001
+ ]
1002
+ self.discriminators = nn.ModuleList(discs)
1003
+
1004
+ def forward(self, y, y_hat):
1005
+ y_d_rs = [] #
1006
+ y_d_gs = []
1007
+ fmap_rs = []
1008
+ fmap_gs = []
1009
+ for i, d in enumerate(self.discriminators):
1010
+ y_d_r, fmap_r = d(y)
1011
+ y_d_g, fmap_g = d(y_hat)
1012
+ # for j in range(len(fmap_r)):
1013
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1014
+ y_d_rs.append(y_d_r)
1015
+ y_d_gs.append(y_d_g)
1016
+ fmap_rs.append(fmap_r)
1017
+ fmap_gs.append(fmap_g)
1018
+
1019
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1020
+
1021
+
1022
+ class DiscriminatorS(torch.nn.Module):
1023
+ def __init__(self, use_spectral_norm=False):
1024
+ super(DiscriminatorS, self).__init__()
1025
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1026
+ self.convs = nn.ModuleList(
1027
+ [
1028
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1029
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1030
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1031
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1032
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1033
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1034
+ ]
1035
+ )
1036
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1037
+
1038
+ def forward(self, x):
1039
+ fmap = []
1040
+
1041
+ for l in self.convs:
1042
+ x = l(x)
1043
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1044
+ fmap.append(x)
1045
+ x = self.conv_post(x)
1046
+ fmap.append(x)
1047
+ x = torch.flatten(x, 1, -1)
1048
+
1049
+ return x, fmap
1050
+
1051
+
1052
+ class DiscriminatorP(torch.nn.Module):
1053
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1054
+ super(DiscriminatorP, self).__init__()
1055
+ self.period = period
1056
+ self.use_spectral_norm = use_spectral_norm
1057
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1058
+ self.convs = nn.ModuleList(
1059
+ [
1060
+ norm_f(
1061
+ Conv2d(
1062
+ 1,
1063
+ 32,
1064
+ (kernel_size, 1),
1065
+ (stride, 1),
1066
+ padding=(get_padding(kernel_size, 1), 0),
1067
+ )
1068
+ ),
1069
+ norm_f(
1070
+ Conv2d(
1071
+ 32,
1072
+ 128,
1073
+ (kernel_size, 1),
1074
+ (stride, 1),
1075
+ padding=(get_padding(kernel_size, 1), 0),
1076
+ )
1077
+ ),
1078
+ norm_f(
1079
+ Conv2d(
1080
+ 128,
1081
+ 512,
1082
+ (kernel_size, 1),
1083
+ (stride, 1),
1084
+ padding=(get_padding(kernel_size, 1), 0),
1085
+ )
1086
+ ),
1087
+ norm_f(
1088
+ Conv2d(
1089
+ 512,
1090
+ 1024,
1091
+ (kernel_size, 1),
1092
+ (stride, 1),
1093
+ padding=(get_padding(kernel_size, 1), 0),
1094
+ )
1095
+ ),
1096
+ norm_f(
1097
+ Conv2d(
1098
+ 1024,
1099
+ 1024,
1100
+ (kernel_size, 1),
1101
+ 1,
1102
+ padding=(get_padding(kernel_size, 1), 0),
1103
+ )
1104
+ ),
1105
+ ]
1106
+ )
1107
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1108
+
1109
+ def forward(self, x):
1110
+ fmap = []
1111
+
1112
+ # 1d to 2d
1113
+ b, c, t = x.shape
1114
+ if t % self.period != 0: # pad first
1115
+ n_pad = self.period - (t % self.period)
1116
+ x = F.pad(x, (0, n_pad), "reflect")
1117
+ t = t + n_pad
1118
+ x = x.view(b, c, t // self.period, self.period)
1119
+
1120
+ for l in self.convs:
1121
+ x = l(x)
1122
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1123
+ fmap.append(x)
1124
+ x = self.conv_post(x)
1125
+ fmap.append(x)
1126
+ x = torch.flatten(x, 1, -1)
1127
+
1128
+ return x, fmap
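# A minimal sketch of the 1d-to-2d step in DiscriminatorP.forward above: the waveform is
# reflect-padded to a multiple of `period` and folded so samples that are `period` apart end up
# in the same column, which the (kernel_size, 1) Conv2d kernels then convolve down.
import torch
import torch.nn.functional as F
period, (b, c, t) = 3, (1, 1, 10)
x = torch.randn(b, c, t)
n_pad = period - (t % period)                      # 2 samples of reflect padding
x = F.pad(x, (0, n_pad), "reflect")
x = x.view(b, c, (t + n_pad) // period, period)    # shape (1, 1, 4, 3)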
src/vc/infer_pack/models_onnx.py ADDED
@@ -0,0 +1,822 @@
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from vc.infer_pack import modules
7
+ from vc.infer_pack import attentions
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vc.infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from vc.infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (
337
+ f0_buf / self.sampling_rate
338
+ ) % 1 # the % 1 means the n_har harmonic products cannot be optimized away afterwards
339
+ rand_ini = torch.rand(
340
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
341
+ )
342
+ rand_ini[:, 0] = 0
343
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
344
+ tmp_over_one = torch.cumsum(
345
+ rad_values, 1
346
+ ) # a % 1 here would keep the following cumsum from being optimized further
347
+ tmp_over_one *= upp
348
+ tmp_over_one = F.interpolate(
349
+ tmp_over_one.transpose(2, 1),
350
+ scale_factor=upp,
351
+ mode="linear",
352
+ align_corners=True,
353
+ ).transpose(2, 1)
354
+ rad_values = F.interpolate(
355
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
356
+ ).transpose(
357
+ 2, 1
358
+ ) #######
359
+ tmp_over_one %= 1
360
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
361
+ cumsum_shift = torch.zeros_like(rad_values)
362
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
363
+ sine_waves = torch.sin(
364
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
365
+ )
366
+ sine_waves = sine_waves * self.sine_amp
367
+ uv = self._f02uv(f0)
368
+ uv = F.interpolate(
369
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
370
+ ).transpose(2, 1)
371
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
372
+ noise = noise_amp * torch.randn_like(sine_waves)
373
+ sine_waves = sine_waves * uv + noise
374
+ return sine_waves, uv, noise
375
+
376
+
377
+ class SourceModuleHnNSF(torch.nn.Module):
378
+ """SourceModule for hn-nsf
379
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
380
+ add_noise_std=0.003, voiced_threshod=0)
381
+ sampling_rate: sampling_rate in Hz
382
+ harmonic_num: number of harmonic above F0 (default: 0)
383
+ sine_amp: amplitude of sine source signal (default: 0.1)
384
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
385
+ note that amplitude of noise in unvoiced is decided
386
+ by sine_amp
387
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
388
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
389
+ F0_sampled (batchsize, length, 1)
390
+ Sine_source (batchsize, length, 1)
391
+ noise_source (batchsize, length, 1)
392
+ uv (batchsize, length, 1)
393
+ """
394
+
395
+ def __init__(
396
+ self,
397
+ sampling_rate,
398
+ harmonic_num=0,
399
+ sine_amp=0.1,
400
+ add_noise_std=0.003,
401
+ voiced_threshod=0,
402
+ is_half=True,
403
+ ):
404
+ super(SourceModuleHnNSF, self).__init__()
405
+
406
+ self.sine_amp = sine_amp
407
+ self.noise_std = add_noise_std
408
+ self.is_half = is_half
409
+ # to produce sine waveforms
410
+ self.l_sin_gen = SineGen(
411
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
412
+ )
413
+
414
+ # to merge source harmonics into a single excitation
415
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
416
+ self.l_tanh = torch.nn.Tanh()
417
+
418
+ def forward(self, x, upp=None):
419
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
420
+ if self.is_half:
421
+ sine_wavs = sine_wavs.half()
422
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
423
+ return sine_merge, None, None # noise, uv
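A minimal usage sketch of the source module above (illustrative only, not part of the uploaded file). It assumes the repository's `src` directory is on the import path; the pitch contour and upsampling factor are made-up example values.

```python
import torch

from vc.infer_pack.models_onnx import SourceModuleHnNSF  # module added by this commit

# 100 F0 frames at a constant 220 Hz for a batch of one utterance.
f0 = torch.full((1, 100), 220.0)

# upp is the frame-to-sample upsampling factor (hop length); 400 is an example value.
m_source = SourceModuleHnNSF(sampling_rate=40000, harmonic_num=0, is_half=False)
sine_merge, _, _ = m_source(f0, upp=400)

print(sine_merge.shape)  # torch.Size([1, 40000, 1]): merged harmonic excitation signal
```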
424
+
425
+
426
+ class GeneratorNSF(torch.nn.Module):
427
+ def __init__(
428
+ self,
429
+ initial_channel,
430
+ resblock,
431
+ resblock_kernel_sizes,
432
+ resblock_dilation_sizes,
433
+ upsample_rates,
434
+ upsample_initial_channel,
435
+ upsample_kernel_sizes,
436
+ gin_channels,
437
+ sr,
438
+ is_half=False,
439
+ ):
440
+ super(GeneratorNSF, self).__init__()
441
+ self.num_kernels = len(resblock_kernel_sizes)
442
+ self.num_upsamples = len(upsample_rates)
443
+
444
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
445
+ self.m_source = SourceModuleHnNSF(
446
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
447
+ )
448
+ self.noise_convs = nn.ModuleList()
449
+ self.conv_pre = Conv1d(
450
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
451
+ )
452
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
453
+
454
+ self.ups = nn.ModuleList()
455
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
456
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
457
+ self.ups.append(
458
+ weight_norm(
459
+ ConvTranspose1d(
460
+ upsample_initial_channel // (2**i),
461
+ upsample_initial_channel // (2 ** (i + 1)),
462
+ k,
463
+ u,
464
+ padding=(k - u) // 2,
465
+ )
466
+ )
467
+ )
468
+ if i + 1 < len(upsample_rates):
469
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
470
+ self.noise_convs.append(
471
+ Conv1d(
472
+ 1,
473
+ c_cur,
474
+ kernel_size=stride_f0 * 2,
475
+ stride=stride_f0,
476
+ padding=stride_f0 // 2,
477
+ )
478
+ )
479
+ else:
480
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
481
+
482
+ self.resblocks = nn.ModuleList()
483
+ for i in range(len(self.ups)):
484
+ ch = upsample_initial_channel // (2 ** (i + 1))
485
+ for j, (k, d) in enumerate(
486
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
487
+ ):
488
+ self.resblocks.append(resblock(ch, k, d))
489
+
490
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
491
+ self.ups.apply(init_weights)
492
+
493
+ if gin_channels != 0:
494
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
495
+
496
+ self.upp = np.prod(upsample_rates)
497
+
498
+ def forward(self, x, f0, g=None):
499
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
500
+ har_source = har_source.transpose(1, 2)
501
+ x = self.conv_pre(x)
502
+ if g is not None:
503
+ x = x + self.cond(g)
504
+
505
+ for i in range(self.num_upsamples):
506
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
507
+ x = self.ups[i](x)
508
+ x_source = self.noise_convs[i](har_source)
509
+ x = x + x_source
510
+ xs = None
511
+ for j in range(self.num_kernels):
512
+ if xs is None:
513
+ xs = self.resblocks[i * self.num_kernels + j](x)
514
+ else:
515
+ xs += self.resblocks[i * self.num_kernels + j](x)
516
+ x = xs / self.num_kernels
517
+ x = F.leaky_relu(x)
518
+ x = self.conv_post(x)
519
+ x = torch.tanh(x)
520
+ return x
521
+
522
+ def remove_weight_norm(self):
523
+ for l in self.ups:
524
+ remove_weight_norm(l)
525
+ for l in self.resblocks:
526
+ l.remove_weight_norm()
527
+
528
+
529
+ sr2sr = {
530
+ "32k": 32000,
531
+ "40k": 40000,
532
+ "48k": 48000,
533
+ }
534
+
535
+
536
+ class SynthesizerTrnMsNSFsidM(nn.Module):
537
+ def __init__(
538
+ self,
539
+ spec_channels,
540
+ segment_size,
541
+ inter_channels,
542
+ hidden_channels,
543
+ filter_channels,
544
+ n_heads,
545
+ n_layers,
546
+ kernel_size,
547
+ p_dropout,
548
+ resblock,
549
+ resblock_kernel_sizes,
550
+ resblock_dilation_sizes,
551
+ upsample_rates,
552
+ upsample_initial_channel,
553
+ upsample_kernel_sizes,
554
+ spk_embed_dim,
555
+ gin_channels,
556
+ sr,
557
+ **kwargs
558
+ ):
559
+ super().__init__()
560
+ if type(sr) == type("strr"):
561
+ sr = sr2sr[sr]
562
+ self.spec_channels = spec_channels
563
+ self.inter_channels = inter_channels
564
+ self.hidden_channels = hidden_channels
565
+ self.filter_channels = filter_channels
566
+ self.n_heads = n_heads
567
+ self.n_layers = n_layers
568
+ self.kernel_size = kernel_size
569
+ self.p_dropout = p_dropout
570
+ self.resblock = resblock
571
+ self.resblock_kernel_sizes = resblock_kernel_sizes
572
+ self.resblock_dilation_sizes = resblock_dilation_sizes
573
+ self.upsample_rates = upsample_rates
574
+ self.upsample_initial_channel = upsample_initial_channel
575
+ self.upsample_kernel_sizes = upsample_kernel_sizes
576
+ self.segment_size = segment_size
577
+ self.gin_channels = gin_channels
578
+ # self.hop_length = hop_length#
579
+ self.spk_embed_dim = spk_embed_dim
580
+ if self.gin_channels == 256:
581
+ self.enc_p = TextEncoder256(
582
+ inter_channels,
583
+ hidden_channels,
584
+ filter_channels,
585
+ n_heads,
586
+ n_layers,
587
+ kernel_size,
588
+ p_dropout,
589
+ )
590
+ else:
591
+ self.enc_p = TextEncoder768(
592
+ inter_channels,
593
+ hidden_channels,
594
+ filter_channels,
595
+ n_heads,
596
+ n_layers,
597
+ kernel_size,
598
+ p_dropout,
599
+ )
600
+ self.dec = GeneratorNSF(
601
+ inter_channels,
602
+ resblock,
603
+ resblock_kernel_sizes,
604
+ resblock_dilation_sizes,
605
+ upsample_rates,
606
+ upsample_initial_channel,
607
+ upsample_kernel_sizes,
608
+ gin_channels=gin_channels,
609
+ sr=sr,
610
+ is_half=kwargs["is_half"],
611
+ )
612
+ self.enc_q = PosteriorEncoder(
613
+ spec_channels,
614
+ inter_channels,
615
+ hidden_channels,
616
+ 5,
617
+ 1,
618
+ 16,
619
+ gin_channels=gin_channels,
620
+ )
621
+ self.flow = ResidualCouplingBlock(
622
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
623
+ )
624
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
625
+ self.speaker_map = None
626
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
627
+
628
+ def remove_weight_norm(self):
629
+ self.dec.remove_weight_norm()
630
+ self.flow.remove_weight_norm()
631
+ self.enc_q.remove_weight_norm()
632
+
633
+ def construct_spkmixmap(self, n_speaker):
634
+ self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
635
+ for i in range(n_speaker):
636
+ self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
637
+ self.speaker_map = self.speaker_map.unsqueeze(0)
638
+
639
+ def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
640
+ if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
641
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
642
+ g = g * self.speaker_map # [N, S, B, 1, H]
643
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
644
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
645
+ else:
646
+ g = g.unsqueeze(0)
647
+ g = self.emb_g(g).transpose(1, 2)
648
+
649
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
650
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
651
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
652
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
653
+ return o
654
+
655
+
656
+ class MultiPeriodDiscriminator(torch.nn.Module):
657
+ def __init__(self, use_spectral_norm=False):
658
+ super(MultiPeriodDiscriminator, self).__init__()
659
+ periods = [2, 3, 5, 7, 11, 17]
660
+ # periods = [3, 5, 7, 11, 17, 23, 37]
661
+
662
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
663
+ discs = discs + [
664
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
665
+ ]
666
+ self.discriminators = nn.ModuleList(discs)
667
+
668
+ def forward(self, y, y_hat):
669
+ y_d_rs = [] #
670
+ y_d_gs = []
671
+ fmap_rs = []
672
+ fmap_gs = []
673
+ for i, d in enumerate(self.discriminators):
674
+ y_d_r, fmap_r = d(y)
675
+ y_d_g, fmap_g = d(y_hat)
676
+ # for j in range(len(fmap_r)):
677
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
678
+ y_d_rs.append(y_d_r)
679
+ y_d_gs.append(y_d_g)
680
+ fmap_rs.append(fmap_r)
681
+ fmap_gs.append(fmap_g)
682
+
683
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
684
+
685
+
686
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
687
+ def __init__(self, use_spectral_norm=False):
688
+ super(MultiPeriodDiscriminatorV2, self).__init__()
689
+ # periods = [2, 3, 5, 7, 11, 17]
690
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
691
+
692
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
693
+ discs = discs + [
694
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
695
+ ]
696
+ self.discriminators = nn.ModuleList(discs)
697
+
698
+ def forward(self, y, y_hat):
699
+ y_d_rs = [] #
700
+ y_d_gs = []
701
+ fmap_rs = []
702
+ fmap_gs = []
703
+ for i, d in enumerate(self.discriminators):
704
+ y_d_r, fmap_r = d(y)
705
+ y_d_g, fmap_g = d(y_hat)
706
+ # for j in range(len(fmap_r)):
707
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
708
+ y_d_rs.append(y_d_r)
709
+ y_d_gs.append(y_d_g)
710
+ fmap_rs.append(fmap_r)
711
+ fmap_gs.append(fmap_g)
712
+
713
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
714
+
715
+
716
+ class DiscriminatorS(torch.nn.Module):
717
+ def __init__(self, use_spectral_norm=False):
718
+ super(DiscriminatorS, self).__init__()
719
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
720
+ self.convs = nn.ModuleList(
721
+ [
722
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
723
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
724
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
725
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
726
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
727
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
728
+ ]
729
+ )
730
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
731
+
732
+ def forward(self, x):
733
+ fmap = []
734
+
735
+ for l in self.convs:
736
+ x = l(x)
737
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
738
+ fmap.append(x)
739
+ x = self.conv_post(x)
740
+ fmap.append(x)
741
+ x = torch.flatten(x, 1, -1)
742
+
743
+ return x, fmap
744
+
745
+
746
+ class DiscriminatorP(torch.nn.Module):
747
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
748
+ super(DiscriminatorP, self).__init__()
749
+ self.period = period
750
+ self.use_spectral_norm = use_spectral_norm
751
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
752
+ self.convs = nn.ModuleList(
753
+ [
754
+ norm_f(
755
+ Conv2d(
756
+ 1,
757
+ 32,
758
+ (kernel_size, 1),
759
+ (stride, 1),
760
+ padding=(get_padding(kernel_size, 1), 0),
761
+ )
762
+ ),
763
+ norm_f(
764
+ Conv2d(
765
+ 32,
766
+ 128,
767
+ (kernel_size, 1),
768
+ (stride, 1),
769
+ padding=(get_padding(kernel_size, 1), 0),
770
+ )
771
+ ),
772
+ norm_f(
773
+ Conv2d(
774
+ 128,
775
+ 512,
776
+ (kernel_size, 1),
777
+ (stride, 1),
778
+ padding=(get_padding(kernel_size, 1), 0),
779
+ )
780
+ ),
781
+ norm_f(
782
+ Conv2d(
783
+ 512,
784
+ 1024,
785
+ (kernel_size, 1),
786
+ (stride, 1),
787
+ padding=(get_padding(kernel_size, 1), 0),
788
+ )
789
+ ),
790
+ norm_f(
791
+ Conv2d(
792
+ 1024,
793
+ 1024,
794
+ (kernel_size, 1),
795
+ 1,
796
+ padding=(get_padding(kernel_size, 1), 0),
797
+ )
798
+ ),
799
+ ]
800
+ )
801
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
802
+
803
+ def forward(self, x):
804
+ fmap = []
805
+
806
+ # 1d to 2d
807
+ b, c, t = x.shape
808
+ if t % self.period != 0: # pad first
809
+ n_pad = self.period - (t % self.period)
810
+ x = F.pad(x, (0, n_pad), "reflect")
811
+ t = t + n_pad
812
+ x = x.view(b, c, t // self.period, self.period)
813
+
814
+ for l in self.convs:
815
+ x = l(x)
816
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
817
+ fmap.append(x)
818
+ x = self.conv_post(x)
819
+ fmap.append(x)
820
+ x = torch.flatten(x, 1, -1)
821
+
822
+ return x, fmap
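Illustrative sketch (not part of the uploaded file) of the 1-D-to-2-D reshape that `DiscriminatorP.forward` above performs, so that its `Conv2d` layers compare samples that are exactly `period` steps apart.

```python
import torch
import torch.nn.functional as F

period = 3
x = torch.arange(10.0).view(1, 1, 10)      # (batch, channels, time)

# Right-pad with reflection so the length becomes a multiple of the period...
n_pad = period - (x.shape[-1] % period)    # 10 % 3 = 1, so pad 2 samples
x = F.pad(x, (0, n_pad), "reflect")

# ...then fold time into a (time // period, period) grid.
x = x.view(1, 1, -1, period)
print(x.shape)                             # torch.Size([1, 1, 4, 3])
```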
src/vc/infer_pack/models_onnx_moess.py ADDED
@@ -0,0 +1,853 @@
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from vc.infer_pack import modules
7
+ from vc.infer_pack import attentions
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vc.infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from vc.infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder256Sim(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(256, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ x = self.proj(x) * x_mask
106
+ return x, x_mask
107
+
108
+
109
+ class ResidualCouplingBlock(nn.Module):
110
+ def __init__(
111
+ self,
112
+ channels,
113
+ hidden_channels,
114
+ kernel_size,
115
+ dilation_rate,
116
+ n_layers,
117
+ n_flows=4,
118
+ gin_channels=0,
119
+ ):
120
+ super().__init__()
121
+ self.channels = channels
122
+ self.hidden_channels = hidden_channels
123
+ self.kernel_size = kernel_size
124
+ self.dilation_rate = dilation_rate
125
+ self.n_layers = n_layers
126
+ self.n_flows = n_flows
127
+ self.gin_channels = gin_channels
128
+
129
+ self.flows = nn.ModuleList()
130
+ for i in range(n_flows):
131
+ self.flows.append(
132
+ modules.ResidualCouplingLayer(
133
+ channels,
134
+ hidden_channels,
135
+ kernel_size,
136
+ dilation_rate,
137
+ n_layers,
138
+ gin_channels=gin_channels,
139
+ mean_only=True,
140
+ )
141
+ )
142
+ self.flows.append(modules.Flip())
143
+
144
+ def forward(self, x, x_mask, g=None, reverse=False):
145
+ if not reverse:
146
+ for flow in self.flows:
147
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
148
+ else:
149
+ for flow in reversed(self.flows):
150
+ x = flow(x, x_mask, g=g, reverse=reverse)
151
+ return x
152
+
153
+ def remove_weight_norm(self):
154
+ for i in range(self.n_flows):
155
+ self.flows[i * 2].remove_weight_norm()
156
+
157
+
158
+ class PosteriorEncoder(nn.Module):
159
+ def __init__(
160
+ self,
161
+ in_channels,
162
+ out_channels,
163
+ hidden_channels,
164
+ kernel_size,
165
+ dilation_rate,
166
+ n_layers,
167
+ gin_channels=0,
168
+ ):
169
+ super().__init__()
170
+ self.in_channels = in_channels
171
+ self.out_channels = out_channels
172
+ self.hidden_channels = hidden_channels
173
+ self.kernel_size = kernel_size
174
+ self.dilation_rate = dilation_rate
175
+ self.n_layers = n_layers
176
+ self.gin_channels = gin_channels
177
+
178
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
179
+ self.enc = modules.WN(
180
+ hidden_channels,
181
+ kernel_size,
182
+ dilation_rate,
183
+ n_layers,
184
+ gin_channels=gin_channels,
185
+ )
186
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
187
+
188
+ def forward(self, x, x_lengths, g=None):
189
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
190
+ x.dtype
191
+ )
192
+ x = self.pre(x) * x_mask
193
+ x = self.enc(x, x_mask, g=g)
194
+ stats = self.proj(x) * x_mask
195
+ m, logs = torch.split(stats, self.out_channels, dim=1)
196
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
197
+ return z, m, logs, x_mask
198
+
199
+ def remove_weight_norm(self):
200
+ self.enc.remove_weight_norm()
201
+
202
+
203
+ class Generator(torch.nn.Module):
204
+ def __init__(
205
+ self,
206
+ initial_channel,
207
+ resblock,
208
+ resblock_kernel_sizes,
209
+ resblock_dilation_sizes,
210
+ upsample_rates,
211
+ upsample_initial_channel,
212
+ upsample_kernel_sizes,
213
+ gin_channels=0,
214
+ ):
215
+ super(Generator, self).__init__()
216
+ self.num_kernels = len(resblock_kernel_sizes)
217
+ self.num_upsamples = len(upsample_rates)
218
+ self.conv_pre = Conv1d(
219
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
220
+ )
221
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
222
+
223
+ self.ups = nn.ModuleList()
224
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
225
+ self.ups.append(
226
+ weight_norm(
227
+ ConvTranspose1d(
228
+ upsample_initial_channel // (2**i),
229
+ upsample_initial_channel // (2 ** (i + 1)),
230
+ k,
231
+ u,
232
+ padding=(k - u) // 2,
233
+ )
234
+ )
235
+ )
236
+
237
+ self.resblocks = nn.ModuleList()
238
+ for i in range(len(self.ups)):
239
+ ch = upsample_initial_channel // (2 ** (i + 1))
240
+ for j, (k, d) in enumerate(
241
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
242
+ ):
243
+ self.resblocks.append(resblock(ch, k, d))
244
+
245
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
246
+ self.ups.apply(init_weights)
247
+
248
+ if gin_channels != 0:
249
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
250
+
251
+ def forward(self, x, g=None):
252
+ x = self.conv_pre(x)
253
+ if g is not None:
254
+ x = x + self.cond(g)
255
+
256
+ for i in range(self.num_upsamples):
257
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
258
+ x = self.ups[i](x)
259
+ xs = None
260
+ for j in range(self.num_kernels):
261
+ if xs is None:
262
+ xs = self.resblocks[i * self.num_kernels + j](x)
263
+ else:
264
+ xs += self.resblocks[i * self.num_kernels + j](x)
265
+ x = xs / self.num_kernels
266
+ x = F.leaky_relu(x)
267
+ x = self.conv_post(x)
268
+ x = torch.tanh(x)
269
+
270
+ return x
271
+
272
+ def remove_weight_norm(self):
273
+ for l in self.ups:
274
+ remove_weight_norm(l)
275
+ for l in self.resblocks:
276
+ l.remove_weight_norm()
277
+
278
+
279
+ class SineGen(torch.nn.Module):
280
+ """Definition of sine generator
281
+ SineGen(samp_rate, harmonic_num = 0,
282
+ sine_amp = 0.1, noise_std = 0.003,
283
+ voiced_threshold = 0,
284
+ flag_for_pulse=False)
285
+ samp_rate: sampling rate in Hz
286
+ harmonic_num: number of harmonic overtones (default 0)
287
+ sine_amp: amplitude of sine-waveform (default 0.1)
288
+ noise_std: std of Gaussian noise (default 0.003)
289
+ voiced_threshold: F0 threshold for U/V classification (default 0)
290
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
291
+ Note: when flag_for_pulse is True, the first time step of a voiced
292
+ segment is always sin(np.pi) or cos(0)
293
+ """
294
+
295
+ def __init__(
296
+ self,
297
+ samp_rate,
298
+ harmonic_num=0,
299
+ sine_amp=0.1,
300
+ noise_std=0.003,
301
+ voiced_threshold=0,
302
+ flag_for_pulse=False,
303
+ ):
304
+ super(SineGen, self).__init__()
305
+ self.sine_amp = sine_amp
306
+ self.noise_std = noise_std
307
+ self.harmonic_num = harmonic_num
308
+ self.dim = self.harmonic_num + 1
309
+ self.sampling_rate = samp_rate
310
+ self.voiced_threshold = voiced_threshold
311
+
312
+ def _f02uv(self, f0):
313
+ # generate uv signal
314
+ uv = torch.ones_like(f0)
315
+ uv = uv * (f0 > self.voiced_threshold)
316
+ return uv
317
+
318
+ def forward(self, f0, upp):
319
+ """sine_tensor, uv = forward(f0)
320
+ input F0: tensor(batchsize=1, length, dim=1)
321
+ f0 for unvoiced steps should be 0
322
+ output sine_tensor: tensor(batchsize=1, length, dim)
323
+ output uv: tensor(batchsize=1, length, 1)
324
+ """
325
+ with torch.no_grad():
326
+ f0 = f0[:, None].transpose(1, 2)
327
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
328
+ # fundamental component
329
+ f0_buf[:, :, 0] = f0[:, :, 0]
330
+ for idx in np.arange(self.harmonic_num):
331
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
332
+ idx + 2
333
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
334
+ rad_values = (
335
+ f0_buf / self.sampling_rate
336
+ ) % 1 # the %1 means the n_har (harmonic) products cannot be optimized away in a post-processing step
337
+ rand_ini = torch.rand(
338
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
+ )
340
+ rand_ini[:, 0] = 0
341
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
+ tmp_over_one = torch.cumsum(
343
+ rad_values, 1
344
+ ) # % 1 # taking %1 here would mean the cumsum below could no longer be optimized
345
+ tmp_over_one *= upp
346
+ tmp_over_one = F.interpolate(
347
+ tmp_over_one.transpose(2, 1),
348
+ scale_factor=upp,
349
+ mode="linear",
350
+ align_corners=True,
351
+ ).transpose(2, 1)
352
+ rad_values = F.interpolate(
353
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
354
+ ).transpose(
355
+ 2, 1
356
+ ) #######
357
+ tmp_over_one %= 1
358
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
359
+ cumsum_shift = torch.zeros_like(rad_values)
360
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
361
+ sine_waves = torch.sin(
362
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
363
+ )
364
+ sine_waves = sine_waves * self.sine_amp
365
+ uv = self._f02uv(f0)
366
+ uv = F.interpolate(
367
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
368
+ ).transpose(2, 1)
369
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
370
+ noise = noise_amp * torch.randn_like(sine_waves)
371
+ sine_waves = sine_waves * uv + noise
372
+ return sine_waves, uv, noise
373
+
374
+
375
+ class SourceModuleHnNSF(torch.nn.Module):
376
+ """SourceModule for hn-nsf
377
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
378
+ add_noise_std=0.003, voiced_threshod=0)
379
+ sampling_rate: sampling_rate in Hz
380
+ harmonic_num: number of harmonic above F0 (default: 0)
381
+ sine_amp: amplitude of sine source signal (default: 0.1)
382
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
383
+ note that amplitude of noise in unvoiced is decided
384
+ by sine_amp
385
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
386
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
387
+ F0_sampled (batchsize, length, 1)
388
+ Sine_source (batchsize, length, 1)
389
+ noise_source (batchsize, length, 1)
390
+ uv (batchsize, length, 1)
391
+ """
392
+
393
+ def __init__(
394
+ self,
395
+ sampling_rate,
396
+ harmonic_num=0,
397
+ sine_amp=0.1,
398
+ add_noise_std=0.003,
399
+ voiced_threshod=0,
400
+ is_half=True,
401
+ ):
402
+ super(SourceModuleHnNSF, self).__init__()
403
+
404
+ self.sine_amp = sine_amp
405
+ self.noise_std = add_noise_std
406
+ self.is_half = is_half
407
+ # to produce sine waveforms
408
+ self.l_sin_gen = SineGen(
409
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
410
+ )
411
+
412
+ # to merge source harmonics into a single excitation
413
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
414
+ self.l_tanh = torch.nn.Tanh()
415
+
416
+ def forward(self, x, upp=None):
417
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
418
+ if self.is_half:
419
+ sine_wavs = sine_wavs.half()
420
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
421
+ return sine_merge, None, None # noise, uv
422
+
423
+
424
+ class GeneratorNSF(torch.nn.Module):
425
+ def __init__(
426
+ self,
427
+ initial_channel,
428
+ resblock,
429
+ resblock_kernel_sizes,
430
+ resblock_dilation_sizes,
431
+ upsample_rates,
432
+ upsample_initial_channel,
433
+ upsample_kernel_sizes,
434
+ gin_channels,
435
+ sr,
436
+ is_half=False,
437
+ ):
438
+ super(GeneratorNSF, self).__init__()
439
+ self.num_kernels = len(resblock_kernel_sizes)
440
+ self.num_upsamples = len(upsample_rates)
441
+
442
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
443
+ self.m_source = SourceModuleHnNSF(
444
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
445
+ )
446
+ self.noise_convs = nn.ModuleList()
447
+ self.conv_pre = Conv1d(
448
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
449
+ )
450
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
451
+
452
+ self.ups = nn.ModuleList()
453
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
454
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
455
+ self.ups.append(
456
+ weight_norm(
457
+ ConvTranspose1d(
458
+ upsample_initial_channel // (2**i),
459
+ upsample_initial_channel // (2 ** (i + 1)),
460
+ k,
461
+ u,
462
+ padding=(k - u) // 2,
463
+ )
464
+ )
465
+ )
466
+ if i + 1 < len(upsample_rates):
467
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
468
+ self.noise_convs.append(
469
+ Conv1d(
470
+ 1,
471
+ c_cur,
472
+ kernel_size=stride_f0 * 2,
473
+ stride=stride_f0,
474
+ padding=stride_f0 // 2,
475
+ )
476
+ )
477
+ else:
478
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
479
+
480
+ self.resblocks = nn.ModuleList()
481
+ for i in range(len(self.ups)):
482
+ ch = upsample_initial_channel // (2 ** (i + 1))
483
+ for j, (k, d) in enumerate(
484
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
485
+ ):
486
+ self.resblocks.append(resblock(ch, k, d))
487
+
488
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
489
+ self.ups.apply(init_weights)
490
+
491
+ if gin_channels != 0:
492
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
493
+
494
+ self.upp = np.prod(upsample_rates)
495
+
496
+ def forward(self, x, f0, g=None):
497
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
498
+ har_source = har_source.transpose(1, 2)
499
+ x = self.conv_pre(x)
500
+ if g is not None:
501
+ x = x + self.cond(g)
502
+
503
+ for i in range(self.num_upsamples):
504
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
505
+ x = self.ups[i](x)
506
+ x_source = self.noise_convs[i](har_source)
507
+ x = x + x_source
508
+ xs = None
509
+ for j in range(self.num_kernels):
510
+ if xs is None:
511
+ xs = self.resblocks[i * self.num_kernels + j](x)
512
+ else:
513
+ xs += self.resblocks[i * self.num_kernels + j](x)
514
+ x = xs / self.num_kernels
515
+ x = F.leaky_relu(x)
516
+ x = self.conv_post(x)
517
+ x = torch.tanh(x)
518
+ return x
519
+
520
+ def remove_weight_norm(self):
521
+ for l in self.ups:
522
+ remove_weight_norm(l)
523
+ for l in self.resblocks:
524
+ l.remove_weight_norm()
525
+
526
+
527
+ sr2sr = {
528
+ "32k": 32000,
529
+ "40k": 40000,
530
+ "48k": 48000,
531
+ }
532
+
533
+
534
+ class SynthesizerTrnMs256NSFsidM(nn.Module):
535
+ def __init__(
536
+ self,
537
+ spec_channels,
538
+ segment_size,
539
+ inter_channels,
540
+ hidden_channels,
541
+ filter_channels,
542
+ n_heads,
543
+ n_layers,
544
+ kernel_size,
545
+ p_dropout,
546
+ resblock,
547
+ resblock_kernel_sizes,
548
+ resblock_dilation_sizes,
549
+ upsample_rates,
550
+ upsample_initial_channel,
551
+ upsample_kernel_sizes,
552
+ spk_embed_dim,
553
+ gin_channels,
554
+ sr,
555
+ **kwargs
556
+ ):
557
+ super().__init__()
558
+ if type(sr) == type("strr"):
559
+ sr = sr2sr[sr]
560
+ self.spec_channels = spec_channels
561
+ self.inter_channels = inter_channels
562
+ self.hidden_channels = hidden_channels
563
+ self.filter_channels = filter_channels
564
+ self.n_heads = n_heads
565
+ self.n_layers = n_layers
566
+ self.kernel_size = kernel_size
567
+ self.p_dropout = p_dropout
568
+ self.resblock = resblock
569
+ self.resblock_kernel_sizes = resblock_kernel_sizes
570
+ self.resblock_dilation_sizes = resblock_dilation_sizes
571
+ self.upsample_rates = upsample_rates
572
+ self.upsample_initial_channel = upsample_initial_channel
573
+ self.upsample_kernel_sizes = upsample_kernel_sizes
574
+ self.segment_size = segment_size
575
+ self.gin_channels = gin_channels
576
+ # self.hop_length = hop_length#
577
+ self.spk_embed_dim = spk_embed_dim
578
+ self.enc_p = TextEncoder256(
579
+ inter_channels,
580
+ hidden_channels,
581
+ filter_channels,
582
+ n_heads,
583
+ n_layers,
584
+ kernel_size,
585
+ p_dropout,
586
+ )
587
+ self.dec = GeneratorNSF(
588
+ inter_channels,
589
+ resblock,
590
+ resblock_kernel_sizes,
591
+ resblock_dilation_sizes,
592
+ upsample_rates,
593
+ upsample_initial_channel,
594
+ upsample_kernel_sizes,
595
+ gin_channels=gin_channels,
596
+ sr=sr,
597
+ is_half=kwargs["is_half"],
598
+ )
599
+ self.enc_q = PosteriorEncoder(
600
+ spec_channels,
601
+ inter_channels,
602
+ hidden_channels,
603
+ 5,
604
+ 1,
605
+ 16,
606
+ gin_channels=gin_channels,
607
+ )
608
+ self.flow = ResidualCouplingBlock(
609
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
610
+ )
611
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
612
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
613
+
614
+ def remove_weight_norm(self):
615
+ self.dec.remove_weight_norm()
616
+ self.flow.remove_weight_norm()
617
+ self.enc_q.remove_weight_norm()
618
+
619
+ def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
620
+ g = self.emb_g(sid).unsqueeze(-1)
621
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
622
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
623
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
624
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
625
+ return o
626
+
627
+
628
+ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
629
+ """
630
+ Synthesizer for Training
631
+ """
632
+
633
+ def __init__(
634
+ self,
635
+ spec_channels,
636
+ segment_size,
637
+ inter_channels,
638
+ hidden_channels,
639
+ filter_channels,
640
+ n_heads,
641
+ n_layers,
642
+ kernel_size,
643
+ p_dropout,
644
+ resblock,
645
+ resblock_kernel_sizes,
646
+ resblock_dilation_sizes,
647
+ upsample_rates,
648
+ upsample_initial_channel,
649
+ upsample_kernel_sizes,
650
+ spk_embed_dim,
651
+ # hop_length,
652
+ gin_channels=0,
653
+ use_sdp=True,
654
+ **kwargs
655
+ ):
656
+ super().__init__()
657
+ self.spec_channels = spec_channels
658
+ self.inter_channels = inter_channels
659
+ self.hidden_channels = hidden_channels
660
+ self.filter_channels = filter_channels
661
+ self.n_heads = n_heads
662
+ self.n_layers = n_layers
663
+ self.kernel_size = kernel_size
664
+ self.p_dropout = p_dropout
665
+ self.resblock = resblock
666
+ self.resblock_kernel_sizes = resblock_kernel_sizes
667
+ self.resblock_dilation_sizes = resblock_dilation_sizes
668
+ self.upsample_rates = upsample_rates
669
+ self.upsample_initial_channel = upsample_initial_channel
670
+ self.upsample_kernel_sizes = upsample_kernel_sizes
671
+ self.segment_size = segment_size
672
+ self.gin_channels = gin_channels
673
+ # self.hop_length = hop_length#
674
+ self.spk_embed_dim = spk_embed_dim
675
+ self.enc_p = TextEncoder256Sim(
676
+ inter_channels,
677
+ hidden_channels,
678
+ filter_channels,
679
+ n_heads,
680
+ n_layers,
681
+ kernel_size,
682
+ p_dropout,
683
+ )
684
+ self.dec = GeneratorNSF(
685
+ inter_channels,
686
+ resblock,
687
+ resblock_kernel_sizes,
688
+ resblock_dilation_sizes,
689
+ upsample_rates,
690
+ upsample_initial_channel,
691
+ upsample_kernel_sizes,
692
+ gin_channels=gin_channels,
693
+ is_half=kwargs["is_half"],
694
+ )
695
+
696
+ self.flow = ResidualCouplingBlock(
697
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
698
+ )
699
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
700
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
701
+
702
+ def remove_weight_norm(self):
703
+ self.dec.remove_weight_norm()
704
+ self.flow.remove_weight_norm()
705
+ self.enc_q.remove_weight_norm()
706
+
707
+ def forward(
708
+ self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
709
+ ): # y (the spectrogram) is no longer needed here
710
+ g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1] # the trailing 1 is t (time), broadcast
711
+ x, x_mask = self.enc_p(phone, pitch, phone_lengths)
712
+ x = self.flow(x, x_mask, g=g, reverse=True)
713
+ o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
714
+ return o
715
+
716
+
717
+ class MultiPeriodDiscriminator(torch.nn.Module):
718
+ def __init__(self, use_spectral_norm=False):
719
+ super(MultiPeriodDiscriminator, self).__init__()
720
+ periods = [2, 3, 5, 7, 11, 17]
721
+ # periods = [3, 5, 7, 11, 17, 23, 37]
722
+
723
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
724
+ discs = discs + [
725
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
726
+ ]
727
+ self.discriminators = nn.ModuleList(discs)
728
+
729
+ def forward(self, y, y_hat):
730
+ y_d_rs = [] #
731
+ y_d_gs = []
732
+ fmap_rs = []
733
+ fmap_gs = []
734
+ for i, d in enumerate(self.discriminators):
735
+ y_d_r, fmap_r = d(y)
736
+ y_d_g, fmap_g = d(y_hat)
737
+ # for j in range(len(fmap_r)):
738
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
739
+ y_d_rs.append(y_d_r)
740
+ y_d_gs.append(y_d_g)
741
+ fmap_rs.append(fmap_r)
742
+ fmap_gs.append(fmap_g)
743
+
744
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
745
+
746
+
747
+ class DiscriminatorS(torch.nn.Module):
748
+ def __init__(self, use_spectral_norm=False):
749
+ super(DiscriminatorS, self).__init__()
750
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
751
+ self.convs = nn.ModuleList(
752
+ [
753
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
754
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
755
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
756
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
757
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
758
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
759
+ ]
760
+ )
761
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
762
+
763
+ def forward(self, x):
764
+ fmap = []
765
+
766
+ for l in self.convs:
767
+ x = l(x)
768
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
769
+ fmap.append(x)
770
+ x = self.conv_post(x)
771
+ fmap.append(x)
772
+ x = torch.flatten(x, 1, -1)
773
+
774
+ return x, fmap
775
+
776
+
777
+ class DiscriminatorP(torch.nn.Module):
778
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
779
+ super(DiscriminatorP, self).__init__()
780
+ self.period = period
781
+ self.use_spectral_norm = use_spectral_norm
782
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
783
+ self.convs = nn.ModuleList(
784
+ [
785
+ norm_f(
786
+ Conv2d(
787
+ 1,
788
+ 32,
789
+ (kernel_size, 1),
790
+ (stride, 1),
791
+ padding=(get_padding(kernel_size, 1), 0),
792
+ )
793
+ ),
794
+ norm_f(
795
+ Conv2d(
796
+ 32,
797
+ 128,
798
+ (kernel_size, 1),
799
+ (stride, 1),
800
+ padding=(get_padding(kernel_size, 1), 0),
801
+ )
802
+ ),
803
+ norm_f(
804
+ Conv2d(
805
+ 128,
806
+ 512,
807
+ (kernel_size, 1),
808
+ (stride, 1),
809
+ padding=(get_padding(kernel_size, 1), 0),
810
+ )
811
+ ),
812
+ norm_f(
813
+ Conv2d(
814
+ 512,
815
+ 1024,
816
+ (kernel_size, 1),
817
+ (stride, 1),
818
+ padding=(get_padding(kernel_size, 1), 0),
819
+ )
820
+ ),
821
+ norm_f(
822
+ Conv2d(
823
+ 1024,
824
+ 1024,
825
+ (kernel_size, 1),
826
+ 1,
827
+ padding=(get_padding(kernel_size, 1), 0),
828
+ )
829
+ ),
830
+ ]
831
+ )
832
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
833
+
834
+ def forward(self, x):
835
+ fmap = []
836
+
837
+ # 1d to 2d
838
+ b, c, t = x.shape
839
+ if t % self.period != 0: # pad first
840
+ n_pad = self.period - (t % self.period)
841
+ x = F.pad(x, (0, n_pad), "reflect")
842
+ t = t + n_pad
843
+ x = x.view(b, c, t // self.period, self.period)
844
+
845
+ for l in self.convs:
846
+ x = l(x)
847
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
848
+ fmap.append(x)
849
+ x = self.conv_post(x)
850
+ fmap.append(x)
851
+ x = torch.flatten(x, 1, -1)
852
+
853
+ return x, fmap
src/vc/infer_pack/modules.py ADDED
@@ -0,0 +1,522 @@
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ from vc.infer_pack import commons
13
+ from vc.infer_pack.commons import init_weights, get_padding
14
+ from vc.infer_pack.transforms import piecewise_rational_quadratic_transform
15
+
16
+
17
+ LRELU_SLOPE = 0.1
18
+
19
+
20
+ class LayerNorm(nn.Module):
21
+ def __init__(self, channels, eps=1e-5):
22
+ super().__init__()
23
+ self.channels = channels
24
+ self.eps = eps
25
+
26
+ self.gamma = nn.Parameter(torch.ones(channels))
27
+ self.beta = nn.Parameter(torch.zeros(channels))
28
+
29
+ def forward(self, x):
30
+ x = x.transpose(1, -1)
31
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
+ return x.transpose(1, -1)
33
+
34
+
35
+ class ConvReluNorm(nn.Module):
36
+ def __init__(
37
+ self,
38
+ in_channels,
39
+ hidden_channels,
40
+ out_channels,
41
+ kernel_size,
42
+ n_layers,
43
+ p_dropout,
44
+ ):
45
+ super().__init__()
46
+ self.in_channels = in_channels
47
+ self.hidden_channels = hidden_channels
48
+ self.out_channels = out_channels
49
+ self.kernel_size = kernel_size
50
+ self.n_layers = n_layers
51
+ self.p_dropout = p_dropout
52
+ assert n_layers > 1, "Number of layers should be larger than 0."
53
+
54
+ self.conv_layers = nn.ModuleList()
55
+ self.norm_layers = nn.ModuleList()
56
+ self.conv_layers.append(
57
+ nn.Conv1d(
58
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59
+ )
60
+ )
61
+ self.norm_layers.append(LayerNorm(hidden_channels))
62
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
63
+ for _ in range(n_layers - 1):
64
+ self.conv_layers.append(
65
+ nn.Conv1d(
66
+ hidden_channels,
67
+ hidden_channels,
68
+ kernel_size,
69
+ padding=kernel_size // 2,
70
+ )
71
+ )
72
+ self.norm_layers.append(LayerNorm(hidden_channels))
73
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74
+ self.proj.weight.data.zero_()
75
+ self.proj.bias.data.zero_()
76
+
77
+ def forward(self, x, x_mask):
78
+ x_org = x
79
+ for i in range(self.n_layers):
80
+ x = self.conv_layers[i](x * x_mask)
81
+ x = self.norm_layers[i](x)
82
+ x = self.relu_drop(x)
83
+ x = x_org + self.proj(x)
84
+ return x * x_mask
85
+
86
+
87
+ class DDSConv(nn.Module):
88
+ """
89
+ Dialted and Depth-Separable Convolution
90
+ """
91
+
92
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93
+ super().__init__()
94
+ self.channels = channels
95
+ self.kernel_size = kernel_size
96
+ self.n_layers = n_layers
97
+ self.p_dropout = p_dropout
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.convs_sep = nn.ModuleList()
101
+ self.convs_1x1 = nn.ModuleList()
102
+ self.norms_1 = nn.ModuleList()
103
+ self.norms_2 = nn.ModuleList()
104
+ for i in range(n_layers):
105
+ dilation = kernel_size**i
106
+ padding = (kernel_size * dilation - dilation) // 2
107
+ self.convs_sep.append(
108
+ nn.Conv1d(
109
+ channels,
110
+ channels,
111
+ kernel_size,
112
+ groups=channels,
113
+ dilation=dilation,
114
+ padding=padding,
115
+ )
116
+ )
117
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118
+ self.norms_1.append(LayerNorm(channels))
119
+ self.norms_2.append(LayerNorm(channels))
120
+
121
+ def forward(self, x, x_mask, g=None):
122
+ if g is not None:
123
+ x = x + g
124
+ for i in range(self.n_layers):
125
+ y = self.convs_sep[i](x * x_mask)
126
+ y = self.norms_1[i](y)
127
+ y = F.gelu(y)
128
+ y = self.convs_1x1[i](y)
129
+ y = self.norms_2[i](y)
130
+ y = F.gelu(y)
131
+ y = self.drop(y)
132
+ x = x + y
133
+ return x * x_mask
134
+
135
+
136
+ class WN(torch.nn.Module):
137
+ def __init__(
138
+ self,
139
+ hidden_channels,
140
+ kernel_size,
141
+ dilation_rate,
142
+ n_layers,
143
+ gin_channels=0,
144
+ p_dropout=0,
145
+ ):
146
+ super(WN, self).__init__()
147
+ assert kernel_size % 2 == 1
148
+ self.hidden_channels = hidden_channels
149
+ self.kernel_size = (kernel_size,)
150
+ self.dilation_rate = dilation_rate
151
+ self.n_layers = n_layers
152
+ self.gin_channels = gin_channels
153
+ self.p_dropout = p_dropout
154
+
155
+ self.in_layers = torch.nn.ModuleList()
156
+ self.res_skip_layers = torch.nn.ModuleList()
157
+ self.drop = nn.Dropout(p_dropout)
158
+
159
+ if gin_channels != 0:
160
+ cond_layer = torch.nn.Conv1d(
161
+ gin_channels, 2 * hidden_channels * n_layers, 1
162
+ )
163
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164
+
165
+ for i in range(n_layers):
166
+ dilation = dilation_rate**i
167
+ padding = int((kernel_size * dilation - dilation) / 2)
168
+ in_layer = torch.nn.Conv1d(
169
+ hidden_channels,
170
+ 2 * hidden_channels,
171
+ kernel_size,
172
+ dilation=dilation,
173
+ padding=padding,
174
+ )
175
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176
+ self.in_layers.append(in_layer)
177
+
178
+ # last one is not necessary
179
+ if i < n_layers - 1:
180
+ res_skip_channels = 2 * hidden_channels
181
+ else:
182
+ res_skip_channels = hidden_channels
183
+
184
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186
+ self.res_skip_layers.append(res_skip_layer)
187
+
188
+ def forward(self, x, x_mask, g=None, **kwargs):
189
+ output = torch.zeros_like(x)
190
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
191
+
192
+ if g is not None:
193
+ g = self.cond_layer(g)
194
+
195
+ for i in range(self.n_layers):
196
+ x_in = self.in_layers[i](x)
197
+ if g is not None:
198
+ cond_offset = i * 2 * self.hidden_channels
199
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
200
+ else:
201
+ g_l = torch.zeros_like(x_in)
202
+
203
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
204
+ acts = self.drop(acts)
205
+
206
+ res_skip_acts = self.res_skip_layers[i](acts)
207
+ if i < self.n_layers - 1:
208
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
209
+ x = (x + res_acts) * x_mask
210
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
211
+ else:
212
+ output = output + res_skip_acts
213
+ return output * x_mask
214
+
215
+ def remove_weight_norm(self):
216
+ if self.gin_channels != 0:
217
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
218
+ for l in self.in_layers:
219
+ torch.nn.utils.remove_weight_norm(l)
220
+ for l in self.res_skip_layers:
221
+ torch.nn.utils.remove_weight_norm(l)
222
+
223
+
224
+ class ResBlock1(torch.nn.Module):
225
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
226
+ super(ResBlock1, self).__init__()
227
+ self.convs1 = nn.ModuleList(
228
+ [
229
+ weight_norm(
230
+ Conv1d(
231
+ channels,
232
+ channels,
233
+ kernel_size,
234
+ 1,
235
+ dilation=dilation[0],
236
+ padding=get_padding(kernel_size, dilation[0]),
237
+ )
238
+ ),
239
+ weight_norm(
240
+ Conv1d(
241
+ channels,
242
+ channels,
243
+ kernel_size,
244
+ 1,
245
+ dilation=dilation[1],
246
+ padding=get_padding(kernel_size, dilation[1]),
247
+ )
248
+ ),
249
+ weight_norm(
250
+ Conv1d(
251
+ channels,
252
+ channels,
253
+ kernel_size,
254
+ 1,
255
+ dilation=dilation[2],
256
+ padding=get_padding(kernel_size, dilation[2]),
257
+ )
258
+ ),
259
+ ]
260
+ )
261
+ self.convs1.apply(init_weights)
262
+
263
+ self.convs2 = nn.ModuleList(
264
+ [
265
+ weight_norm(
266
+ Conv1d(
267
+ channels,
268
+ channels,
269
+ kernel_size,
270
+ 1,
271
+ dilation=1,
272
+ padding=get_padding(kernel_size, 1),
273
+ )
274
+ ),
275
+ weight_norm(
276
+ Conv1d(
277
+ channels,
278
+ channels,
279
+ kernel_size,
280
+ 1,
281
+ dilation=1,
282
+ padding=get_padding(kernel_size, 1),
283
+ )
284
+ ),
285
+ weight_norm(
286
+ Conv1d(
287
+ channels,
288
+ channels,
289
+ kernel_size,
290
+ 1,
291
+ dilation=1,
292
+ padding=get_padding(kernel_size, 1),
293
+ )
294
+ ),
295
+ ]
296
+ )
297
+ self.convs2.apply(init_weights)
298
+
299
+ def forward(self, x, x_mask=None):
300
+ for c1, c2 in zip(self.convs1, self.convs2):
301
+ xt = F.leaky_relu(x, LRELU_SLOPE)
302
+ if x_mask is not None:
303
+ xt = xt * x_mask
304
+ xt = c1(xt)
305
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
306
+ if x_mask is not None:
307
+ xt = xt * x_mask
308
+ xt = c2(xt)
309
+ x = xt + x
310
+ if x_mask is not None:
311
+ x = x * x_mask
312
+ return x
313
+
314
+ def remove_weight_norm(self):
315
+ for l in self.convs1:
316
+ remove_weight_norm(l)
317
+ for l in self.convs2:
318
+ remove_weight_norm(l)
319
+
320
+
321
+ class ResBlock2(torch.nn.Module):
322
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
323
+ super(ResBlock2, self).__init__()
324
+ self.convs = nn.ModuleList(
325
+ [
326
+ weight_norm(
327
+ Conv1d(
328
+ channels,
329
+ channels,
330
+ kernel_size,
331
+ 1,
332
+ dilation=dilation[0],
333
+ padding=get_padding(kernel_size, dilation[0]),
334
+ )
335
+ ),
336
+ weight_norm(
337
+ Conv1d(
338
+ channels,
339
+ channels,
340
+ kernel_size,
341
+ 1,
342
+ dilation=dilation[1],
343
+ padding=get_padding(kernel_size, dilation[1]),
344
+ )
345
+ ),
346
+ ]
347
+ )
348
+ self.convs.apply(init_weights)
349
+
350
+ def forward(self, x, x_mask=None):
351
+ for c in self.convs:
352
+ xt = F.leaky_relu(x, LRELU_SLOPE)
353
+ if x_mask is not None:
354
+ xt = xt * x_mask
355
+ xt = c(xt)
356
+ x = xt + x
357
+ if x_mask is not None:
358
+ x = x * x_mask
359
+ return x
360
+
361
+ def remove_weight_norm(self):
362
+ for l in self.convs:
363
+ remove_weight_norm(l)
364
+
365
+
366
+ class Log(nn.Module):
367
+ def forward(self, x, x_mask, reverse=False, **kwargs):
368
+ if not reverse:
369
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
370
+ logdet = torch.sum(-y, [1, 2])
371
+ return y, logdet
372
+ else:
373
+ x = torch.exp(x) * x_mask
374
+ return x
375
+
376
+
377
+ class Flip(nn.Module):
378
+ def forward(self, x, *args, reverse=False, **kwargs):
379
+ x = torch.flip(x, [1])
380
+ if not reverse:
381
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
382
+ return x, logdet
383
+ else:
384
+ return x
385
+
386
+
387
+ class ElementwiseAffine(nn.Module):
388
+ def __init__(self, channels):
389
+ super().__init__()
390
+ self.channels = channels
391
+ self.m = nn.Parameter(torch.zeros(channels, 1))
392
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
393
+
394
+ def forward(self, x, x_mask, reverse=False, **kwargs):
395
+ if not reverse:
396
+ y = self.m + torch.exp(self.logs) * x
397
+ y = y * x_mask
398
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
399
+ return y, logdet
400
+ else:
401
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
402
+ return x
403
+
404
+
405
+ class ResidualCouplingLayer(nn.Module):
406
+ def __init__(
407
+ self,
408
+ channels,
409
+ hidden_channels,
410
+ kernel_size,
411
+ dilation_rate,
412
+ n_layers,
413
+ p_dropout=0,
414
+ gin_channels=0,
415
+ mean_only=False,
416
+ ):
417
+ assert channels % 2 == 0, "channels should be divisible by 2"
418
+ super().__init__()
419
+ self.channels = channels
420
+ self.hidden_channels = hidden_channels
421
+ self.kernel_size = kernel_size
422
+ self.dilation_rate = dilation_rate
423
+ self.n_layers = n_layers
424
+ self.half_channels = channels // 2
425
+ self.mean_only = mean_only
426
+
427
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
428
+ self.enc = WN(
429
+ hidden_channels,
430
+ kernel_size,
431
+ dilation_rate,
432
+ n_layers,
433
+ p_dropout=p_dropout,
434
+ gin_channels=gin_channels,
435
+ )
436
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
437
+ self.post.weight.data.zero_()
438
+ self.post.bias.data.zero_()
439
+
440
+ def forward(self, x, x_mask, g=None, reverse=False):
441
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
442
+ h = self.pre(x0) * x_mask
443
+ h = self.enc(h, x_mask, g=g)
444
+ stats = self.post(h) * x_mask
445
+ if not self.mean_only:
446
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
447
+ else:
448
+ m = stats
449
+ logs = torch.zeros_like(m)
450
+
451
+ if not reverse:
452
+ x1 = m + x1 * torch.exp(logs) * x_mask
453
+ x = torch.cat([x0, x1], 1)
454
+ logdet = torch.sum(logs, [1, 2])
455
+ return x, logdet
456
+ else:
457
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
458
+ x = torch.cat([x0, x1], 1)
459
+ return x
460
+
461
+ def remove_weight_norm(self):
462
+ self.enc.remove_weight_norm()
463
+
464
+
465
+ class ConvFlow(nn.Module):
466
+ def __init__(
467
+ self,
468
+ in_channels,
469
+ filter_channels,
470
+ kernel_size,
471
+ n_layers,
472
+ num_bins=10,
473
+ tail_bound=5.0,
474
+ ):
475
+ super().__init__()
476
+ self.in_channels = in_channels
477
+ self.filter_channels = filter_channels
478
+ self.kernel_size = kernel_size
479
+ self.n_layers = n_layers
480
+ self.num_bins = num_bins
481
+ self.tail_bound = tail_bound
482
+ self.half_channels = in_channels // 2
483
+
484
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
485
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
486
+ self.proj = nn.Conv1d(
487
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
488
+ )
489
+ self.proj.weight.data.zero_()
490
+ self.proj.bias.data.zero_()
491
+
492
+ def forward(self, x, x_mask, g=None, reverse=False):
493
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
494
+ h = self.pre(x0)
495
+ h = self.convs(h, x_mask, g=g)
496
+ h = self.proj(h) * x_mask
497
+
498
+ b, c, t = x0.shape
499
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
500
+
501
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
502
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
503
+ self.filter_channels
504
+ )
505
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
506
+
507
+ x1, logabsdet = piecewise_rational_quadratic_transform(
508
+ x1,
509
+ unnormalized_widths,
510
+ unnormalized_heights,
511
+ unnormalized_derivatives,
512
+ inverse=reverse,
513
+ tails="linear",
514
+ tail_bound=self.tail_bound,
515
+ )
516
+
517
+ x = torch.cat([x0, x1], 1) * x_mask
518
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
519
+ if not reverse:
520
+ return x, logdet
521
+ else:
522
+ return x
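A small sketch (illustrative only, not part of the uploaded file) showing the invertibility of the affine coupling layer defined above. It assumes the repository's `src` directory is on the import path; the layer sizes and tensor shapes are made-up example values.

```python
import torch

from vc.infer_pack.modules import ResidualCouplingLayer  # module added by this commit

# The second half of the channels is affinely transformed, conditioned on the
# first half, so reverse=True undoes the forward pass wherever x_mask is 1.
layer = ResidualCouplingLayer(
    channels=4, hidden_channels=8, kernel_size=3, dilation_rate=1, n_layers=2
)
x = torch.randn(1, 4, 50)
x_mask = torch.ones(1, 1, 50)

y, _ = layer(x, x_mask)                     # forward: returns (output, log-determinant)
x_rec = layer(y, x_mask, reverse=True)      # inverse: recovers the input
print(torch.allclose(x, x_rec, atol=1e-5))  # True
```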
src/vc/infer_pack/transforms.py ADDED
@@ -0,0 +1,209 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+
7
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
8
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
+ DEFAULT_MIN_DERIVATIVE = 1e-3
10
+
11
+
12
+ def piecewise_rational_quadratic_transform(
13
+ inputs,
14
+ unnormalized_widths,
15
+ unnormalized_heights,
16
+ unnormalized_derivatives,
17
+ inverse=False,
18
+ tails=None,
19
+ tail_bound=1.0,
20
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
23
+ ):
24
+ if tails is None:
25
+ spline_fn = rational_quadratic_spline
26
+ spline_kwargs = {}
27
+ else:
28
+ spline_fn = unconstrained_rational_quadratic_spline
29
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30
+
31
+ outputs, logabsdet = spline_fn(
32
+ inputs=inputs,
33
+ unnormalized_widths=unnormalized_widths,
34
+ unnormalized_heights=unnormalized_heights,
35
+ unnormalized_derivatives=unnormalized_derivatives,
36
+ inverse=inverse,
37
+ min_bin_width=min_bin_width,
38
+ min_bin_height=min_bin_height,
39
+ min_derivative=min_derivative,
40
+ **spline_kwargs
41
+ )
42
+ return outputs, logabsdet
43
+
44
+
45
+ def searchsorted(bin_locations, inputs, eps=1e-6):
46
+ bin_locations[..., -1] += eps
47
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
+
49
+
50
+ def unconstrained_rational_quadratic_spline(
51
+ inputs,
52
+ unnormalized_widths,
53
+ unnormalized_heights,
54
+ unnormalized_derivatives,
55
+ inverse=False,
56
+ tails="linear",
57
+ tail_bound=1.0,
58
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
61
+ ):
62
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63
+ outside_interval_mask = ~inside_interval_mask
64
+
65
+ outputs = torch.zeros_like(inputs)
66
+ logabsdet = torch.zeros_like(inputs)
67
+
68
+ if tails == "linear":
69
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70
+ constant = np.log(np.exp(1 - min_derivative) - 1)
71
+ unnormalized_derivatives[..., 0] = constant
72
+ unnormalized_derivatives[..., -1] = constant
73
+
74
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
75
+ logabsdet[outside_interval_mask] = 0
76
+ else:
77
+ raise RuntimeError("{} tails are not implemented.".format(tails))
78
+
79
+ (
80
+ outputs[inside_interval_mask],
81
+ logabsdet[inside_interval_mask],
82
+ ) = rational_quadratic_spline(
83
+ inputs=inputs[inside_interval_mask],
84
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87
+ inverse=inverse,
88
+ left=-tail_bound,
89
+ right=tail_bound,
90
+ bottom=-tail_bound,
91
+ top=tail_bound,
92
+ min_bin_width=min_bin_width,
93
+ min_bin_height=min_bin_height,
94
+ min_derivative=min_derivative,
95
+ )
96
+
97
+ return outputs, logabsdet
98
+
99
+
100
+ def rational_quadratic_spline(
101
+ inputs,
102
+ unnormalized_widths,
103
+ unnormalized_heights,
104
+ unnormalized_derivatives,
105
+ inverse=False,
106
+ left=0.0,
107
+ right=1.0,
108
+ bottom=0.0,
109
+ top=1.0,
110
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
113
+ ):
114
+ if torch.min(inputs) < left or torch.max(inputs) > right:
115
+ raise ValueError("Input to a transform is not within its domain")
116
+
117
+ num_bins = unnormalized_widths.shape[-1]
118
+
119
+ if min_bin_width * num_bins > 1.0:
120
+ raise ValueError("Minimal bin width too large for the number of bins")
121
+ if min_bin_height * num_bins > 1.0:
122
+ raise ValueError("Minimal bin height too large for the number of bins")
123
+
124
+ widths = F.softmax(unnormalized_widths, dim=-1)
125
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126
+ cumwidths = torch.cumsum(widths, dim=-1)
127
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128
+ cumwidths = (right - left) * cumwidths + left
129
+ cumwidths[..., 0] = left
130
+ cumwidths[..., -1] = right
131
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132
+
133
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134
+
135
+ heights = F.softmax(unnormalized_heights, dim=-1)
136
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137
+ cumheights = torch.cumsum(heights, dim=-1)
138
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139
+ cumheights = (top - bottom) * cumheights + bottom
140
+ cumheights[..., 0] = bottom
141
+ cumheights[..., -1] = top
142
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
143
+
144
+ if inverse:
145
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
146
+ else:
147
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
148
+
149
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151
+
152
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153
+ delta = heights / widths
154
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
155
+
156
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158
+
159
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
160
+
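+ # Inverting the rational-quadratic map reduces to a per-element quadratic
+ # a*root^2 + b*root + c = 0; the root in [0, 1] is taken via the numerically
+ # stable form 2c / (-b - sqrt(b^2 - 4ac)).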
161
+ if inverse:
162
+ a = (inputs - input_cumheights) * (
163
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
164
+ ) + input_heights * (input_delta - input_derivatives)
165
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
167
+ )
168
+ c = -input_delta * (inputs - input_cumheights)
169
+
170
+ discriminant = b.pow(2) - 4 * a * c
171
+ assert (discriminant >= 0).all()
172
+
173
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
174
+ outputs = root * input_bin_widths + input_cumwidths
175
+
176
+ theta_one_minus_theta = root * (1 - root)
177
+ denominator = input_delta + (
178
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179
+ * theta_one_minus_theta
180
+ )
181
+ derivative_numerator = input_delta.pow(2) * (
182
+ input_derivatives_plus_one * root.pow(2)
183
+ + 2 * input_delta * theta_one_minus_theta
184
+ + input_derivatives * (1 - root).pow(2)
185
+ )
186
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187
+
188
+ return outputs, -logabsdet
189
+ else:
190
+ theta = (inputs - input_cumwidths) / input_bin_widths
191
+ theta_one_minus_theta = theta * (1 - theta)
192
+
193
+ numerator = input_heights * (
194
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195
+ )
196
+ denominator = input_delta + (
197
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198
+ * theta_one_minus_theta
199
+ )
200
+ outputs = input_cumheights + numerator / denominator
201
+
202
+ derivative_numerator = input_delta.pow(2) * (
203
+ input_derivatives_plus_one * theta.pow(2)
204
+ + 2 * input_delta * theta_one_minus_theta
205
+ + input_derivatives * (1 - theta).pow(2)
206
+ )
207
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208
+
209
+ return outputs, logabsdet
src/vc/my_utils.py ADDED
@@ -0,0 +1,21 @@
1
+ import ffmpeg
2
+ import numpy as np
3
+
4
+
5
+ def load_audio(file, sr):
6
+ try:
7
+ # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10
+ file = (
11
+ file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12
+ )  # strip stray spaces, quotes and newlines that users often copy along with the path
13
+ out, _ = (
14
+ ffmpeg.input(file, threads=0)
15
+ .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16
+ .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17
+ )
18
+ except Exception as e:
19
+ raise RuntimeError(f"Failed to load audio: {e}")
20
+
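+ # ffmpeg wrote raw 32-bit float PCM to stdout; reinterpret it as a mono waveform
+ # at the requested sample rate.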
21
+ return np.frombuffer(out, np.float32).flatten()
src/vc/rmvpe.py ADDED
@@ -0,0 +1,409 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from librosa.filters import mel
6
+
7
+
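+ # N_MELS/N_CLASS are referenced by the no-GRU branch of E2E below but were not defined
+ # in this file; the values here match the 128 mel bins and 360 pitch bins used elsewhere
+ # in this module.
+ N_MELS = 128
+ N_CLASS = 360
+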
8
+ class BiGRU(nn.Module):
9
+ def __init__(self, input_features, hidden_features, num_layers):
10
+ super(BiGRU, self).__init__()
11
+ self.gru = nn.GRU(
12
+ input_features,
13
+ hidden_features,
14
+ num_layers=num_layers,
15
+ batch_first=True,
16
+ bidirectional=True,
17
+ )
18
+
19
+ def forward(self, x):
20
+ return self.gru(x)[0]
21
+
22
+
23
+ class ConvBlockRes(nn.Module):
24
+ def __init__(self, in_channels, out_channels, momentum=0.01):
25
+ super(ConvBlockRes, self).__init__()
26
+ self.conv = nn.Sequential(
27
+ nn.Conv2d(
28
+ in_channels=in_channels,
29
+ out_channels=out_channels,
30
+ kernel_size=(3, 3),
31
+ stride=(1, 1),
32
+ padding=(1, 1),
33
+ bias=False,
34
+ ),
35
+ nn.BatchNorm2d(out_channels, momentum=momentum),
36
+ nn.ReLU(),
37
+ nn.Conv2d(
38
+ in_channels=out_channels,
39
+ out_channels=out_channels,
40
+ kernel_size=(3, 3),
41
+ stride=(1, 1),
42
+ padding=(1, 1),
43
+ bias=False,
44
+ ),
45
+ nn.BatchNorm2d(out_channels, momentum=momentum),
46
+ nn.ReLU(),
47
+ )
48
+ if in_channels != out_channels:
49
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
50
+ self.is_shortcut = True
51
+ else:
52
+ self.is_shortcut = False
53
+
54
+ def forward(self, x):
55
+ if self.is_shortcut:
56
+ return self.conv(x) + self.shortcut(x)
57
+ else:
58
+ return self.conv(x) + x
59
+
60
+
61
+ class Encoder(nn.Module):
62
+ def __init__(
63
+ self,
64
+ in_channels,
65
+ in_size,
66
+ n_encoders,
67
+ kernel_size,
68
+ n_blocks,
69
+ out_channels=16,
70
+ momentum=0.01,
71
+ ):
72
+ super(Encoder, self).__init__()
73
+ self.n_encoders = n_encoders
74
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
75
+ self.layers = nn.ModuleList()
76
+ self.latent_channels = []
77
+ for i in range(self.n_encoders):
78
+ self.layers.append(
79
+ ResEncoderBlock(
80
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
81
+ )
82
+ )
83
+ self.latent_channels.append([out_channels, in_size])
84
+ in_channels = out_channels
85
+ out_channels *= 2
86
+ in_size //= 2
87
+ self.out_size = in_size
88
+ self.out_channel = out_channels
89
+
90
+ def forward(self, x):
91
+ concat_tensors = []
92
+ x = self.bn(x)
93
+ for i in range(self.n_encoders):
94
+ _, x = self.layers[i](x)
95
+ concat_tensors.append(_)
96
+ return x, concat_tensors
97
+
98
+
99
+ class ResEncoderBlock(nn.Module):
100
+ def __init__(
101
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
102
+ ):
103
+ super(ResEncoderBlock, self).__init__()
104
+ self.n_blocks = n_blocks
105
+ self.conv = nn.ModuleList()
106
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
107
+ for i in range(n_blocks - 1):
108
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
109
+ self.kernel_size = kernel_size
110
+ if self.kernel_size is not None:
111
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
112
+
113
+ def forward(self, x):
114
+ for i in range(self.n_blocks):
115
+ x = self.conv[i](x)
116
+ if self.kernel_size is not None:
117
+ return x, self.pool(x)
118
+ else:
119
+ return x
120
+
121
+
122
+ class Intermediate(nn.Module):
123
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
124
+ super(Intermediate, self).__init__()
125
+ self.n_inters = n_inters
126
+ self.layers = nn.ModuleList()
127
+ self.layers.append(
128
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
129
+ )
130
+ for i in range(self.n_inters - 1):
131
+ self.layers.append(
132
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
133
+ )
134
+
135
+ def forward(self, x):
136
+ for i in range(self.n_inters):
137
+ x = self.layers[i](x)
138
+ return x
139
+
140
+
141
+ class ResDecoderBlock(nn.Module):
142
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
143
+ super(ResDecoderBlock, self).__init__()
144
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
145
+ self.n_blocks = n_blocks
146
+ self.conv1 = nn.Sequential(
147
+ nn.ConvTranspose2d(
148
+ in_channels=in_channels,
149
+ out_channels=out_channels,
150
+ kernel_size=(3, 3),
151
+ stride=stride,
152
+ padding=(1, 1),
153
+ output_padding=out_padding,
154
+ bias=False,
155
+ ),
156
+ nn.BatchNorm2d(out_channels, momentum=momentum),
157
+ nn.ReLU(),
158
+ )
159
+ self.conv2 = nn.ModuleList()
160
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
161
+ for i in range(n_blocks - 1):
162
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
163
+
164
+ def forward(self, x, concat_tensor):
165
+ x = self.conv1(x)
166
+ x = torch.cat((x, concat_tensor), dim=1)
167
+ for i in range(self.n_blocks):
168
+ x = self.conv2[i](x)
169
+ return x
170
+
171
+
172
+ class Decoder(nn.Module):
173
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
174
+ super(Decoder, self).__init__()
175
+ self.layers = nn.ModuleList()
176
+ self.n_decoders = n_decoders
177
+ for i in range(self.n_decoders):
178
+ out_channels = in_channels // 2
179
+ self.layers.append(
180
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
181
+ )
182
+ in_channels = out_channels
183
+
184
+ def forward(self, x, concat_tensors):
185
+ for i in range(self.n_decoders):
186
+ x = self.layers[i](x, concat_tensors[-1 - i])
187
+ return x
188
+
189
+
190
+ class DeepUnet(nn.Module):
191
+ def __init__(
192
+ self,
193
+ kernel_size,
194
+ n_blocks,
195
+ en_de_layers=5,
196
+ inter_layers=4,
197
+ in_channels=1,
198
+ en_out_channels=16,
199
+ ):
200
+ super(DeepUnet, self).__init__()
201
+ self.encoder = Encoder(
202
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
203
+ )
204
+ self.intermediate = Intermediate(
205
+ self.encoder.out_channel // 2,
206
+ self.encoder.out_channel,
207
+ inter_layers,
208
+ n_blocks,
209
+ )
210
+ self.decoder = Decoder(
211
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
212
+ )
213
+
214
+ def forward(self, x):
215
+ x, concat_tensors = self.encoder(x)
216
+ x = self.intermediate(x)
217
+ x = self.decoder(x, concat_tensors)
218
+ return x
219
+
220
+
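+ # End-to-end salience model: DeepUnet over the mel spectrogram, a small CNN head, and
+ # (optionally) a BiGRU, producing 360 pitch-bin activations per frame.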
221
+ class E2E(nn.Module):
222
+ def __init__(
223
+ self,
224
+ n_blocks,
225
+ n_gru,
226
+ kernel_size,
227
+ en_de_layers=5,
228
+ inter_layers=4,
229
+ in_channels=1,
230
+ en_out_channels=16,
231
+ ):
232
+ super(E2E, self).__init__()
233
+ self.unet = DeepUnet(
234
+ kernel_size,
235
+ n_blocks,
236
+ en_de_layers,
237
+ inter_layers,
238
+ in_channels,
239
+ en_out_channels,
240
+ )
241
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
242
+ if n_gru:
243
+ self.fc = nn.Sequential(
244
+ BiGRU(3 * 128, 256, n_gru),
245
+ nn.Linear(512, 360),
246
+ nn.Dropout(0.25),
247
+ nn.Sigmoid(),
248
+ )
249
+ else:
250
+ self.fc = nn.Sequential(
251
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
252
+ )
253
+
254
+ def forward(self, mel):
255
+ mel = mel.transpose(-1, -2).unsqueeze(1)
256
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
257
+ x = self.fc(x)
258
+ return x
259
+
260
+
261
+ class MelSpectrogram(torch.nn.Module):
262
+ def __init__(
263
+ self,
264
+ is_half,
265
+ n_mel_channels,
266
+ sampling_rate,
267
+ win_length,
268
+ hop_length,
269
+ n_fft=None,
270
+ mel_fmin=0,
271
+ mel_fmax=None,
272
+ clamp=1e-5,
273
+ ):
274
+ super().__init__()
275
+ n_fft = win_length if n_fft is None else n_fft
276
+ self.hann_window = {}
277
+ mel_basis = mel(
278
+ sr=sampling_rate,
279
+ n_fft=n_fft,
280
+ n_mels=n_mel_channels,
281
+ fmin=mel_fmin,
282
+ fmax=mel_fmax,
283
+ htk=True,
284
+ )
285
+ mel_basis = torch.from_numpy(mel_basis).float()
286
+ self.register_buffer("mel_basis", mel_basis)
287
+ self.n_fft = win_length if n_fft is None else n_fft
288
+ self.hop_length = hop_length
289
+ self.win_length = win_length
290
+ self.sampling_rate = sampling_rate
291
+ self.n_mel_channels = n_mel_channels
292
+ self.clamp = clamp
293
+ self.is_half = is_half
294
+
295
+ def forward(self, audio, keyshift=0, speed=1, center=True):
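+ # keyshift scales the FFT and window sizes by 2^(keyshift/12) to emulate pitch shifting;
+ # speed scales the hop length for time stretching. Magnitudes are rescaled accordingly
+ # when keyshift != 0.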
296
+ factor = 2 ** (keyshift / 12)
297
+ n_fft_new = int(np.round(self.n_fft * factor))
298
+ win_length_new = int(np.round(self.win_length * factor))
299
+ hop_length_new = int(np.round(self.hop_length * speed))
300
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
301
+ if keyshift_key not in self.hann_window:
302
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
303
+ audio.device
304
+ )
305
+ fft = torch.stft(
306
+ audio,
307
+ n_fft=n_fft_new,
308
+ hop_length=hop_length_new,
309
+ win_length=win_length_new,
310
+ window=self.hann_window[keyshift_key],
311
+ center=center,
312
+ return_complex=True,
313
+ )
314
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
315
+ if keyshift != 0:
316
+ size = self.n_fft // 2 + 1
317
+ resize = magnitude.size(1)
318
+ if resize < size:
319
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
320
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
321
+ mel_output = torch.matmul(self.mel_basis, magnitude)
322
+ if self.is_half:
323
+ mel_output = mel_output.half()
324
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
325
+ return log_mel_spec
326
+
327
+
328
+ class RMVPE:
329
+ def __init__(self, model_path, is_half, device=None):
330
+ self.resample_kernel = {}
331
+ model = E2E(4, 1, (2, 2))
332
+ ckpt = torch.load(model_path, map_location="cpu")
333
+ model.load_state_dict(ckpt)
334
+ model.eval()
335
+ if is_half:
336
+ model = model.half()
337
+ self.model = model
338
+ self.resample_kernel = {}
339
+ self.is_half = is_half
340
+ if device is None:
341
+ device = "cuda" if torch.cuda.is_available() else "cpu"
342
+ self.device = device
343
+ self.mel_extractor = MelSpectrogram(
344
+ is_half, 128, 16000, 1024, 160, None, 30, 8000
345
+ ).to(device)
346
+ self.model = self.model.to(device)
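+ # 360 pitch bins spaced 20 cents apart, expressed as cents above a 10 Hz reference;
+ # padded by 4 bins on each side (368 total) for the local weighted average in
+ # to_local_average_cents().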
347
+ cents_mapping = 20 * np.arange(360) + 1997.3794084376191
348
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
349
+
350
+ def mel2hidden(self, mel):
351
+ with torch.no_grad():
352
+ n_frames = mel.shape[-1]
353
+ mel = F.pad(
354
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
355
+ )
356
+ hidden = self.model(mel)
357
+ return hidden[:, :n_frames]
358
+
359
+ def decode(self, hidden, thred=0.03):
360
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
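+ # Convert cents (relative to a 10 Hz reference) back to Hz; unvoiced frames come out
+ # as exactly 10 Hz and are zeroed below.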
361
+ f0 = 10 * (2 ** (cents_pred / 1200))
362
+ f0[f0 == 10] = 0
363
+ # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
364
+ return f0
365
+
366
+ def infer_from_audio(self, audio, thred=0.03):
367
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
368
+ # torch.cuda.synchronize()
369
+ # t0=ttime()
370
+ mel = self.mel_extractor(audio, center=True)
371
+ # torch.cuda.synchronize()
372
+ # t1=ttime()
373
+ hidden = self.mel2hidden(mel)
374
+ # torch.cuda.synchronize()
375
+ # t2=ttime()
376
+ hidden = hidden.squeeze(0).cpu().numpy()
377
+ if self.is_half:
378
+ hidden = hidden.astype("float32")
379
+ f0 = self.decode(hidden, thred=thred)
380
+ # torch.cuda.synchronize()
381
+ # t3=ttime()
382
+ # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
383
+ return f0
384
+
385
+ def to_local_average_cents(self, salience, thred=0.05):
386
+ # t0 = ttime()
387
+ center = np.argmax(salience, axis=1)  # (n_frames,) index of the most salient pitch bin per frame
388
+ salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368) after padding 4 bins on each side
389
+ # t1 = ttime()
390
+ center += 4
391
+ todo_salience = []
392
+ todo_cents_mapping = []
393
+ starts = center - 4
394
+ ends = center + 5
395
+ for idx in range(salience.shape[0]):
396
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
397
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
398
+ # t2 = ttime()
399
+ todo_salience = np.array(todo_salience)  # (n_frames, 9)
400
+ todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)
401
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
402
+ weight_sum = np.sum(todo_salience, 1)  # (n_frames,)
403
+ devided = product_sum / weight_sum  # (n_frames,) salience-weighted average pitch in cents
404
+ # t3 = ttime()
405
+ maxx = np.max(salience, axis=1)  # (n_frames,) peak salience, used to gate unvoiced frames below
406
+ devided[maxx <= thred] = 0
407
+ # t4 = ttime()
408
+ # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
409
+ return devided
src/vc/rvc.py ADDED
@@ -0,0 +1,205 @@
1
+ from typing import Any
2
+ from typings.extra import F0Method
3
+ from multiprocessing import cpu_count
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from fairseq import checkpoint_utils
8
+ from scipy.io import wavfile
9
+
10
+ from vc.infer_pack.models import (
11
+ SynthesizerTrnMs256NSFsid,
12
+ SynthesizerTrnMs256NSFsid_nono,
13
+ SynthesizerTrnMs768NSFsid,
14
+ SynthesizerTrnMs768NSFsid_nono,
15
+ )
16
+ from vc.my_utils import load_audio
17
+ from vc.vc_infer_pipeline import VC
18
+
19
+ SRC_DIR = Path(__file__).resolve().parent.parent
20
+
21
+
22
+ class Config:
23
+ def __init__(self, device, is_half):
24
+ self.device = device
25
+ self.is_half = is_half
26
+ self.n_cpu = 0
27
+ self.gpu_name = None
28
+ self.gpu_mem = None
29
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
30
+
31
+ def device_config(self) -> tuple:
32
+ if torch.cuda.is_available():
33
+ i_device = int(self.device.split(":")[-1])
34
+ self.gpu_name = torch.cuda.get_device_name(i_device)
35
+ if (
36
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
37
+ or "P40" in self.gpu_name.upper()
38
+ or "1060" in self.gpu_name
39
+ or "1070" in self.gpu_name
40
+ or "1080" in self.gpu_name
41
+ ):
42
+ print("16-series/10-series/P40 GPU detected; forcing single precision")
43
+ self.is_half = False
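+ # On these GPUs, rewrite the bundled files in place: disable fp16 in the training
+ # configs ("true" -> "false") and shorten the preprocessing segment length ("3.7" -> "3.0").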
44
+ for config_file in ["32k.json", "40k.json", "48k.json"]:
45
+ with open(SRC_DIR / "vc" / "configs" / config_file, "r") as f:
46
+ strr = f.read().replace("true", "false")
47
+ with open(SRC_DIR / "vc" / "configs" / config_file, "w") as f:
48
+ f.write(strr)
49
+ with open(
50
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "r"
51
+ ) as f:
52
+ strr = f.read().replace("3.7", "3.0")
53
+ with open(
54
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "w"
55
+ ) as f:
56
+ f.write(strr)
57
+ else:
58
+ self.gpu_name = None
59
+ self.gpu_mem = int(
60
+ torch.cuda.get_device_properties(i_device).total_memory
61
+ / 1024
62
+ / 1024
63
+ / 1024
64
+ + 0.4
65
+ )
66
+ if self.gpu_mem <= 4:
67
+ with open(
68
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "r"
69
+ ) as f:
70
+ strr = f.read().replace("3.7", "3.0")
71
+ with open(
72
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "w"
73
+ ) as f:
74
+ f.write(strr)
75
+ elif torch.backends.mps.is_available():
76
+ print("No supported NVIDIA GPU found; using MPS for inference")
77
+ self.device = "mps"
78
+ else:
79
+ print("No supported NVIDIA GPU found; using CPU for inference")
80
+ self.device = "cpu"
81
+ self.is_half = True
82
+
83
+ if self.n_cpu == 0:
84
+ self.n_cpu = cpu_count()
85
+
86
+ if self.is_half:
87
+ # 6G memory config
88
+ x_pad = 3
89
+ x_query = 10
90
+ x_center = 60
91
+ x_max = 65
92
+ else:
93
+ # 5G memory config
94
+ x_pad = 1
95
+ x_query = 6
96
+ x_center = 38
97
+ x_max = 41
98
+
99
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
100
+ x_pad = 1
101
+ x_query = 5
102
+ x_center = 30
103
+ x_max = 32
104
+
105
+ return x_pad, x_query, x_center, x_max
106
+
107
+
108
+ def load_hubert(device: str, is_half: bool, model_path: str) -> torch.nn.Module:
109
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
110
+ [model_path],
111
+ suffix="",
112
+ )
113
+ hubert = models[0]
114
+ hubert = hubert.to(device)
115
+
116
+ if is_half:
117
+ hubert = hubert.half()
118
+ else:
119
+ hubert = hubert.float()
120
+
121
+ hubert.eval()
122
+ return hubert
123
+
124
+
125
+ def get_vc(
126
+ device: str, is_half: bool, config: Config, model_path: str
127
+ ) -> tuple[dict[str, Any], str, torch.nn.Module, int, VC]:
128
+ cpt = torch.load(model_path, map_location="cpu")
129
+ if "config" not in cpt or "weight" not in cpt:
130
+ raise ValueError(
131
+ f"Incorrect format for {model_path}. Use a voice model trained using RVC v2 instead."
132
+ )
133
+
134
+ tgt_sr = cpt["config"][-1]
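+ # Patch the speaker count in the stored config to match the checkpoint's actual
+ # speaker-embedding table (emb_g).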
135
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
136
+ if_f0 = cpt.get("f0", 1)
137
+ version = cpt.get("version", "v1")
138
+
139
+ if version == "v1":
140
+ if if_f0 == 1:
141
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
142
+ else:
143
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
144
+ elif version == "v2":
145
+ if if_f0 == 1:
146
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
147
+ else:
148
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
149
+
150
+ del net_g.enc_q
151
+ print(net_g.load_state_dict(cpt["weight"], strict=False))
152
+ net_g.eval().to(device)
153
+
154
+ if is_half:
155
+ net_g = net_g.half()
156
+ else:
157
+ net_g = net_g.float()
158
+
159
+ vc = VC(tgt_sr, config)
160
+ return cpt, version, net_g, tgt_sr, vc
161
+
162
+
163
+ def rvc_infer(
164
+ index_path: str,
165
+ index_rate: float,
166
+ input_path: str,
167
+ output_path: str,
168
+ pitch_change: int,
169
+ f0_method: F0Method,
170
+ cpt: dict[str, Any],
171
+ version: str,
172
+ net_g: torch.nn.Module,
173
+ filter_radius: int,
174
+ tgt_sr: int,
175
+ rms_mix_rate: float,
176
+ protect: float,
177
+ crepe_hop_length: int,
178
+ vc: VC,
179
+ hubert_model: torch.nn.Module,
180
+ resample_sr: int,
181
+ ) -> None:
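+ # HuBERT features are extracted at 16 kHz, so decode/resample the input to that rate first.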
182
+ audio = load_audio(input_path, 16000)
183
+ times = [0, 0, 0]
184
+ if_f0 = cpt.get("f0", 1)
185
+ audio_opt, output_sr = vc.pipeline(
186
+ hubert_model,
187
+ net_g,
188
+ 0,
189
+ audio,
190
+ input_path,
191
+ times,
192
+ pitch_change,
193
+ f0_method,
194
+ index_path,
195
+ index_rate,
196
+ if_f0,
197
+ filter_radius,
198
+ tgt_sr,
199
+ resample_sr,
200
+ rms_mix_rate,
201
+ version,
202
+ protect,
203
+ crepe_hop_length,
204
+ )
205
+ wavfile.write(output_path, output_sr, audio_opt)
src/vc/trainset_preprocess_pipeline_print.py ADDED
@@ -0,0 +1,146 @@
1
+ import sys, os, multiprocessing
2
+ from scipy import signal
3
+
4
+ now_dir = os.getcwd()
5
+ sys.path.append(now_dir)
6
+
7
+ inp_root = sys.argv[1]
8
+ sr = int(sys.argv[2])
9
+ n_p = int(sys.argv[3])
10
+ exp_dir = sys.argv[4]
11
+ noparallel = sys.argv[5] == "True"
12
+ import numpy as np, os, traceback
13
+ from slicer2 import Slicer
14
+ import librosa, traceback
15
+ from scipy.io import wavfile
16
+ import multiprocessing
17
+ from vc.my_utils import load_audio
18
+ import tqdm
19
+
20
+ DoFormant = False
21
+ Quefrency = 1.0
22
+ Timbre = 1.0
23
+
24
+ mutex = multiprocessing.Lock()
25
+ f = open("%s/preprocess.log" % exp_dir, "a+")
26
+
27
+
28
+ def println(strr):
29
+ mutex.acquire()
30
+ print(strr)
31
+ f.write("%s\n" % strr)
32
+ f.flush()
33
+ mutex.release()
34
+
35
+
36
+ class PreProcess:
37
+ def __init__(self, sr, exp_dir):
38
+ self.slicer = Slicer(
39
+ sr=sr,
40
+ threshold=-42,
41
+ min_length=1500,
42
+ min_interval=400,
43
+ hop_size=15,
44
+ max_sil_kept=500,
45
+ )
46
+ self.sr = sr
47
+ self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
48
+ self.per = 3.0
49
+ self.overlap = 0.3
50
+ self.tail = self.per + self.overlap
51
+ self.max = 0.9
52
+ self.alpha = 0.75
53
+ self.exp_dir = exp_dir
54
+ self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
55
+ self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
56
+ os.makedirs(self.exp_dir, exist_ok=True)
57
+ os.makedirs(self.gt_wavs_dir, exist_ok=True)
58
+ os.makedirs(self.wavs16k_dir, exist_ok=True)
59
+
60
+ def norm_write(self, tmp_audio, idx0, idx1):
61
+ tmp_max = np.abs(tmp_audio).max()
62
+ if tmp_max > 2.5:
63
+ print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
64
+ return
65
+ tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
66
+ 1 - self.alpha
67
+ ) * tmp_audio
68
+ wavfile.write(
69
+ "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
70
+ self.sr,
71
+ tmp_audio.astype(np.float32),
72
+ )
73
+ tmp_audio = librosa.resample(
74
+ tmp_audio, orig_sr=self.sr, target_sr=16000
75
+ ) # , res_type="soxr_vhq"
76
+ wavfile.write(
77
+ "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
78
+ 16000,
79
+ tmp_audio.astype(np.float32),
80
+ )
81
+
82
+ def pipeline(self, path, idx0):
83
+ try:
84
+ audio = load_audio(path, self.sr)  # this repo's load_audio only takes (file, sr); formant parameters are unused
85
+ # zero phased digital filter cause pre-ringing noise...
86
+ # audio = signal.filtfilt(self.bh, self.ah, audio)
87
+ audio = signal.lfilter(self.bh, self.ah, audio)
88
+
89
+ idx1 = 0
90
+ for audio in self.slicer.slice(audio):
91
+ i = 0
92
+ while 1:
93
+ start = int(self.sr * (self.per - self.overlap) * i)
94
+ i += 1
95
+ if len(audio[start:]) > self.tail * self.sr:
96
+ tmp_audio = audio[start : start + int(self.per * self.sr)]
97
+ self.norm_write(tmp_audio, idx0, idx1)
98
+ idx1 += 1
99
+ else:
100
+ tmp_audio = audio[start:]
101
+ idx1 += 1
102
+ break
103
+ self.norm_write(tmp_audio, idx0, idx1)
104
+ # println("%s->Suc." % path)
105
+ except Exception:
106
+ println("%s->%s" % (path, traceback.format_exc()))
107
+
108
+ def pipeline_mp(self, infos, thread_n):
109
+ for path, idx0 in tqdm.tqdm(
110
+ infos, position=thread_n, leave=True, desc="thread:%s" % thread_n
111
+ ):
112
+ self.pipeline(path, idx0)
113
+
114
+ def pipeline_mp_inp_dir(self, inp_root, n_p):
115
+ try:
116
+ infos = [
117
+ ("%s/%s" % (inp_root, name), idx)
118
+ for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
119
+ ]
120
+ if noparallel:
121
+ for i in range(n_p):
122
+ self.pipeline_mp(infos[i::n_p], i)
123
+ else:
124
+ ps = []
125
+ for i in range(n_p):
126
+ p = multiprocessing.Process(
127
+ target=self.pipeline_mp, args=(infos[i::n_p], i)
128
+ )
129
+ ps.append(p)
130
+ p.start()
131
+ for i in range(n_p):
132
+ ps[i].join()
133
+ except Exception:
134
+ println("Fail. %s" % traceback.format_exc())
135
+
136
+
137
+ def preprocess_trainset(inp_root, sr, n_p, exp_dir):
138
+ pp = PreProcess(sr, exp_dir)
139
+ println("start preprocess")
140
+ println(sys.argv)
141
+ pp.pipeline_mp_inp_dir(inp_root, n_p)
142
+ println("end preprocess")
143
+
144
+
145
+ if __name__ == "__main__":
146
+ preprocess_trainset(inp_root, sr, n_p, exp_dir)