Blane187 committed
Commit c8be32d · verified · 1 Parent(s): 91f5864

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +2 -35
  2. .gitignore +166 -0
  3. LICENSE +21 -0
  4. README.md +184 -13
  5. images/webui_dl_model.png +0 -0
  6. images/webui_generate.png +0 -0
  7. images/webui_upload_model.png +0 -0
  8. models/rvc/MODELS.txt +2 -0
  9. models/rvc/public_models.json +626 -0
  10. notebooks/ultimate_rvc_colab.ipynb +134 -0
  11. pyproject.toml +105 -0
  12. requirements.txt +55 -0
  13. src/app.py +214 -0
  14. src/backend/common.py +259 -0
  15. src/backend/exceptions.py +43 -0
  16. src/backend/generate_song_cover.py +1679 -0
  17. src/backend/manage_audio.py +225 -0
  18. src/backend/manage_voice_models.py +426 -0
  19. src/cli.py +219 -0
  20. src/common.py +10 -0
  21. src/frontend/common.py +466 -0
  22. src/frontend/tabs/manage_audio.py +216 -0
  23. src/frontend/tabs/manage_models.py +302 -0
  24. src/frontend/tabs/multi_step_generation.py +991 -0
  25. src/frontend/tabs/one_click_generation.py +573 -0
  26. src/init.py +41 -0
  27. src/typings/audio_separator/separator/__init__.pyi +78 -0
  28. src/typings/extra.py +71 -0
  29. src/typings/gradio/__init__.pyi +238 -0
  30. src/typings/gradio/events.pyi +374 -0
  31. src/typings/pedalboard_native/io/__init__.pyi +39 -0
  32. src/typings/soundfile/__init__.pyi +34 -0
  33. src/typings/sox/__init__.pyi +15 -0
  34. src/typings/yt_dlp/__init__.pyi +25 -0
  35. src/vc/configs/32k.json +46 -0
  36. src/vc/configs/32k_v2.json +46 -0
  37. src/vc/configs/40k.json +46 -0
  38. src/vc/configs/48k.json +46 -0
  39. src/vc/configs/48k_v2.json +46 -0
  40. src/vc/infer_pack/attentions.py +417 -0
  41. src/vc/infer_pack/commons.py +166 -0
  42. src/vc/infer_pack/models.py +1128 -0
  43. src/vc/infer_pack/models_onnx.py +822 -0
  44. src/vc/infer_pack/models_onnx_moess.py +853 -0
  45. src/vc/infer_pack/modules.py +522 -0
  46. src/vc/infer_pack/transforms.py +209 -0
  47. src/vc/my_utils.py +21 -0
  48. src/vc/rmvpe.py +409 -0
  49. src/vc/rvc.py +205 -0
  50. src/vc/trainset_preprocess_pipeline_print.py +146 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
.gitignore ADDED
@@ -0,0 +1,166 @@
1
+ # General
2
+ dependencies
3
+ audio
4
+
5
+ # Audio separation models
6
+ models/audio_separator
7
+
8
+ # RVC Models
9
+ models/rvc/*/*.pth
10
+ models/rvc/*/*.index
11
+ models/rvc/*/*.npy
12
+ models/rvc/hubert_base.pt
13
+ models/rvc/rmvpe.pt
14
+
15
+ # Byte-compiled / optimized / DLL files
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+
20
+ # C extensions
21
+ *.so
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ share/python-wheels/
38
+ *.egg-info/
39
+ .installed.cfg
40
+ *.egg
41
+ MANIFEST
42
+
43
+ # PyInstaller
44
+ # Usually these files are written by a python script from a template
45
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
46
+ *.manifest
47
+ *.spec
48
+
49
+ # Installer logs
50
+ pip-log.txt
51
+ pip-delete-this-directory.txt
52
+
53
+ # Unit test / coverage reports
54
+ htmlcov/
55
+ .tox/
56
+ .nox/
57
+ .coverage
58
+ .coverage.*
59
+ .cache
60
+ nosetests.xml
61
+ coverage.xml
62
+ *.cover
63
+ *.py,cover
64
+ .hypothesis/
65
+ .pytest_cache/
66
+ cover/
67
+
68
+ # Translations
69
+ *.mo
70
+ *.pot
71
+
72
+ # Django stuff:
73
+ *.log
74
+ local_settings.py
75
+ db.sqlite3
76
+ db.sqlite3-journal
77
+
78
+ # Flask stuff:
79
+ instance/
80
+ .webassets-cache
81
+
82
+ # Scrapy stuff:
83
+ .scrapy
84
+
85
+ # Sphinx documentation
86
+ docs/_build/
87
+
88
+ # PyBuilder
89
+ .pybuilder/
90
+ target/
91
+
92
+ # Jupyter Notebook
93
+ .ipynb_checkpoints
94
+
95
+ # IPython
96
+ profile_default/
97
+ ipython_config.py
98
+
99
+ # pyenv
100
+ # For a library or package, you might want to ignore these files since the code is
101
+ # intended to run in multiple environments; otherwise, check them in:
102
+ # .python-version
103
+
104
+ # pipenv
105
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
106
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
107
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
108
+ # install all needed dependencies.
109
+ #Pipfile.lock
110
+
111
+ # poetry
112
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
113
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
114
+ # commonly ignored for libraries.
115
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
116
+ #poetry.lock
117
+
118
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
119
+ __pypackages__/
120
+
121
+ # Celery stuff
122
+ celerybeat-schedule
123
+ celerybeat.pid
124
+
125
+ # SageMath parsed files
126
+ *.sage.py
127
+
128
+ # Environments
129
+ .env
130
+ .venv
131
+ env/
132
+ venv/
133
+ ENV/
134
+ env.bak/
135
+ venv.bak/
136
+
137
+ # Spyder project settings
138
+ .spyderproject
139
+ .spyproject
140
+
141
+ # Rope project settings
142
+ .ropeproject
143
+
144
+ # mkdocs documentation
145
+ /site
146
+
147
+ # mypy
148
+ .mypy_cache/
149
+ .dmypy.json
150
+ dmypy.json
151
+
152
+ # Pyre type checker
153
+ .pyre/
154
+
155
+ # pytype static type analyzer
156
+ .pytype/
157
+
158
+ # Cython debug symbols
159
+ cython_debug/
160
+
161
+ # PyCharm
162
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
163
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
165
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
166
+ .idea/
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 SociallyIneptWeeb
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,184 @@
1
- ---
2
- title: Ultimate Rvc
3
- emoji: 🔥
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 4.43.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # Ultimate RVC
2
+
3
+ An extension of [AiCoverGen](https://github.com/SociallyIneptWeeb/AICoverGen), which provides several new features and improvements, enabling users to generate song covers using RVC with ease. Ideal for people who want to incorporate singing functionality into their AI assistant/chatbot/vtuber, or for people who want to hear their favourite characters sing their favourite song.
4
+
5
+ <!-- Showcase: TBA -->
6
+
7
+ ![](images/webui_generate.png?raw=true)
8
+
9
+ Ultimate RVC is under constant development and testing, but you can try it out right now locally or on Google Colab!
10
+
11
+ ## New Features
12
+
13
+ * Easy and automated setup using launcher scripts for both Windows and Debian-based Linux systems
14
+ * Caching system which saves intermediate audio files as needed, reducing inference time as much as possible (see the sketch below). For example, if song A has already been converted using model B and you now want to convert song A using model C, then vocal extraction can be skipped and inference time reduced drastically.
15
+ * Ability to listen to intermediate audio files in the UI. This is useful for getting an idea of what happens in each step of the song cover generation pipeline.
16
+ * A "multi-step" song cover generation tab: here you can try out each step of the song cover generation pipeline in isolation. For example, if you already have extracted vocals available and only want to convert these using your voice model, you can do that here. This tab is also useful for experimenting with settings for each step of the pipeline.
17
+ * An overhaul of the song input component for the song cover generation pipeline. Cached input songs can now be selected from a dropdown, so you don't have to supply the YouTube link of a song each time you want to convert it.
18
+ * A new "manage models" tab, which collects and revamps all existing functionality for managing voice models and adds some new features, such as the ability to delete existing models.
19
+ * A "manage audio" tab, which allows you to interact with all audio generated by the app. Currently, this tab supports deleting audio files.
20
+ * Lots of visual and performance improvements resulting from updating from Gradio 3 to Gradio 4 and from Python 3.9 to Python 3.11.
21
+
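The idea behind the caching system can be illustrated with a minimal sketch (not the actual API of this project): each intermediate file is keyed by a hash of its input, and an expensive step such as vocal extraction only runs on a cache miss. The hashing mirrors `get_file_hash` in `src/backend/common.py`; the function and file names below are illustrative.

```python
import hashlib
import os
from collections.abc import Callable


def file_hash(path: str, size: int = 5) -> str:
    """BLAKE2b file hash, mirroring get_file_hash in src/backend/common.py (Python 3.11+)."""
    with open(path, "rb") as f:
        return hashlib.file_digest(f, lambda: hashlib.blake2b(digest_size=size)).hexdigest()


def cached_step(
    song_path: str, song_dir: str, step_name: str, run_step: Callable[[str, str], None]
) -> str:
    """Return the cached result of an expensive pipeline step, running it only on a cache miss."""
    target = os.path.join(song_dir, f"{step_name}_{file_hash(song_path)}.wav")
    if not os.path.exists(target):  # cache miss: run the expensive step (e.g. vocal extraction)
        run_step(song_path, target)
    return target
```

With this scheme, converting song A with model C after it was already converted with model B finds the cached vocals file and skips extraction entirely.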
22
+ <!-- ## Changelog
23
+
24
+ TBA -->
25
+
26
+ #### PRO TIP: Use a GPU for faster processing
27
+
28
+ While it is possible to run the Ultimate RVC web app on a CPU, it is highly recommended to use a GPU for faster processing. On an NVIDIA 3080 GPU, the AI cover generation process takes approximately 1.5 minutes, while on a CPU, it takes approximately 15 minutes. No testing has been done on AMD GPUs, so no guarantees are made for their performance.
29
+
30
+ ## Colab notebook
31
+
32
+ For those without a powerful enough NVIDIA GPU, you may try Ultimate RVC out using Google Colab.
33
+
34
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JackismyShephard/ultimate-rvc/blob/main/notebooks/ultimate_rvc_colab.ipynb)
35
+
36
+ For those who want to run this locally, follow the setup guide below.
37
+
38
+ ## Setup
39
+
40
+ ### Install Git
41
+
42
+ Follow the instructions [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) to install Git on your computer.
43
+
44
+ ### Clone Ultimate RVC repository
45
+ Open a terminal and run the following commands to clone this entire repository and open it locally.
46
+ ```
47
+ git clone https://github.com/JackismyShephard/ultimate-rvc
48
+ cd ultimate-rvc
49
+ ```
50
+
51
+ ### Install dependencies
52
+
53
+ #### Windows
54
+ Run the following command to install the necessary dependencies on Windows:
55
+ ```
56
+ ./urvc.bat install
57
+ ```
58
+ Note that this will install Miniconda in your user directory.
59
+ The whole process may take upwards of 10 minutes, so grab a cup of coffee and wait.
60
+
61
+ #### Linux (Debian-based)
62
+
63
+ Run the following command to install the necessary dependencies on Debian-based Linux distributions (e.g. Ubuntu):
64
+ ```
65
+ ./urvc.sh install
66
+ ```
67
+ The command has been tested only on Ubuntu 22.04 and 24.04, so support for other distributions is not guaranteed.
68
+ Also note that the command will install the CUDA 12.1 toolkit system-wide. In case you have problems, you may need to install the toolkit manually.
69
+
70
+ ## Usage
71
+
72
+ ### Start the app
73
+
74
+ #### Windows
75
+
76
+ ```
77
+ ./urvc.bat run
78
+ ```
79
+ #### Linux (Debian-based)
80
+
81
+ ```
82
+ ./urvc.sh run
83
+ ```
84
+
85
+
86
+ Once the following output message `Running on local URL: http://127.0.0.1:7860` appears, you can click on the link to open a tab with the web app.
87
+
88
+ ### Manage models
89
+
90
+
91
+ #### Download models
92
+
93
+ ![](images/webui_dl_model.png?raw=true)
94
+
95
+ Navigate to the `Download model` subtab under the `Manage models` tab, and paste the download link to an RVC model and give it a unique name.
96
+ You may search the [AI Hub Discord](https://discord.gg/aihub) where already trained voice models are available for download.
97
+ The downloaded zip file should contain the .pth model file and an optional .index file.
98
+
99
+ Once the 2 input fields are filled in, simply click `Download`! Once the output message says `[NAME] Model successfully downloaded!`, you should be able to use it in the `Generate song covers` tab!
100
+
101
+ #### Upload models
102
+
103
+ ![](images/webui_upload_model.png?raw=true)
104
+
105
+ This subtab is for people who have trained RVC v2 models locally and would like to use them for AI cover generation.
106
+ Navigate to the `Upload model` subtab under the `Manage models` tab, and follow the instructions.
107
+ Once the output message says `Model with name [NAME] successfully uploaded!`, you should be able to use it in the `Generate song covers` tab!
108
+
109
+ #### Delete RVC models
110
+
111
+ TBA
112
+
113
+ ### Generate song covers
114
+
115
+ #### One-click generation
116
+
117
+
118
+ ![](images/webui_generate.png?raw=true)
119
+
120
+ - From the Voice model dropdown menu, select the voice model to use.
121
+ - In the song input field, copy and paste the link to any song on YouTube, the full path to a local audio file, or select a cached input song.
122
+ - Pitch should be set to either -12, 0, or 12 (a full octave down, no shift, or a full octave up) depending on the original vocals and the RVC AI model. This ensures the voice is not *out of tune*.
123
+ - Other advanced options for vocal conversion, audio mixing, etc. can be viewed by clicking the appropriate accordion arrow to expand them.
124
+
125
+ Once all options are filled in, click `Generate` and the AI-generated cover should appear in less than a few minutes, depending on your GPU.
126
+
127
+ #### Multi-step generation
128
+ TBA
129
+
130
+ <!-- ## CLI
131
+ TBA -->
132
+
133
+ ## Update to latest version
134
+
135
+ Run the following command to pull latest changes from the repository and reinstall dependencies.
136
+ Note that the process may take upwards of 5 minutes.
137
+ #### Windows
138
+
139
+ ```
140
+ ./urvc.bat update
141
+ ```
142
+
143
+ #### Linux (Debian-based)
144
+
145
+ ```
146
+ ./urvc.sh update
147
+ ```
148
+
149
+ ## Development mode
150
+
151
+ When developing new features or debugging, it is recommended to run the app in development mode. This enables hot reloading, which means that the app will automatically reload when changes are made to the code.
152
+
153
+ #### Windows
154
+
155
+ ```
156
+ ./urvc.bat dev
157
+ ```
158
+
159
+ #### Linux (Debian-based)
160
+
161
+ ```
162
+ ./urvc.sh dev
163
+ ```
164
+
165
+
166
+ ## Terms of Use
167
+
168
+ The use of the converted voice for the following purposes is prohibited.
169
+
170
+ * Criticizing or attacking individuals.
171
+
172
+ * Advocating for or opposing specific political positions, religions, or ideologies.
173
+
174
+ * Publicly displaying strongly stimulating expressions without proper zoning.
175
+
176
+ * Selling of voice models and generated voice clips.
177
+
178
+ * Impersonation of the original owner of the voice with malicious intentions to harm/hurt others.
179
+
180
+ * Fraudulent purposes that lead to identity theft or fraudulent phone calls.
181
+
182
+ ## Disclaimer
183
+
184
+ I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.
images/webui_dl_model.png ADDED
images/webui_generate.png ADDED
images/webui_upload_model.png ADDED
models/rvc/MODELS.txt ADDED
@@ -0,0 +1,2 @@
1
+ RVC Models can be added as a folder here. Each folder should contain the model file (.pth extension), and an index file (.index extension).
2
+ For example, a folder called Maya, containing 2 files, Maya.pth and added_IVF1905_Flat_nprobe_Maya_v2.index.
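A layout following the example above would look like this (the folder and file names are just those from the example):

```
models/rvc/
└── Maya/
    ├── Maya.pth
    └── added_IVF1905_Flat_nprobe_Maya_v2.index
```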
models/rvc/public_models.json ADDED
@@ -0,0 +1,626 @@
1
+ {
2
+ "tags": {
3
+ "English": "Character speaks English",
4
+ "Japanese": "Character speaks Japanese",
5
+ "Other Language": "The character speaks Other Language",
6
+ "Anime": "Character from anime",
7
+ "Vtuber": "Character is a vtuber",
8
+ "Real person": "A person who exists in the real world",
9
+ "Game character": "A character from the game"
10
+ },
11
+ "voice_models": [
12
+ {
13
+ "name": "Emilia",
14
+ "url": "https://huggingface.co/RinkaEmina/RVC_Sharing/resolve/main/Emilia%20V2%2048000.zip",
15
+ "description": "Emilia from Re:Zero",
16
+ "added": "2023-07-31",
17
+ "credit": "rinka4759",
18
+ "tags": [
19
+ "Anime"
20
+ ]
21
+ },
22
+ {
23
+ "name": "Klee",
24
+ "url": "https://huggingface.co/qweshkka/Klee/resolve/main/Klee.zip",
25
+ "description": "Klee from Genshin Impact",
26
+ "added": "2023-07-31",
27
+ "credit": "qweshsmashjuicefruity",
28
+ "tags": [
29
+ "Game character",
30
+ "Japanese"
31
+ ]
32
+ },
33
+ {
34
+ "name": "Yelan",
35
+ "url": "https://huggingface.co/iroaK/RVC2_Yelan_GenshinImpact/resolve/main/YelanJP.zip",
36
+ "description": "Yelan from Genshin Impact",
37
+ "added": "2023-07-31",
38
+ "credit": "iroak",
39
+ "tags": [
40
+ "Game character",
41
+ "Japanese"
42
+ ]
43
+ },
44
+ {
45
+ "name": "Yae Miko",
46
+ "url": "https://huggingface.co/iroaK/RVC2_YaeMiko_GenshinImpact/resolve/main/Yae_MikoJP.zip",
47
+ "description": "Yae Miko from Genshin Impact",
48
+ "added": "2023-07-31",
49
+ "credit": "iroak",
50
+ "tags": [
51
+ "Game character",
52
+ "Japanese"
53
+ ]
54
+ },
55
+ {
56
+ "name": "Lisa",
57
+ "url": "https://huggingface.co/qweshkka/Lisa2ver/resolve/main/Lisa.zip",
58
+ "description": "Lisa from Genshin Impact",
59
+ "added": "2023-07-31",
60
+ "credit": "qweshsmashjuicefruity",
61
+ "tags": [
62
+ "Game character",
63
+ "English"
64
+ ]
65
+ },
66
+ {
67
+ "name": "Kazuha",
68
+ "url": "https://huggingface.co/iroaK/RVC2_Kazuha_GenshinImpact/resolve/main/Kazuha.zip",
69
+ "description": "Kaedehara Kazuha from Genshin Impact",
70
+ "added": "2023-07-31",
71
+ "credit": "iroak",
72
+ "tags": [
73
+ "Game character",
74
+ "Japanese"
75
+ ]
76
+ },
77
+ {
78
+ "name": "Barbara",
79
+ "url": "https://huggingface.co/iroaK/RVC2_Barbara_GenshinImpact/resolve/main/BarbaraJP.zip",
80
+ "description": "Barbara from Genshin Impact",
81
+ "added": "2023-07-31",
82
+ "credit": "iroak",
83
+ "tags": [
84
+ "Game character",
85
+ "Japanese"
86
+ ]
87
+ },
88
+ {
89
+ "name": "Tom Holland",
90
+ "url": "https://huggingface.co/TJKAI/TomHolland/resolve/main/TomHolland.zip",
91
+ "description": "Tom Holland (Spider-Man)",
92
+ "added": "2023-08-03",
93
+ "credit": "tjkcreative",
94
+ "tags": [
95
+ "Real person",
96
+ "English"
97
+ ]
98
+ },
99
+ {
100
+ "name": "Kamisato Ayaka",
101
+ "url": "https://huggingface.co/benitheworld/ayaka-cn/resolve/main/ayaka-cn.zip",
102
+ "description": "Kamisato Ayaka from Genshin Impact - CN voice actor",
103
+ "added": "2023-08-03",
104
+ "credit": "kannysoap",
105
+ "tags": [
106
+ "Game character",
107
+ "Other Language"
108
+ ]
109
+ },
110
+ {
111
+ "name": "Amai Odayaka",
112
+ "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Amai-Odayaka.zip",
113
+ "description": "Amai Odayaka from Yandere Simulator",
114
+ "added": "2023-08-03",
115
+ "credit": "minecraftian47",
116
+ "tags": [
117
+ "Anime",
118
+ "English"
119
+ ]
120
+ },
121
+ {
122
+ "name": "Compa - Hyperdimension Neptunia",
123
+ "url": "https://huggingface.co/zeerowiibu/WiibuRVCCollection/resolve/main/Compa%20(Choujigen%20Game%20Neptunia)%20(JPN)%20(RVC%20v2)%20(150%20Epochs).zip",
124
+ "description": "Compa from Choujigen Game Neptune (aka Hyperdimension Neptunia)",
125
+ "added": "2023-08-03",
126
+ "credit": "zeerowiibu",
127
+ "tags": [
128
+ "Anime",
129
+ "Japanese"
130
+ ]
131
+ },
132
+ {
133
+ "name": "Fu Xuan",
134
+ "url": "https://huggingface.co/Juneuarie/FuXuan/resolve/main/FuXuan.zip",
135
+ "description": "Fu Xuan from Honkai Star Rail (HSR)",
136
+ "added": "2023-08-03",
137
+ "credit": "__june",
138
+ "tags": [
139
+ "Game character",
140
+ "English"
141
+ ]
142
+ },
143
+ {
144
+ "name": "Xinyan",
145
+ "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/XinyanRVC.zip",
146
+ "description": "Xinyan from Genshin Impact",
147
+ "added": "2023-08-03",
148
+ "credit": "shyelijah",
149
+ "tags": [
150
+ "Game character",
151
+ "English"
152
+ ]
153
+ },
154
+ {
155
+ "name": "Enterprise",
156
+ "url": "https://huggingface.co/NoIdea4Username/NoIdeaRVCCollection/resolve/main/Enterprise-JP.zip",
157
+ "description": "Enterprise from Azur Lane",
158
+ "added": "2023-08-03",
159
+ "credit": "minecraftian47",
160
+ "tags": [
161
+ "Anime",
162
+ "Japanese"
163
+ ]
164
+ },
165
+ {
166
+ "name": "Kurt Cobain",
167
+ "url": "https://huggingface.co/Florstie/Kurt_Cobain_byFlorst/resolve/main/Kurt_Florst.zip",
168
+ "description": "singer Kurt Cobain",
169
+ "added": "2023-08-03",
170
+ "credit": "florst",
171
+ "tags": [
172
+ "Real person",
173
+ "English"
174
+ ]
175
+ },
176
+ {
177
+ "name": "Ironmouse",
178
+ "url": "https://huggingface.co/Tempo-Hawk/IronmouseV2/resolve/main/IronmouseV2.zip",
179
+ "description": "Ironmouse",
180
+ "added": "2023-08-03",
181
+ "credit": "ladyimpa",
182
+ "tags": [
183
+ "Vtuber",
184
+ "English"
185
+ ]
186
+ },
187
+ {
188
+ "name": "Bratishkinoff",
189
+ "url": "https://huggingface.co/JHmashups/Bratishkinoff/resolve/main/bratishkin.zip",
190
+ "description": "Bratishkinoff (Bratishkin | Братишкин) - russian steamer ",
191
+ "added": "2023-08-03",
192
+ "credit": ".caddii",
193
+ "tags": [
194
+ "Real person",
195
+ "Other Language"
196
+ ]
197
+ },
198
+ {
199
+ "name": "Yagami Light",
200
+ "url": "https://huggingface.co/geekdom-tr/Yagami-Light/resolve/main/Yagami-Light.zip",
201
+ "description": "Yagami Light (Miyano Mamoru) from death note",
202
+ "added": "2023-08-03",
203
+ "credit": "takka / takka#7700",
204
+ "tags": [
205
+ "Anime",
206
+ "Japanese"
207
+ ]
208
+ },
209
+ {
210
+ "name": "Itashi",
211
+ "url": "https://huggingface.co/4uGGun/4uGGunRVC/resolve/main/itashi.zip",
212
+ "description": "Itashi (Russian fandubber AniLibria) ",
213
+ "added": "2023-08-03",
214
+ "credit": "BelochkaOff",
215
+ "tags": [
216
+ "Anime",
217
+ "Other Language",
218
+ "Real person"
219
+ ]
220
+ },
221
+ {
222
+ "name": "Michiru Kagemori",
223
+ "url": "https://huggingface.co/WolfMK/MichiruKagemori/resolve/main/MichiruKagemori_RVC_V2.zip",
224
+ "description": "Michiru Kagemori from Brand New Animal (300 Epochs)",
225
+ "added": "2023-08-03",
226
+ "credit": "wolfmk",
227
+ "tags": [
228
+ "Anime",
229
+ "English"
230
+ ]
231
+ }
232
+ ,
233
+ {
234
+ "name": "Kaeya",
235
+ "url": "https://huggingface.co/nlordqting4444/nlordqtingRVC/resolve/main/Kaeya.zip",
236
+ "description": "Kaeya (VA: Kohsuke Toriumi) from Genshin Impact (300 Epochs)",
237
+ "added": "2023-08-03",
238
+ "credit": "nlordqting4444",
239
+ "tags": [
240
+ "Game character",
241
+ "Japanese"
242
+ ]
243
+ },
244
+ {
245
+ "name": "Mona Megistus",
246
+ "url": "https://huggingface.co/AnimeSessions/rvc_voice_models/resolve/main/MonaRVC.zip",
247
+ "description": "Mona Megistus (VA: Felecia Angelle) from Genshin Impact (250 Epochs)",
248
+ "added": "2023-08-03",
249
+ "credit": "shyelijah",
250
+ "tags": [
251
+ "Game character",
252
+ "English"
253
+ ]
254
+ },
255
+ {
256
+ "name": "Klee",
257
+ "url": "https://huggingface.co/hardbop/AI_MODEL_THINGY/resolve/main/kleeeng_rvc.zip",
258
+ "description": "Klee from Genshin Impact (400 Epochs)",
259
+ "added": "2023-08-03",
260
+ "credit": "hardbop",
261
+ "tags": [
262
+ "Game character",
263
+ "English"
264
+ ]
265
+ },
266
+ {
267
+ "name": "Sakurakoji Kinako",
268
+ "url": "https://huggingface.co/Gorodogi/RVC2MangioCrepe/resolve/main/kinakobetatwo700.zip",
269
+ "description": "Sakurakoji Kinako (Suzuhara Nozomi) from Love Live! Superstar!! (700 Epoch)",
270
+ "added": "2023-08-03",
271
+ "credit": "ck1089",
272
+ "tags": [
273
+ "Anime",
274
+ "Japanese"
275
+ ]
276
+ },
277
+ {
278
+ "name": "Minamo Kurosawa",
279
+ "url": "https://huggingface.co/timothy10583/RVC/resolve/main/minamo-kurosawa.zip",
280
+ "description": "Minamo (Nyamo) Kurosawa (Azumanga Daioh US DUB) (300 Epochs)",
281
+ "added": "2023-08-03",
282
+ "credit": "timothy10583",
283
+ "tags": [
284
+ "Anime"
285
+ ]
286
+ },
287
+ {
288
+ "name": "Neco Arc",
289
+ "url": "https://huggingface.co/Ozzy-Helix/Neko_Arc_Neko_Aruku.RVCv2/resolve/main/Neko_Arc-V3-E600.zip",
290
+ "description": "Neco Arc (Neco-Aruku) (Epochs 600)",
291
+ "added": "2023-08-03",
292
+ "credit": "ozzy_helix_",
293
+ "tags": [
294
+ "Anime"
295
+ ]
296
+ },
297
+ {
298
+ "name": "Makima",
299
+ "url": "https://huggingface.co/andolei/makimaen/resolve/main/makima-en-dub.zip",
300
+ "description": "Makima from Chainsaw Man (300 Epochs)",
301
+ "added": "2023-08-03",
302
+ "credit": "andpproximately",
303
+ "tags": [
304
+ "Anime",
305
+ "English"
306
+ ]
307
+ },
308
+ {
309
+ "name": "PomPom",
310
+ "url": "https://huggingface.co/benitheworld/pom-pom/resolve/main/pom-pom.zip",
311
+ "description": "PomPom from Honkai Star Rail (HSR) (200 Epochs)",
312
+ "added": "2023-08-03",
313
+ "credit": "kannysoap",
314
+ "tags": [
315
+ "Game character",
316
+ "English"
317
+ ]
318
+ },
319
+ {
320
+ "name": "Asuka Langley Soryu",
321
+ "url": "https://huggingface.co/Piegirl/asukaadv/resolve/main/asuka.zip",
322
+ "description": "Asuka Langley Soryu/Tiffany Grant from Neon Genesis Evangelion (400 Epochs)",
323
+ "added": "2023-08-03",
324
+ "credit": "piegirl",
325
+ "tags": [
326
+ "Anime",
327
+ "English"
328
+ ]
329
+ },
330
+ {
331
+ "name": "Ochaco Uraraka",
332
+ "url": "https://huggingface.co/legitdark/JP-Uraraka-By-Dan/resolve/main/JP-Uraraka-By-Dan.zip",
333
+ "description": "Ochaco Uraraka from Boku no Hero Academia (320 Epochs)",
334
+ "added": "2023-08-03",
335
+ "credit": "danthevegetable",
336
+ "tags": [
337
+ "Anime",
338
+ "Japanese"
339
+ ]
340
+ },
341
+ {
342
+ "name": "Sunaokami Shiroko",
343
+ "url": "https://huggingface.co/LordDavis778/BlueArchivevoicemodels/resolve/main/SunaokamiShiroko.zip",
344
+ "description": "Sunaokami Shiroko from Blue Archive (500 Epochs)",
345
+ "added": "2023-08-03",
346
+ "credit": "lorddavis778",
347
+ "tags": [
348
+ "Anime"
349
+ ]
350
+ },
351
+ {
352
+ "name": "Dainsleif",
353
+ "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Dainsleif/Dainsleif.zip",
354
+ "description": "Dainsleif from Genshin Impact (335 Epochs)",
355
+ "added": "2023-08-03",
356
+ "credit": "nasley",
357
+ "tags": [
358
+ "Game character",
359
+ "English"
360
+ ]
361
+ },
362
+ {
363
+ "name": "Mae Asmr",
364
+ "url": "https://huggingface.co/ctian/VRC/resolve/main/MaeASMR.zip",
365
+ "description": "Mae Asmr - harvest mommy voice (YOUTUBE) (300 Epochs)",
366
+ "added": "2023-08-03",
367
+ "credit": "ctian_04",
368
+ "tags": [
369
+ "English",
370
+ "Real person",
371
+ "Vtuber"
372
+ ]
373
+ },
374
+ {
375
+ "name": "Hana Shirosaki ",
376
+ "url": "https://huggingface.co/Pawlik17/HanaWataten/resolve/main/HanaWATATEN.zip",
377
+ "description": "Hana Shirosaki / 白 咲 花 From Watashi ni Tenshi ga Maiorita! (570 Epochs)",
378
+ "added": "2023-08-03",
379
+ "credit": "tamalik",
380
+ "tags": [
381
+ "Anime",
382
+ "Japanese"
383
+ ]
384
+ },
385
+ {
386
+ "name": "Kaguya Shinomiya ",
387
+ "url": "https://huggingface.co/1ski/1skiRVCModels/resolve/main/kaguyav5.zip",
388
+ "description": "Kaguya Shinomiya from Kaguya-Sama Love is war (200 Epochs)",
389
+ "added": "2023-08-03",
390
+ "credit": "1ski",
391
+ "tags": [
392
+ "Anime",
393
+ "Japanese"
394
+ ]
395
+ },
396
+ {
397
+ "name": "Nai Shiro",
398
+ "url": "https://huggingface.co/kuushiro/Shiro-RVC-No-Game-No-Life/resolve/main/shiro-jp-360-epochs.zip",
399
+ "description": "Nai Shiro (Ai Kayano) from No Game No Life (360 Epochs)",
400
+ "added": "2023-08-03",
401
+ "credit": "kxouyou",
402
+ "tags": [
403
+ "Anime",
404
+ "Japanese"
405
+ ]
406
+ },
407
+ {
408
+ "name": "Yuigahama Yui",
409
+ "url": "https://huggingface.co/Zerokano/Yuigahama_Yui-RVCv2/resolve/main/Yuigahama_Yui.zip",
410
+ "description": "Yuigahama Yui from Yahari Ore no Seishun Love Comedy wa Machigatteiru (250 Epochs)",
411
+ "added": "2023-08-03",
412
+ "credit": "zerokano",
413
+ "tags": [
414
+ "Anime",
415
+ "Japanese"
416
+ ]
417
+ },
418
+ {
419
+ "name": "Fuwawa Abyssgard",
420
+ "url": "https://huggingface.co/megaaziib/my-rvc-models-collection/resolve/main/fuwawa.zip",
421
+ "description": "Fuwawa Abyssgard (FUWAMOCO) from Hololive gen 3 (250 Epochs)",
422
+ "added": "2023-08-03",
423
+ "credit": "megaaziib",
424
+ "tags": [
425
+ "Vtuber",
426
+ "English"
427
+ ]
428
+ },
429
+ {
430
+ "name": "Kana Arima",
431
+ "url": "https://huggingface.co/ddoumakunn/arimakanna/resolve/main/arimakanna.zip",
432
+ "description": "Kana Arima from Oshi no Ko (250 Epochs)",
433
+ "added": "2023-08-03",
434
+ "credit": "ddoumakunn",
435
+ "tags": [
436
+ "Anime",
437
+ "Japanese"
438
+ ]
439
+ },
440
+ {
441
+ "name": "Raiden Shogun",
442
+ "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/RaidenShogun/RaidenShogun.zip",
443
+ "description": "Raiden Shogun from Genshin Impact (310 Epochs)",
444
+ "added": "2023-08-03",
445
+ "credit": "nasley",
446
+ "tags": [
447
+ "Game character",
448
+ "English"
449
+ ]
450
+ },
451
+ {
452
+ "name": "Alhaitham",
453
+ "url": "https://huggingface.co/Nasleyy/NasleyRVC/resolve/main/Voices/Alhaitham/Alhaitham.zip",
454
+ "description": "Alhaitham from Genshin Impact (320 Epochs)",
455
+ "added": "2023-08-03",
456
+ "credit": "nasley",
457
+ "tags": [
458
+ "Game character",
459
+ "English"
460
+ ]
461
+ },
462
+ {
463
+ "name": "Izuku Midoriya",
464
+ "url": "https://huggingface.co/BigGuy635/MHA/resolve/main/DekuJP.zip",
465
+ "description": "Izuku Midoriya from Boku no Hero Academia (100 Epochs)",
466
+ "added": "2023-08-03",
467
+ "credit": "khjjnoffical",
468
+ "tags": [
469
+ "Anime",
470
+ "Japanese"
471
+ ]
472
+ },
473
+ {
474
+ "name": "Kurumi Shiratori",
475
+ "url": "https://huggingface.co/HarunaKasuga/YoshikoTsushima/resolve/main/KurumiShiratori.zip",
476
+ "description": "Kurumi Shiratori (VA: Ruka Fukagawa) from D4DJ (500 Epochs)",
477
+ "added": "2023-08-03",
478
+ "credit": "seakrait",
479
+ "tags": [
480
+ "Anime",
481
+ "Japanese"
482
+ ]
483
+ },
484
+ {
485
+ "name": "Veibae",
486
+ "url": "https://huggingface.co/datasets/Papaquans/Veibae/resolve/main/veibae_e165_s125565.zip",
487
+ "description": "Veibae (165 Epochs)",
488
+ "added": "2023-08-03",
489
+ "credit": "recairo",
490
+ "tags": [
491
+ "Vtuber",
492
+ "English"
493
+ ]
494
+ },
495
+ {
496
+ "name": "Black Panther",
497
+ "url": "https://huggingface.co/TJKAI/BlackPannther/resolve/main/BlackPanther.zip",
498
+ "description": "Black Panther (Chadwick Boseman) (300 Epochs)",
499
+ "added": "2023-08-03",
500
+ "credit": "tjkcreative",
501
+ "tags": [
502
+ "Real person",
503
+ "English"
504
+ ]
505
+ },
506
+ {
507
+ "name": "Gawr Gura",
508
+ "url": "https://pixeldrain.com/u/3tJmABXA",
509
+ "description": "Gawr Gura from Hololive EN",
510
+ "added": "2023-08-05",
511
+ "credit": "dacoolkid44 & hijack",
512
+ "tags": [
513
+ "Vtuber"
514
+ ]
515
+ },
516
+ {
517
+ "name": "Houshou Marine",
518
+ "url": "https://pixeldrain.com/u/L1YLfZyU",
519
+ "description": "Houshou Marine from Hololive JP",
520
+ "added": "2023-08-05",
521
+ "credit": "dacoolkid44 & hijack",
522
+ "tags": [
523
+ "Vtuber",
524
+ "Japanese"
525
+ ]
526
+ },
527
+ {
528
+ "name": "Hoshimachi Suisei",
529
+ "url": "https://pixeldrain.com/u/YP89C21u",
530
+ "description": "Hoshimachi Suisei from Hololive JP",
531
+ "added": "2023-08-05",
532
+ "credit": "dacoolkid44 & hijack & Maki Ligon",
533
+ "tags": [
534
+ "Vtuber",
535
+ "Japanese"
536
+ ]
537
+ },
538
+ {
539
+ "name": "Laplus Darkness",
540
+ "url": "https://pixeldrain.com/u/zmuxv5Bf",
541
+ "description": "Laplus Darkness from Hololive JP",
542
+ "added": "2023-08-05",
543
+ "credit": "dacoolkid44 & hijack",
544
+ "tags": [
545
+ "Vtuber",
546
+ "Japanese"
547
+ ]
548
+ },
549
+ {
550
+ "name": "AZKi",
551
+ "url": "https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models/resolve/main/AZKi%20(Hybrid).zip",
552
+ "description": "AZKi from Hololive JP",
553
+ "added": "2023-08-05",
554
+ "credit": "Kit Lemonfoot / NSHFB",
555
+ "tags": [
556
+ "Vtuber",
557
+ "Japanese"
558
+ ]
559
+ },
560
+ {
561
+ "name": "Ado",
562
+ "url": "https://huggingface.co/pjesek/AdoRVCv2/resolve/main/AdoRVCv2.zip",
563
+ "description": "Talented JP artist (500 epochs using every song from her first album)",
564
+ "added": "2023-08-05",
565
+ "credit": "pjesek",
566
+ "tags": [
567
+ "Real person",
568
+ "Japanese"
569
+ ]
570
+ },
571
+ {
572
+ "name": "LiSA",
573
+ "url": "https://huggingface.co/phant0m4r/LiSA/resolve/main/LiSA.zip",
574
+ "description": "Talented JP artist (400 epochs)",
575
+ "added": "2023-08-05",
576
+ "credit": "Phant0m",
577
+ "tags": [
578
+ "Real person",
579
+ "Japanese"
580
+ ]
581
+ },
582
+ {
583
+ "name": "Kokomi",
584
+ "url": "https://huggingface.co/benitheworld/kokomi-kr/resolve/main/kokomi-kr.zip",
585
+ "description": "Kokomi from Genshin Impact KR (300 Epochs)",
586
+ "added": "2023-08-09",
587
+ "credit": "kannysoap",
588
+ "tags": [
589
+ "Game character",
590
+ "Other Language"
591
+ ]
592
+ },
593
+ {
594
+ "name": "Ivanzolo",
595
+ "url": "https://huggingface.co/fenikkusugosuto/IvanZolo2004/resolve/main/ivanZolo.zip",
596
+ "description": "Ivanzolo2004 russian streamer | Иван Золо 2004",
597
+ "added": "2023-08-09",
598
+ "credit": "prezervativ_naruto2009",
599
+ "tags": [
600
+ "Other Language",
601
+ "Real person"
602
+ ]
603
+ },
604
+ {
605
+ "name": "Nilou",
606
+ "url": "https://huggingface.co/benitheworld/nilou-kr/resolve/main/nilou-kr.zip",
607
+ "description": "Nilou from Genshin Impact KR (300 Epochs)",
608
+ "added": "2023-08-09",
609
+ "credit": "kannysoap",
610
+ "tags": [
611
+ "Game character",
612
+ "Other Language"
613
+ ]
614
+ },
615
+ {
616
+ "name": "Dr. Doofenshmirtz",
617
+ "url": "https://huggingface.co/Argax/doofenshmirtz-RUS/resolve/main/doofenshmirtz.zip",
618
+ "description": "RUS Dr. Doofenshmirtz from Phineas and Ferb (300 epochs)",
619
+ "added": "2023-08-09",
620
+ "credit": "argaxus",
621
+ "tags": [
622
+ "Other Language"
623
+ ]
624
+ }
625
+ ]
626
+ }
notebooks/ultimate_rvc_colab.ipynb ADDED
@@ -0,0 +1,134 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "kmyCzJVyCymN"
7
+ },
8
+ "source": [
9
+ "Colab for [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)\n",
10
+ "\n",
11
+ "This Colab notebook will **help** you if you don’t have a GPU or if your PC isn’t very powerful.\n",
12
+ "\n",
13
+ "Simply click `Runtime` in the top navigation bar and `Run all`. Wait for the output of the final cell to show the public gradio url and click on it.\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": null,
19
+ "metadata": {
20
+ "cellView": "form",
21
+ "id": "TfYDhnzOyig5"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "#@title 0: Initialize notebook\n",
26
+ "from IPython.display import clear_output\n",
27
+ "import threading\n",
28
+ "import time\n",
29
+ "import codecs\n",
30
+ "\n",
31
+ "DEPENDENCIES_PATH = \"./dependencies\"\n",
32
+ "VENV_PATH = f\"{DEPENDENCIES_PATH}/venv\"\n",
33
+ "BIN_PATH = f\"{VENV_PATH}/bin\"\n",
34
+ "\n",
35
+ "\n",
36
+ "def update_timer_and_print():\n",
37
+ " global timer\n",
38
+ " while True:\n",
39
+ " hours, remainder = divmod(timer, 3600)\n",
40
+ " minutes, seconds = divmod(remainder, 60)\n",
41
+ " timer_str = f'{hours:02}:{minutes:02}:{seconds:02}'\n",
42
+ " print(f'\\rTimer: {timer_str} ', end='', flush=True) # Print without a newline\n",
43
+ " time.sleep(1)\n",
44
+ " timer += 1\n",
45
+ "\n",
46
+ "timer = 0\n",
47
+ "threading.Thread(target=update_timer_and_print, daemon=True).start()\n",
48
+ "\n",
49
+ "install_to_drive=False\n",
50
+ "if install_to_drive==True:\n",
51
+ " from google.colab import drive\n",
52
+ " drive.mount('/content/drive')\n",
53
+ " %cd /content/drive/MyDrive\n",
54
+ "else:\n",
55
+ " %cd /content\n",
56
+ "clear_output()"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {
63
+ "cellView": "form",
64
+ "id": "aaokDv1VzpAX"
65
+ },
66
+ "outputs": [],
67
+ "source": [
68
+ "#@title 1: Clone repository\n",
69
+ "cloneing=codecs.decode('uggcf://tvguho.pbz/WnpxvfzlFurcuneq/hygvzngr-eip.tvg','rot_13')\n",
70
+ "\n",
71
+ "!git clone $cloneing HRVC\n",
72
+ "%cd HRVC\n",
73
+ "clear_output()"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "metadata": {
80
+ "cellView": "form",
81
+ "id": "lVGNygIa0F_1"
82
+ },
83
+ "outputs": [],
84
+ "source": [
85
+ "#@title 2: Install dependencies\n",
86
+ "inits = codecs.decode('./fep/vavg.cl','rot_13')\n",
87
+ "curly = codecs.decode(\"uggcf://uhttvatsnpr.pb/WnpxvfzlFurcuneq/hygvzngr-eip/erfbyir/znva/snvefrd-0.12.2-pc311-pc311-yvahk_k86_64.juy\",\"rot_13\")\n",
88
+ "destiny = codecs.decode('snvefrd-0.12.2-pc311-pc311-yvahk_k86_64.juy','rot_13')\n",
89
+ "\n",
90
+ "!apt install -y python3.11 python3.11-dev python3.11-venv\n",
91
+ "!apt install -y sox libsox-dev ffmpeg\n",
92
+ "\n",
93
+ "!curl -LJ -o $DEPENDENCIES_PATH/$destiny --create-dirs $curly\n",
94
+ "!python3.11 -m venv $VENV_PATH --upgrade-deps\n",
95
+ "\n",
96
+ "! $BIN_PATH/pip install -r requirements.txt\n",
97
+ "! $BIN_PATH/pip install faiss-cpu==1.7.3\n",
98
+ "! $BIN_PATH/python $inits\n",
99
+ "clear_output()"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {
106
+ "cellView": "form",
107
+ "id": "lVGNygIa0F_2"
108
+ },
109
+ "outputs": [],
110
+ "source": [
111
+ "#@title 3: Run Ultimate RVC\n",
112
+ "runpice = codecs.decode('./fep/ncc.cl','rot_13')\n",
113
+ "\n",
114
+ "!$BIN_PATH/python $runpice --share --listen-port 9999"
115
+ ]
116
+ }
117
+ ],
118
+ "metadata": {
119
+ "accelerator": "GPU",
120
+ "colab": {
121
+ "gpuType": "T4",
122
+ "provenance": []
123
+ },
124
+ "kernelspec": {
125
+ "display_name": "Python 3",
126
+ "name": "python3"
127
+ },
128
+ "language_info": {
129
+ "name": "python"
130
+ }
131
+ },
132
+ "nbformat": 4,
133
+ "nbformat_minor": 0
134
+ }
pyproject.toml ADDED
@@ -0,0 +1,105 @@
1
+ [tool.pyright]
2
+ stubPath = "src/typings"
3
+ pythonVersion = "3.11"
4
+ pythonPlatform = "All"
5
+ typeCheckingMode = "strict"
6
+ ignore = ["**/.venv"]
7
+
8
+ [tool.black]
9
+ target-version = ['py311']
10
+ preview = true
11
+ enable-unstable-feature = ["string_processing"]
12
+
13
+ [tool.ruff]
14
+ extend-include = ["*.ipynb"]
15
+ target-version = "py311"
16
+ fix = true
17
+ required-version = ">=0.5.7"
18
+
19
+ [tool.ruff.format]
20
+ docstring-code-format = true
21
+
22
+ [tool.ruff.lint]
23
+ #select = ["ALL"]
24
+ extend-select = ["I"]
25
+ ignore = ["D205", "D203", "D212", "D416"]
26
+ unfixable = ["F401"]
27
+ preview = true
28
+
29
+ [tool.ruff.lint.flake8-annotations]
30
+ ignore-fully-untyped = true
31
+ #suppress-none-returning = true
32
+
33
+ [tool.ruff.lint.flake8-errmsg]
34
+ #max-string-length = 20
35
+
36
+ [tool.ruff.lint.isort]
37
+ relative-imports-order = "closest-to-furthest"
38
+ section-order = [
39
+ "future",
40
+ "typing",
41
+ "standard-library",
42
+ "third-party",
43
+ "networking",
44
+ "data-science",
45
+ "machine-learning",
46
+ "audio",
47
+ "visualisation",
48
+ "first-party",
49
+ "vc",
50
+ "backend",
51
+ "frontend",
52
+ "base",
53
+ "local-folder",
54
+ ]
55
+
56
+ [tool.ruff.lint.isort.sections]
57
+ "typing" = ["typing", "typing_extensions", "typings"]
58
+ "networking" = [
59
+ "requests",
60
+ "yt_dlp",
61
+ "deemix",
62
+ "wget",
63
+ "flask",
64
+ "beautifulsoup4",
65
+ "pypresence",
66
+ ]
67
+ "data-science" = ["numpy", "scipy", "matplotlib", "tqdm", "pandas", "gradio"]
68
+ "machine-learning" = [
69
+ "torch",
70
+ "torchaudio",
71
+ "torchcrepe",
72
+ "fairseq",
73
+ "faiss",
74
+ "tensorboard",
75
+ "torchfcpe",
76
+ "local_attention",
77
+ "libf0",
78
+ "einops",
79
+ "numba",
80
+ ]
81
+ "audio" = [
82
+ "ffmpeg",
83
+ "soundfile",
84
+ "librosa",
85
+ "sox",
86
+ "pydub",
87
+ "pedalboard",
88
+ "audio_separator",
89
+ "parselmouth",
90
+ "pyworld",
91
+ "noisereduce",
92
+ "audio_upscaler",
93
+ "edge_tts",
94
+ "ffmpy",
95
+ ]
96
+ "vc" = ["vc"]
97
+ "backend" = ["backend"]
98
+ "frontend" = ["frontend"]
99
+ "base" = ["common", "app", "cli", "init"]
100
+
101
+ [tool.ruff.lint.pycodestyle]
102
+ max-doc-length = 72
103
+
104
+ [tool.ruff.lint.pylint]
105
+ # max-args = 10
requirements.txt ADDED
@@ -0,0 +1,55 @@
1
+ # General
2
+ lib==4.0.0
3
+
4
+ # Networking
5
+ requests==2.32.3 #NOTE upgraded from 2.32.0
6
+ yt_dlp==2024.8.6
7
+ #TODO add these later
8
+ # deemix
9
+ # wget
10
+ # flask
11
+ # beautifulsoup4
12
+ # pypresence
13
+
14
+ # Data science
15
+ numpy==1.23.5
16
+ scipy~=1.13.0 # NOTE upgraded from 1.11.1
17
+ matplotlib==3.9.0 #NOTE upgraded from 3.7.2
18
+ tqdm==4.65.0 #NOTE upgraded from unspecified
19
+ gradio==4.43.0
20
+
21
+ # Machine learning
22
+ --find-links https://download.pytorch.org/whl/torch_stable.html
23
+ torch==2.1.1+cu121 # NOTE upgraded from 2.0.1+cu118
24
+ torchaudio==2.1.1+cu121
25
+ torchcrepe==0.0.23 # NOTE upgraded from 0.0.20
26
+ ./dependencies/fairseq-0.12.2-cp311-cp311-linux_x86_64.whl; sys_platform == 'linux'
27
+ ./dependencies/fairseq-0.12.3.1-cp311-cp311-win_amd64.whl; sys_platform == 'win32'
28
+ ./dependencies/diffq-0.2.4-cp311-cp311-win_amd64.whl; sys_platform == 'win32'
29
+ tensorboardX
30
+ #TODO add these later
31
+ # faiss-cpu==1.7.3 # NOTE outcommented due to incompatibility on windows
32
+ # tensorboard
33
+ # torchfcpe
34
+ # local-attention
35
+ # libf0
36
+ # einops
37
+ # numba; sys_platform == 'linux'
38
+ # numba==0.57.0; sys_platform == 'darwin' or sys_platform == 'win32'
39
+
40
+ # Audio
41
+ ffmpeg-python>=0.2.0
42
+ soundfile==0.12.1
43
+ librosa >=0.10 # NOTE upgraded from 0.9.2
44
+ sox==1.5.0
45
+ pydub==0.25.1
46
+ pydub-stubs
47
+ pedalboard==0.9.12
48
+ audio-separator[gpu]==0.18.3
49
+ praat-parselmouth>=0.4.2 # NOTE upgraded from unspecified
50
+ pyworld==0.3.4
51
+ #TODO add these later
52
+ # noisereduce
53
+ # audio_upscaler==0.1.4
54
+ # edge-tts==6.1.9
55
+ # ffmpy==0.3.1
src/app.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ Main application for the Ultimate RVC project.
3
+
4
+ Each tab of the application is defined in a separate module
5
+ in the `frontend/tabs` directory.
6
+
7
+ Components that are accessed across multiple tabs are passed as arguments
8
+ to the render functions in the respective modules.
9
+ """
10
+
11
+ import asyncio
12
+ import os
13
+ from argparse import ArgumentParser
14
+
15
+ import gradio as gr
16
+
17
+ from backend.generate_song_cover import get_named_song_dirs
18
+ from backend.manage_audio import delete_gradio_temp_dir, get_output_audio
19
+ from backend.manage_voice_models import get_current_models
20
+
21
+ from frontend.tabs.manage_audio import render as render_manage_audio_tab
22
+ from frontend.tabs.manage_models import render as render_manage_models_tab
23
+ from frontend.tabs.multi_step_generation import render as render_multi_step_tab
24
+ from frontend.tabs.one_click_generation import render as render_one_click_tab
25
+
26
+ from common import GRADIO_TEMP_DIR
27
+
28
+
29
+ def _init_app() -> tuple[gr.Dropdown, ...]:
30
+ """
31
+ Initialize app by deleting any existing Gradio temp directory
32
+ and updating the choices of all dropdowns.
33
+
34
+ Returns
35
+ -------
36
+ tuple[gr.Dropdown, ...]
37
+ Updated dropdowns for selecting voice models, song directories,
38
+ and output audio files.
39
+ """
40
+ delete_gradio_temp_dir()
41
+ updated_rvc_model_dropdowns = tuple(
42
+ gr.Dropdown(choices=get_current_models()) for _ in range(3)
43
+ )
44
+ updated_song_dir_dropdowns = tuple(
45
+ gr.Dropdown(choices=get_named_song_dirs()) for _ in range(10)
46
+ )
47
+ updated_output_audio_dropdown = (gr.Dropdown(choices=get_output_audio()),)
48
+ return (
49
+ updated_rvc_model_dropdowns
50
+ + updated_song_dir_dropdowns
51
+ + updated_output_audio_dropdown
52
+ )
53
+
54
+
55
+ os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR
56
+
57
+ if os.name == "nt":
58
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
59
+
60
+ with gr.Blocks(title="Ultimate RVC") as app:
61
+
62
+ gr.Label("Ultimate RVC ❤️", show_label=False)
63
+
64
+ dummy_deletion_checkbox = gr.Checkbox(visible=False)
65
+ delete_confirmation = gr.State(False)
66
+ song_dir_dropdowns = [
67
+ gr.Dropdown(
68
+ label="Song directory",
69
+ info=(
70
+ "Directory where intermediate audio files are stored and loaded from"
71
+ " locally. When a new song is retrieved, its directory is chosen by"
72
+ " default."
73
+ ),
74
+ render=False,
75
+ )
76
+ for _ in range(7)
77
+ ]
78
+ cached_input_songs_dropdown_1click, cached_input_songs_dropdown_multi = [
79
+ gr.Dropdown(
80
+ label="Song input",
81
+ info="Select a song from the list of cached songs.",
82
+ visible=False,
83
+ render=False,
84
+ )
85
+ for _ in range(2)
86
+ ]
87
+ intermediate_audio_to_delete = gr.Dropdown(
88
+ label="Songs with intermediate audio files",
89
+ multiselect=True,
90
+ info=(
91
+ "Select one or more songs to delete their asssociated intermediate audio"
92
+ " files."
93
+ ),
94
+ render=False,
95
+ )
96
+ output_audio_to_delete = gr.Dropdown(
97
+ label="Output audio files",
98
+ multiselect=True,
99
+ info="Select one or more output audio files to delete.",
100
+ render=False,
101
+ )
102
+ rvc_model_1click, rvc_model_multi = [
103
+ gr.Dropdown(label="Voice model", render=False) for _ in range(2)
104
+ ]
105
+ rvc_models_to_delete = gr.Dropdown(
106
+ label="Voice models", multiselect=True, render=False
107
+ )
108
+
109
+ generate_buttons = [
110
+ gr.Button(label, variant="primary", render=False, scale=scale)
111
+ for label, scale, in [
112
+ ("Retrieve song", 1),
113
+ ("Separate vocals/instrumentals", 1),
114
+ ("Separate main/backup vocals", 1),
115
+ ("De-reverb vocals", 1),
116
+ ("Convert vocals", 1),
117
+ ("Post-process vocals", 1),
118
+ ("Pitch shift background", 1),
119
+ ("Mix song cover", 1),
120
+ ("Generate", 2),
121
+ ]
122
+ ]
123
+
124
+ # main tab
125
+ with gr.Tab("Generate song covers"):
126
+ render_one_click_tab(
127
+ generate_buttons,
128
+ song_dir_dropdowns,
129
+ cached_input_songs_dropdown_1click,
130
+ cached_input_songs_dropdown_multi,
131
+ rvc_model_1click,
132
+ intermediate_audio_to_delete,
133
+ output_audio_to_delete,
134
+ )
135
+ render_multi_step_tab(
136
+ generate_buttons,
137
+ song_dir_dropdowns,
138
+ cached_input_songs_dropdown_1click,
139
+ cached_input_songs_dropdown_multi,
140
+ rvc_model_multi,
141
+ intermediate_audio_to_delete,
142
+ output_audio_to_delete,
143
+ )
144
+ with gr.Tab("Manage models"):
145
+ render_manage_models_tab(
146
+ dummy_deletion_checkbox,
147
+ delete_confirmation,
148
+ rvc_models_to_delete,
149
+ rvc_model_1click,
150
+ rvc_model_multi,
151
+ )
152
+ with gr.Tab("Manage audio"):
153
+
154
+ render_manage_audio_tab(
155
+ dummy_deletion_checkbox,
156
+ delete_confirmation,
157
+ song_dir_dropdowns,
158
+ cached_input_songs_dropdown_1click,
159
+ cached_input_songs_dropdown_multi,
160
+ intermediate_audio_to_delete,
161
+ output_audio_to_delete,
162
+ )
163
+
164
+ app.load(
165
+ _init_app,
166
+ outputs=[
167
+ rvc_model_1click,
168
+ rvc_model_multi,
169
+ rvc_models_to_delete,
170
+ intermediate_audio_to_delete,
171
+ cached_input_songs_dropdown_1click,
172
+ cached_input_songs_dropdown_multi,
173
+ *song_dir_dropdowns,
174
+ output_audio_to_delete,
175
+ ],
176
+ show_progress="hidden",
177
+ )
178
+
179
+ app.unload(delete_gradio_temp_dir)
180
+
181
+
182
+ if __name__ == "__main__":
183
+
184
+ parser = ArgumentParser(
185
+ description="Generate a song cover song in the song_output/id directory.",
186
+ add_help=True,
187
+ )
188
+ parser.add_argument(
189
+ "--share",
190
+ action="store_true",
191
+ dest="share_enabled",
192
+ default=False,
193
+ help="Enable sharing",
194
+ )
195
+ parser.add_argument(
196
+ "--listen",
197
+ action="store_true",
198
+ default=False,
199
+ help="Make the WebUI reachable from your local network.",
200
+ )
201
+ parser.add_argument(
202
+ "--listen-host", type=str, help="The hostname that the server will use."
203
+ )
204
+ parser.add_argument(
205
+ "--listen-port", type=int, help="The listening port that the server will use."
206
+ )
207
+ args = parser.parse_args()
208
+
209
+ app.queue()
210
+ app.launch(
211
+ share=args.share_enabled,
212
+ server_name=None if not args.listen else (args.listen_host or "0.0.0.0"),
213
+ server_port=args.listen_port,
214
+ )
src/backend/common.py ADDED
@@ -0,0 +1,259 @@
1
+ """Common utility functions for the backend."""
2
+
3
+ from typing import Any
4
+ from typings.extra import StrOrBytesPath
5
+
6
+ import hashlib
7
+ import json
8
+ import os
9
+ import shutil
10
+
11
+ import gradio as gr
12
+
13
+ from backend.exceptions import PathNotFoundError
14
+
15
+ from common import AUDIO_DIR, RVC_MODELS_DIR
16
+
17
+ INTERMEDIATE_AUDIO_DIR = os.path.join(AUDIO_DIR, "intermediate")
18
+ OUTPUT_AUDIO_DIR = os.path.join(AUDIO_DIR, "output")
19
+
20
+
21
+ def display_progress(
22
+ message: str,
23
+ percentage: float | None = None,
24
+ progress_bar: gr.Progress | None = None,
25
+ ) -> None:
26
+ """
27
+ Display progress message and percentage in console or Gradio progress bar.
28
+
29
+ Parameters
30
+ ----------
31
+ message : str
32
+ Message to display.
33
+ percentage : float, optional
34
+ Percentage to display.
35
+ progress_bar : gr.Progress, optional
36
+ The Gradio progress bar to update.
37
+ """
38
+ if progress_bar is None:
39
+ print(message)
40
+ else:
41
+ progress_bar(percentage, desc=message)
42
+
43
+
44
+ def remove_suffix_after(text: str, occurrence: str) -> str:
45
+ """
46
+ Remove the suffix after the last occurrence of a substring in a string.
47
+
48
+ Parameters
49
+ ----------
50
+ text : str
51
+ The string to remove the suffix from.
52
+ occurrence : str
53
+ The substring to remove the suffix after.
54
+
55
+ Returns
56
+ -------
57
+ str
58
+ The string with the suffix removed.
59
+ """
60
+ location = text.rfind(occurrence)
61
+ if location == -1:
62
+ return text
63
+ else:
64
+ return text[: location + len(occurrence)]
65
+
66
+
67
+ def copy_files_to_new_folder(file_paths: list[str], folder_path: str) -> None:
68
+ """
69
+ Copy files to a new folder.
70
+
71
+ Parameters
72
+ ----------
73
+ file_paths : list[str]
74
+ List of file paths to copy.
75
+ folder_path : str
76
+ Path of the folder to copy the files to.
77
+
78
+ Raises
79
+ ------
80
+ PathNotFoundError
81
+ If a file does not exist.
82
+ """
83
+ os.makedirs(folder_path)
84
+ for file_path in file_paths:
85
+ if not os.path.exists(file_path):
86
+ raise PathNotFoundError(f"File not found: {file_path}")
87
+ shutil.copyfile(
88
+ file_path, os.path.join(folder_path, os.path.basename(file_path))
89
+ )
90
+
91
+
92
+ def get_path_stem(path: str) -> str:
93
+ """
94
+ Get the stem of a file path.
95
+
96
+ The stem is the name of the file that the path points to,
97
+ not including its extension.
98
+
99
+ Parameters
100
+ ----------
101
+ path : str
102
+ The file path.
103
+
104
+ Returns
105
+ -------
106
+ str
107
+ The stem of the file path.
108
+ """
109
+ return os.path.splitext(os.path.basename(path))[0]
110
+
111
+
112
+ def json_dumps(thing: Any) -> str:
113
+ """
114
+ Dump a Python object to a JSON string.
115
+
116
+ Parameters
117
+ ----------
118
+ thing : Any
119
+ The object to dump.
120
+
121
+ Returns
122
+ -------
123
+ str
124
+ The JSON string representation of the object.
125
+ """
126
+ return json.dumps(
127
+ thing, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ": ")
128
+ )
129
+
130
+
131
+ def json_dump(thing: Any, path: StrOrBytesPath) -> None:
132
+ """
133
+ Dump a Python object to a JSON file.
134
+
135
+ Parameters
136
+ ----------
137
+ thing : Any
138
+ The object to dump.
139
+ path : str
140
+ The path of the JSON file.
141
+ """
142
+ with open(path, "w", encoding="utf-8") as file:
143
+ json.dump(
144
+ thing,
145
+ file,
146
+ ensure_ascii=False,
147
+ sort_keys=True,
148
+ indent=4,
149
+ separators=(",", ": "),
150
+ )
151
+
152
+
153
+ def json_load(path: StrOrBytesPath, encoding: str = "utf-8") -> Any:
154
+ """
155
+ Load a Python object from a JSON file.
156
+
157
+ Parameters
158
+ ----------
159
+ path : str
160
+ The path of the JSON file.
161
+ encoding : str, default='utf-8'
162
+ The encoding of the file.
163
+
164
+ Returns
165
+ -------
166
+ Any
167
+ The Python object loaded from the JSON file.
168
+ """
169
+ with open(path, encoding=encoding) as file:
170
+ return json.load(file)
171
+
172
+
173
+ def get_hash(thing: Any, size: int = 5) -> str:
174
+ """
175
+ Get a hash of a Python object.
176
+
177
+ Parameters
178
+ ----------
179
+ thing : Any
180
+ The object to hash.
181
+ size : int, default=5
182
+ The size of the hash in bytes.
183
+
184
+ Returns
185
+ -------
186
+ str
187
+ The hash of the object.
188
+ """
189
+ return hashlib.blake2b(
190
+ json_dumps(thing).encode("utf-8"), digest_size=size
191
+ ).hexdigest()
192
+
193
+
194
+ # TODO consider increasing size to 16
195
+ # otherwise we might have problems with hash collisions
196
+ def get_file_hash(filepath: StrOrBytesPath, size: int = 5) -> str:
197
+ """
198
+ Get the hash of a file.
199
+
200
+ Parameters
201
+ ----------
202
+ filepath : str
203
+ The path of the file.
204
+ size : int, default=5
205
+ The size of the hash in bytes.
206
+
207
+ Returns
208
+ -------
209
+ str
210
+ The hash of the file.
211
+ """
212
+ with open(filepath, "rb") as f:
213
+ file_hash = hashlib.file_digest(f, lambda: hashlib.blake2b(digest_size=size))
214
+ return file_hash.hexdigest()
215
+
216
+
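# A minimal sketch of how get_hash and get_file_hash can be combined into the
# cache keys used by the pipeline modules. Assumes Python 3.11+ (for
# hashlib.file_digest) and that src/ is on the import path; "some_song.wav" is
# a hypothetical file.
from backend.common import get_file_hash, get_hash

arg_dict = {
    "input-files": [
        {"name": "some_song.wav", "hash": get_file_hash("some_song.wav")}
    ],
    "pitch-shift": 2,
}
cache_key = get_hash(arg_dict)  # 5-byte blake2b digest of the sorted JSON dump
print(cache_key)  # 10 hex characters, e.g. "3f2a1b09c0"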
217
+ def get_rvc_model(voice_model: str) -> tuple[str, str]:
218
+ """
219
+ Get the RVC model file and optional index file for a voice model.
220
+
221
+ When no index file exists, an empty string is returned.
222
+
223
+ Parameters
224
+ ----------
225
+ voice_model : str
226
+ The name of the voice model.
227
+
228
+ Returns
229
+ -------
230
+ model_path : str
231
+ The path of the RVC model file.
232
+ index_path : str
233
+ The path of the RVC index file.
234
+
235
+ Raises
236
+ ------
237
+ PathNotFoundError
238
+ If the directory of the voice model does not exist or
239
+ if no model file exists in the directory.
240
+ """
241
+ rvc_model_filename, rvc_index_filename = None, None
242
+ model_dir = os.path.join(RVC_MODELS_DIR, voice_model)
243
+ if not os.path.exists(model_dir):
244
+ raise PathNotFoundError(
245
+ f"Voice model directory '{voice_model}' does not exist."
246
+ )
247
+ for file in os.listdir(model_dir):
248
+ ext = os.path.splitext(file)[1]
249
+ if ext == ".pth":
250
+ rvc_model_filename = file
251
+ if ext == ".index":
252
+ rvc_index_filename = file
253
+
254
+ if rvc_model_filename is None:
255
+ raise PathNotFoundError(f"No model file exists in {model_dir}.")
256
+
257
+ return os.path.join(model_dir, rvc_model_filename), (
258
+ os.path.join(model_dir, rvc_index_filename) if rvc_index_filename else ""
259
+ )
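# Hedged usage sketch for get_rvc_model: it returns the .pth file and the
# optional .index file of a voice model stored under models/rvc. "MyVoice" is
# a hypothetical model name.
from backend.common import get_rvc_model

model_path, index_path = get_rvc_model("MyVoice")
if not index_path:
    print(f"{model_path} has no index file; conversion will run without one.")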
src/backend/exceptions.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ This module contains custom exceptions that are raised by the backend.
3
+ """
4
+
5
+
6
+ class InputMissingError(ValueError):
7
+ """
8
+ Raised when an input is missing.
9
+ """
10
+
11
+ pass
12
+
13
+
14
+ class InvalidPathError(OSError):
15
+ """
16
+ Raised when a path is invalid.
17
+ """
18
+
19
+ pass
20
+
21
+
22
+ class PathNotFoundError(OSError):
23
+ """
24
+ Raised when a path is not found.
25
+ """
26
+
27
+ pass
28
+
29
+
30
+ class PathExistsError(OSError):
31
+ """
32
+ Raised when a path already exists.
33
+ """
34
+
35
+ pass
36
+
37
+
38
+ class FileTypeError(ValueError):
39
+ """
40
+ Raised when a file is of the wrong type.
41
+ """
42
+
43
+ pass
src/backend/generate_song_cover.py ADDED
@@ -0,0 +1,1679 @@
1
+ """
2
+ This module contains functions to generate song covers using RVC-based voice models.
3
+ """
4
+
5
+ from typing import Any
6
+ from typings.extra import F0Method, InputAudioExt, InputType, OutputAudioExt
7
+
8
+ import gc
9
+ import glob
10
+ import os
11
+ import shlex
12
+ import shutil
13
+ import subprocess
14
+ from contextlib import suppress
15
+ from logging import WARNING
16
+ from pathlib import Path, PurePath
17
+ from urllib.parse import parse_qs, urlparse
18
+
19
+ import yt_dlp
20
+
21
+ import gradio as gr
22
+
23
+ import soundfile as sf
24
+ import sox
25
+ from audio_separator.separator import Separator
26
+ from pedalboard import Compressor, HighpassFilter, Reverb
27
+ from pedalboard._pedalboard import Pedalboard
28
+ from pedalboard.io import AudioFile
29
+ from pydub import AudioSegment
30
+ from pydub import utils as pydub_utils
31
+
32
+ from vc.rvc import Config, get_vc, load_hubert, rvc_infer
33
+
34
+ from backend.common import (
35
+ INTERMEDIATE_AUDIO_DIR,
36
+ OUTPUT_AUDIO_DIR,
37
+ display_progress,
38
+ get_file_hash,
39
+ get_hash,
40
+ get_path_stem,
41
+ get_rvc_model,
42
+ json_dump,
43
+ json_load,
44
+ )
45
+ from backend.exceptions import InputMissingError, InvalidPathError, PathNotFoundError
46
+
47
+ from common import RVC_MODELS_DIR, SEPARATOR_MODELS_DIR
48
+
49
+ SEPARATOR = Separator(
50
+ log_level=WARNING,
51
+ model_file_dir=SEPARATOR_MODELS_DIR,
52
+ output_dir=INTERMEDIATE_AUDIO_DIR,
53
+ mdx_params={
54
+ "hop_length": 1024,
55
+ "segment_size": 256,
56
+ "overlap": 0.001,
57
+ "batch_size": 1,
58
+ "enable_denoise": False,
59
+ },
60
+ mdxc_params={"segment_size": 256, "batch_size": 1, "overlap": 2},
61
+ )
62
+
63
+
64
+ def _get_youtube_video_id(url: str, ignore_playlist: bool = True) -> str | None:
65
+ """
66
+ Get video id from a YouTube URL.
67
+
68
+ Parameters
69
+ ----------
70
+ url : str
71
+ The YouTube URL.
72
+ ignore_playlist : bool, default=True
73
+ Whether to get id of first video in playlist or the playlist id itself.
74
+
75
+ Returns
76
+ -------
77
+ str | None
78
+ The video id, or None if it cannot be determined from the URL.
79
+ """
80
+ query = urlparse(url)
81
+ if query.hostname == "youtu.be":
82
+ if query.path[1:] == "watch":
83
+ return query.query[2:]
84
+ return query.path[1:]
85
+
86
+ if query.hostname in {"www.youtube.com", "youtube.com", "music.youtube.com"}:
87
+ if not ignore_playlist:
88
+ # use case: get playlist id not current video in playlist
89
+ with suppress(KeyError):
90
+ return parse_qs(query.query)["list"][0]
91
+ if query.path == "/watch":
92
+ return parse_qs(query.query)["v"][0]
93
+ if query.path[:7] == "/watch/":
94
+ return query.path.split("/")[1]
95
+ if query.path[:7] == "/embed/":
96
+ return query.path.split("/")[2]
97
+ if query.path[:3] == "/v/":
98
+ return query.path.split("/")[2]
99
+ return None
100
+
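# Quick behavioural sketch for the URL parser above; all URLs are illustrative.
assert _get_youtube_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert _get_youtube_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert _get_youtube_video_id("https://example.com/watch?v=abc") is None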
101
+
102
+ def _yt_download(link: str, song_dir: str) -> str:
103
+ """
104
+ Download audio from a YouTube link.
105
+
106
+ Parameters
107
+ ----------
108
+ link : str
109
+ The YouTube link.
110
+ song_dir : str
111
+ The directory to save the downloaded audio to.
112
+
113
+ Returns
114
+ -------
115
+ str
116
+ The path to the downloaded audio file.
117
+ """
118
+ outtmpl = os.path.join(song_dir, "0_%(title)s_Original")
119
+ ydl_opts = {
120
+ "quiet": True,
121
+ "no_warnings": True,
122
+ "format": "bestaudio",
123
+ "outtmpl": outtmpl,
124
+ "ignoreerrors": True,
125
+ "nocheckcertificate": True,
126
+ "postprocessors": [
127
+ {
128
+ "key": "FFmpegExtractAudio",
129
+ "preferredcodec": "wav",
130
+ "preferredquality": 0,
131
+ }
132
+ ],
133
+ }
134
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
135
+ result = ydl.extract_info(link, download=True)
136
+ if not result:
137
+ raise PathNotFoundError("No audio found in the provided YouTube link!")
138
+ download_path = ydl.prepare_filename(result, outtmpl=f"{outtmpl}.wav")
139
+
140
+ return download_path
141
+
142
+
143
+ def _get_input_audio_paths() -> list[str]:
144
+ """
145
+ Get the paths of all cached input audio files.
146
+
147
+ Returns
148
+ -------
149
+ list[str]
150
+ The paths of all cached input audio files
151
+ """
152
+ # TODO if we later add .json file for input then we need to exclude those here
153
+ return glob.glob(os.path.join(INTERMEDIATE_AUDIO_DIR, "*", "0_*_Original*"))
154
+
155
+
156
+ def _get_input_audio_path(song_dir: str) -> str | None:
157
+ """
158
+ Get the path of the cached input audio file in a given song directory.
159
+
160
+ Parameters
161
+ ----------
162
+ song_dir : str
163
+ The path to a song directory.
164
+
165
+ Returns
166
+ -------
167
+ str | None
168
+ The path of the cached input audio file, or None if no such file exists.
169
+ """
170
+ # NOTE orig_song_paths should never contain more than one element
171
+ return next(iter(glob.glob(os.path.join(song_dir, "0_*_Original*"))), None)
172
+
173
+
174
+ def _pitch_shift(audio_path: str, output_path: str, n_semi_tones: int) -> None:
175
+ """
176
+ Pitch-shift an audio file.
177
+
178
+ Parameters
179
+ ----------
180
+ audio_path : str
181
+ The path of the audio file to pitch-shift.
182
+ output_path : str
183
+ The path to save the pitch-shifted audio file to.
184
+ n_semi_tones : int
185
+ The number of semi-tones to pitch-shift the audio by.
186
+ """
187
+ y, sr = sf.read(audio_path)
188
+ tfm = sox.Transformer()
189
+ tfm.pitch(n_semi_tones)
190
+ y_shifted = tfm.build_array(input_array=y, sample_rate_in=sr)
191
+ sf.write(output_path, y_shifted, sr)
192
+
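# Minimal sketch of the sox-based pitch shift above; "vocals.wav" is a
# hypothetical input and 12 semitones corresponds to shifting up one octave.
_pitch_shift("vocals.wav", "vocals_up_one_octave.wav", 12)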
193
+
194
+ # TODO consider increasing hash_size to 16
195
+ # otherwise we might have problems with hash collisions
196
+ # when using app as CLI
197
+ def _get_unique_base_path(
198
+ song_dir: str,
199
+ prefix: str,
200
+ arg_dict: dict[str, Any],
201
+ progress_bar: gr.Progress | None = None,
202
+ percentage: float = 0.0,
203
+ hash_size: int = 5,
204
+ ) -> str:
205
+ """
206
+ Get a unique base path for an audio file in a song directory
207
+ by hashing the arguments used to generate the audio.
208
+
209
+ Parameters
210
+ ----------
211
+ song_dir : str
212
+ The path to a song directory.
213
+ prefix : str
214
+ The prefix to use for the base path.
215
+ arg_dict : dict
216
+ The dictionary of arguments used to generate the audio in the given file.
217
+ progress_bar : gr.Progress, optional
218
+ Gradio progress bar to update.
219
+ percentage : float, default=0.0
220
+ Percentage to display in the progress bar.
221
+ hash_size : int, default=5
222
+ The size (in bytes) of the hash to use for the base path.
223
+
224
+ Returns
225
+ -------
226
+ str
227
+ The unique base path for the audio file.
228
+ """
229
+ dict_hash = get_hash(arg_dict, size=hash_size)
230
+ while True:
231
+ base_path = os.path.join(song_dir, f"{prefix}_{dict_hash}")
232
+ json_path = f"{base_path}.json"
233
+ if os.path.exists(json_path):
234
+ file_dict = json_load(json_path)
235
+ if file_dict == arg_dict:
236
+ return base_path
237
+ display_progress("[~] Rehashing...", percentage, progress_bar)
238
+ dict_hash = get_hash(dict_hash, size=hash_size)
239
+ else:
240
+ return base_path
241
+
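# Sketch of the caching convention built on _get_unique_base_path: each stage
# writes its output to <base>.wav and the arguments that produced it to
# <base>.json, so a later call with identical arguments can reuse the file.
# The directory, file name and hash below are hypothetical.
arg_dict = {
    "input-files": [{"name": "1_Vocals.wav", "hash": "ab12cd34ef"}],
    "pitch-shift": 0,
}
base = _get_unique_base_path(
    "audio/intermediate/some_song", "6_Instrumental_Shifted", arg_dict
)
wav_path, json_path = f"{base}.wav", f"{base}.json"
if not (os.path.exists(wav_path) and os.path.exists(json_path)):
    ...  # generate wav_path here, then call json_dump(arg_dict, json_path)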
242
+
243
+ def _convert_voice(
244
+ voice_model: str,
245
+ voice_path: str,
246
+ output_path: str,
247
+ pitch_change: int,
248
+ f0_method: F0Method,
249
+ index_rate: float,
250
+ filter_radius: int,
251
+ rms_mix_rate: float,
252
+ protect: float,
253
+ crepe_hop_length: int,
254
+ output_sr: int,
255
+ ) -> None:
256
+ """
257
+ Convert a voice track using a voice model.
258
+
259
+ Parameters
260
+ ----------
261
+ voice_model : str
262
+ The name of the voice model to use.
263
+ voice_path : str
264
+ The path to the voice track to convert.
265
+ output_path : str
266
+ The path to save the converted voice to.
267
+ pitch_change : int
268
+ The number of semi-tones to pitch-shift the converted voice by.
269
+ f0_method : F0Method
270
+ The method to use for pitch extraction.
271
+ index_rate : float
272
+ The influence of index file on voice conversion.
273
+ filter_radius : int
274
+ The filter radius to use for the voice conversion.
275
+ rms_mix_rate : float
276
+ The blending rate of the volume envelope of converted voice.
277
+ protect : float
278
+ The protection rate for consonants and breathing sounds.
279
+ crepe_hop_length : int
280
+ The hop length to use for Crepe pitch extraction method.
281
+ output_sr : int
282
+ The sample rate to use for the output audio.
283
+ """
284
+ rvc_model_path, rvc_index_path = get_rvc_model(voice_model)
285
+ device = "cuda:0"
286
+ config = Config(device, True)
287
+ hubert_model = load_hubert(
288
+ device, config.is_half, os.path.join(RVC_MODELS_DIR, "hubert_base.pt")
289
+ )
290
+ cpt, version, net_g, tgt_sr, vc = get_vc(
291
+ device, config.is_half, config, rvc_model_path
292
+ )
293
+
294
+ # convert main vocals
295
+ rvc_infer(
296
+ rvc_index_path,
297
+ index_rate,
298
+ voice_path,
299
+ output_path,
300
+ pitch_change,
301
+ f0_method,
302
+ cpt,
303
+ version,
304
+ net_g,
305
+ filter_radius,
306
+ tgt_sr,
307
+ rms_mix_rate,
308
+ protect,
309
+ crepe_hop_length,
310
+ vc,
311
+ hubert_model,
312
+ output_sr,
313
+ )
314
+ del hubert_model, cpt
315
+ gc.collect()
316
+
317
+
318
+ def _add_audio_effects(
319
+ audio_path: str,
320
+ output_path: str,
321
+ reverb_rm_size: float,
322
+ reverb_wet: float,
323
+ reverb_dry: float,
324
+ reverb_damping: float,
325
+ ) -> None:
326
+ """
327
+ Add high-pass filter, compressor and reverb effects to an audio file.
328
+
329
+ Parameters
330
+ ----------
331
+ audio_path : str
332
+ The path of the audio file to add effects to.
333
+ output_path : str
334
+ The path to save the effected audio file to.
335
+ reverb_rm_size : float
336
+ The room size of the reverb effect.
337
+ reverb_wet : float
338
+ The wet level of the reverb effect.
339
+ reverb_dry : float
340
+ The dry level of the reverb effect.
341
+ reverb_damping : float
342
+ The damping of the reverb effect.
343
+ """
344
+ board = Pedalboard(
345
+ [
346
+ HighpassFilter(),
347
+ Compressor(ratio=4, threshold_db=-15),
348
+ Reverb(
349
+ room_size=reverb_rm_size,
350
+ dry_level=reverb_dry,
351
+ wet_level=reverb_wet,
352
+ damping=reverb_damping,
353
+ ),
354
+ ]
355
+ )
356
+
357
+ with AudioFile(audio_path) as f:
358
+ with AudioFile(output_path, "w", f.samplerate, f.num_channels) as o:
359
+ # Read one second of audio at a time, until the file is empty:
360
+ while f.tell() < f.frames:
361
+ chunk = f.read(int(f.samplerate))
362
+ effected = board(chunk, f.samplerate, reset=False)
363
+ o.write(effected)
364
+
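# Hedged example of the effect chain above, using the same defaults that
# postprocess_vocals passes further down; "vocals.wav" is a hypothetical input.
_add_audio_effects(
    "vocals.wav",
    "vocals_postprocessed.wav",
    reverb_rm_size=0.15,
    reverb_wet=0.2,
    reverb_dry=0.8,
    reverb_damping=0.7,
)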
365
+
366
+ def _map_audio_ext(input_audio_ext: InputAudioExt) -> OutputAudioExt:
367
+ """
368
+ Map an input audio extension to an output audio extension.
369
+
370
+ Parameters
371
+ ----------
372
+ input_audio_ext : InputAudioExt
373
+ The input audio extension.
374
+
375
+ Returns
376
+ -------
377
+ OutputAudioExt
378
+ The output audio extension.
379
+ """
380
+ match input_audio_ext:
381
+ case "m4a":
382
+ return "ipod"
383
+ case "aac":
384
+ return "adts"
385
+ case _:
386
+ return input_audio_ext
387
+
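# pydub/ffmpeg use container names that differ from the user-facing extension
# for two formats; a quick check of the mapping above:
assert _map_audio_ext("m4a") == "ipod"
assert _map_audio_ext("aac") == "adts"
assert _map_audio_ext("mp3") == "mp3"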
388
+
389
+ def _mix_audio(
390
+ main_vocal_path: str,
391
+ backup_vocal_path: str,
392
+ instrumental_path: str,
393
+ main_gain: int,
394
+ backup_gain: int,
395
+ inst_gain: int,
396
+ output_format: InputAudioExt,
397
+ output_sr: int,
398
+ output_path: str,
399
+ ) -> None:
400
+ """
401
+ Mix main vocals, backup vocals and instrumentals.
402
+
403
+ Parameters
404
+ ----------
405
+ main_vocal_path : str
406
+ The path of an audio file containing main vocals.
407
+ backup_vocal_path : str
408
+ The path of an audio file containing backup vocals.
409
+ instrumental_path : str
410
+ The path of an audio file containing instrumentals.
411
+ main_gain : int
412
+ The gain to apply to the main vocals.
413
+ backup_gain : int
414
+ The gain to apply to the backup vocals.
415
+ inst_gain : int
416
+ The gain to apply to the instrumental.
417
+ output_format : InputAudioExt
418
+ The format to save the mixed audio file in.
419
+ output_sr : int
420
+ The sample rate to use for the mixed audio file.
421
+ output_path : str
422
+ The path to save the mixed audio file to.
423
+ """
424
+ main_vocal_audio = AudioSegment.from_wav(main_vocal_path) + main_gain
425
+ backup_vocal_audio = AudioSegment.from_wav(backup_vocal_path) + backup_gain
426
+ instrumental_audio = AudioSegment.from_wav(instrumental_path) + inst_gain
427
+ combined_audio = main_vocal_audio.overlay(backup_vocal_audio).overlay(
428
+ instrumental_audio
429
+ )
430
+ combined_audio_resampled = combined_audio.set_frame_rate(output_sr)
431
+ mapped_output_format = _map_audio_ext(output_format)
432
+ combined_audio_resampled.export(output_path, format=mapped_output_format)
433
+
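# Minimal sketch of the mixdown step; all paths are hypothetical and the gains
# are decibel offsets applied via pydub's AudioSegment addition operator.
_mix_audio(
    "main_vocals.wav",
    "backup_vocals.wav",
    "instrumental.wav",
    main_gain=0,
    backup_gain=-3,
    inst_gain=0,
    output_format="mp3",
    output_sr=44100,
    output_path="mixdown.mp3",
)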
434
+
435
+ def get_named_song_dirs() -> list[tuple[str, str]]:
436
+ """
437
+ Get the names and paths of all song directories.
438
+
439
+ Returns
440
+ -------
441
+ list[tuple[str, str]]
442
+ A list of tuples containing the name and path of each song directory.
443
+ """
444
+ input_paths = _get_input_audio_paths()
445
+ named_song_dirs: list[tuple[str, str]] = []
446
+
447
+ for path in input_paths:
448
+ song_dir, song_basename = os.path.split(path)
449
+ song_name = (
450
+ os.path.splitext(song_basename)[0]
451
+ .removeprefix("0_")
452
+ .removesuffix("_Original")
453
+ )
454
+ named_song_dirs.append((song_name, song_dir))
455
+ return sorted(named_song_dirs, key=lambda x: x[0])
456
+
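# Illustrative use: list every cached song and its directory, e.g. to populate
# a dropdown in the frontend (output values depend on what has been cached).
for name, directory in get_named_song_dirs():
    print(f"{name} -> {directory}")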
457
+
458
+ def convert_to_stereo(
459
+ song_path: str,
460
+ song_dir: str,
461
+ progress_bar: gr.Progress | None = None,
462
+ percentage: float = 0.0,
463
+ ) -> str:
464
+ """
465
+ Convert an audio file to stereo.
466
+
467
+ Parameters
468
+ ----------
469
+ song_path : str
470
+ The path to the audio file to convert.
471
+ song_dir : str
472
+ The path to the directory where the stereo audio file will be saved.
473
+ progress_bar : gr.Progress, optional
474
+ Gradio progress bar to update.
475
+ percentage : float, default=0.0
476
+ Percentage to display in the progress bar.
477
+
478
+ Returns
479
+ -------
480
+ str
481
+ The path to the stereo audio file.
482
+
483
+ Raises
484
+ ------
485
+ InputMissingError
486
+ If no audio file or song directory path is provided.
487
+ PathNotFoundError
488
+ If the provided audio file or song directory path does not point
489
+ to an existing file or directory.
490
+ """
491
+ if not song_path:
492
+ raise InputMissingError("Input song missing!")
493
+ if not os.path.isfile(song_path):
494
+ raise PathNotFoundError("Input song does not exist!")
495
+ if not song_dir:
496
+ raise InputMissingError("Song directory missing!")
497
+ if not os.path.isdir(song_dir):
498
+ raise PathNotFoundError("Song directory does not exist!")
499
+
500
+ stereo_path = song_path
501
+
502
+ song_info = pydub_utils.mediainfo(song_path)
503
+ if song_info["channels"] == "1":
504
+ arg_dict = {
505
+ "input-files": [
506
+ {"name": os.path.basename(song_path), "hash": get_file_hash(song_path)}
507
+ ],
508
+ }
509
+ stereo_path_base = _get_unique_base_path(
510
+ song_dir, "0_Stereo", arg_dict, progress_bar, percentage
511
+ )
512
+ stereo_path = f"{stereo_path_base}.wav"
513
+ stereo_json_path = f"{stereo_path_base}.json"
514
+ if not (os.path.exists(stereo_path) and os.path.exists(stereo_json_path)):
515
+ display_progress(
516
+ "[~] Converting song to stereo...", percentage, progress_bar
517
+ )
518
+ command = shlex.split(
519
+ f'ffmpeg -y -loglevel error -i "{song_path}" -ac 2 -f wav'
520
+ f' "{stereo_path}"'
521
+ )
522
+ subprocess.run(command)
523
+ json_dump(arg_dict, stereo_json_path)
524
+
525
+ return stereo_path
526
+
527
+
528
+ def _make_song_dir(
529
+ song_input: str, progress_bar: gr.Progress | None = None, percentage: float = 0.0
530
+ ) -> tuple[str, InputType]:
531
+ """
532
+ Create a song directory for a given song input.
533
+
534
+ * If the song input is a YouTube URL,
535
+ the song directory will be named after the video id.
536
+ * If the song input is a local audio file,
537
+ the song directory will be named after the file hash.
538
+ * If the song input is a song directory,
539
+ the song directory will be used as is.
540
+
541
+ Parameters
542
+ ----------
543
+ song_input : str
544
+ The song input to create a directory for.
545
+ progress_bar : gr.Progress, optional
546
+ Gradio progress bar to update.
547
+ percentage : float, default=0.0
548
+ Percentage to display in the progress bar.
549
+
550
+ Returns
551
+ -------
552
+ song_dir : str
553
+ The path to the created song directory.
554
+ input_type : InputType
555
+ The type of input provided.
556
+
557
+ Raises
558
+ ------
559
+ InputMissingError
560
+ If no song input is provided.
561
+ InvalidPathError
562
+ If the provided YouTube URL is invalid or if the provided song directory
563
+ is not located in the root of the intermediate audio directory.
564
+ PathNotFoundError
565
+ If the provided song input is neither a valid HTTPS-based URL
566
+ nor the path of an existing song directory or audio file.
567
+ """
568
+ # if song directory
569
+ if os.path.isdir(song_input):
570
+ if not PurePath(song_input).parent == PurePath(INTERMEDIATE_AUDIO_DIR):
571
+ raise InvalidPathError(
572
+ "Song directory not located in the root of the intermediate audio"
573
+ " directory."
574
+ )
575
+ display_progress(
576
+ "[~] Using existing song directory...", percentage, progress_bar
577
+ )
578
+ input_type = "local"
579
+ return song_input, input_type
580
+
581
+ display_progress("[~] Creating song directory...", percentage, progress_bar)
582
+ # if youtube url
583
+ if urlparse(song_input).scheme == "https":
584
+ input_type = "yt"
585
+ song_id = _get_youtube_video_id(song_input)
586
+ if song_id is None:
587
+ raise InvalidPathError("Invalid YouTube URL!")
588
+ # if local audio file
589
+ elif os.path.isfile(song_input):
590
+ input_type = "local"
591
+ song_id = get_file_hash(song_input)
592
+ else:
593
+ raise PathNotFoundError(f"Song input {song_input} does not exist.")
594
+
595
+ song_dir = os.path.join(INTERMEDIATE_AUDIO_DIR, song_id)
596
+
597
+ Path(song_dir).mkdir(parents=True, exist_ok=True)
598
+
599
+ return song_dir, input_type
600
+
601
+
602
+ def retrieve_song(
603
+ song_input: str,
604
+ progress_bar: gr.Progress | None = None,
605
+ percentages: tuple[float, float, float] = (0, 0.33, 0.67),
606
+ ) -> tuple[str, str]:
607
+ """
608
+ Retrieve a song from a YouTube URL, local audio file or a song directory.
609
+
610
+ Parameters
611
+ ----------
612
+ song_input : str
613
+ A YouTube URL, the path of a local audio file
614
+ or the path of a song directory.
615
+ progress_bar : gr.Progress, optional
616
+ Gradio progress bar to update.
617
+ percentages : tuple[float,float,float], default=(0, 0.33, 0.67)
618
+ Percentages to display in the progress bar.
619
+
620
+ Returns
621
+ -------
622
+ song_path : str
623
+ The path to the retrieved audio file
624
+ song_dir : str
625
+ The path to the song directory containing it.
626
+
627
+ Raises
628
+ ------
629
+ InputMissingError
630
+ If no song input is provided.
631
+ InvalidPathError
632
+ If the provided Youtube URL is invalid or if the provided song directory
633
+ is not located in the root of the intermediate audio directory.
634
+ PathNotFoundError
635
+ If the provided song input is neither a valid HTTPS-based URL
636
+ nor the path of an existing song directory or audio file.
637
+ """
638
+ if not song_input:
639
+ raise InputMissingError(
640
+ "Song input missing! Please provide a valid YouTube url, local audio file"
641
+ " path or cached song directory path."
642
+ )
643
+
644
+ song_dir, input_type = _make_song_dir(song_input, progress_bar, percentages[0])
645
+ orig_song_path = _get_input_audio_path(song_dir)
646
+
647
+ if not orig_song_path:
648
+ if input_type == "yt":
649
+ display_progress("[~] Downloading song...", percentages[1], progress_bar)
650
+ song_link = song_input.split("&")[0]
651
+ orig_song_path = _yt_download(song_link, song_dir)
652
+ else:
653
+ display_progress("[~] Copying song...", percentages[1], progress_bar)
654
+ song_input_base = os.path.basename(song_input)
655
+ song_input_name, song_input_ext = os.path.splitext(song_input_base)
656
+ orig_song_name = f"0_{song_input_name}_Original"
657
+ orig_song_path = os.path.join(song_dir, orig_song_name + song_input_ext)
658
+ shutil.copyfile(song_input, orig_song_path)
659
+
660
+ stereo_path = convert_to_stereo(
661
+ orig_song_path, song_dir, progress_bar, percentages[2]
662
+ )
663
+ return stereo_path, song_dir
664
+
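# Sketch of the first pipeline stage: fetch (or reuse) the input audio and get
# back both the stereo wav and the song directory that caches later stages.
# The URL is illustrative.
song_path, song_dir = retrieve_song("https://www.youtube.com/watch?v=dQw4w9WgXcQ")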
665
+
666
+ def separate_vocals(
667
+ song_path: str,
668
+ song_dir: str,
669
+ stereofy: bool = True,
670
+ progress_bar: gr.Progress | None = None,
671
+ percentages: tuple[float, float] = (0.0, 0.5),
672
+ ) -> tuple[str, str]:
673
+ """
674
+ Separate a song into vocals and instrumentals.
675
+
676
+ Parameters
677
+ ----------
678
+ song_path : str
679
+ The path to the song to separate.
680
+ song_dir : str
681
+ The path to the song directory where the
682
+ separated vocals and instrumentals will be saved.
683
+ stereofy : bool, default=True
684
+ Whether to convert the song to stereo
685
+ before separating its vocals and instrumentals.
686
+ progress_bar : gr.Progress, optional
687
+ Gradio progress bar to update.
688
+ percentages : tuple[float,float], default=(0.0, 0.5)
689
+ Percentages to display in the progress bar.
690
+
691
+ Returns
692
+ -------
693
+ vocals_path : str
694
+ The path to the separated vocals.
695
+ instrumentals_path : str
696
+ The path to the separated instrumentals.
697
+
698
+ Raises
699
+ ------
700
+ InputMissingError
701
+ If no song path or song directory path is provided.
702
+ PathNotFoundError
703
+ If the provided song path or song directory path does not point
704
+ to an existing file or directory.
705
+ """
706
+ if not song_path:
707
+ raise InputMissingError("Input song missing!")
708
+ if not os.path.isfile(song_path):
709
+ raise PathNotFoundError("Input song does not exist!")
710
+ if not song_dir:
711
+ raise InputMissingError("Song directory missing!")
712
+ if not os.path.isdir(song_dir):
713
+ raise PathNotFoundError("Song directory does not exist!")
714
+
715
+ song_path = (
716
+ convert_to_stereo(song_path, song_dir, progress_bar, percentages[0])
717
+ if stereofy
718
+ else song_path
719
+ )
720
+
721
+ arg_dict = {
722
+ "input-files": [
723
+ {"name": os.path.basename(song_path), "hash": get_file_hash(song_path)}
724
+ ],
725
+ }
726
+
727
+ vocals_path_base = _get_unique_base_path(
728
+ song_dir, "1_Vocals", arg_dict, progress_bar, percentages[1]
729
+ )
730
+
731
+ instrumentals_path_base = _get_unique_base_path(
732
+ song_dir, "1_Instrumental", arg_dict, progress_bar, percentages[1]
733
+ )
734
+
735
+ vocals_path = f"{vocals_path_base}.wav"
736
+ vocals_json_path = f"{vocals_path_base}.json"
737
+ instrumentals_path = f"{instrumentals_path_base}.wav"
738
+ instrumentals_json_path = f"{instrumentals_path_base}.json"
739
+
740
+ if not (
741
+ os.path.exists(vocals_path)
742
+ and os.path.exists(vocals_json_path)
743
+ and os.path.exists(instrumentals_path)
744
+ and os.path.exists(instrumentals_json_path)
745
+ ):
746
+ display_progress(
747
+ "[~] Separating vocals from instrumentals...", percentages[1], progress_bar
748
+ )
749
+ SEPARATOR.arch_specific_params["MDX"]["segment_size"] = 512
750
+ SEPARATOR.load_model("UVR-MDX-NET-Voc_FT.onnx")
751
+ temp_instrumentals_name, temp_vocals_name = SEPARATOR.separate(song_path)
752
+ shutil.move(
753
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_instrumentals_name),
754
+ instrumentals_path,
755
+ )
756
+ shutil.move(os.path.join(INTERMEDIATE_AUDIO_DIR, temp_vocals_name), vocals_path)
757
+ json_dump(arg_dict, vocals_json_path)
758
+ json_dump(arg_dict, instrumentals_json_path)
759
+ return vocals_path, instrumentals_path
760
+
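# Continuing the sketch above: split the retrieved song into a vocals track and
# an instrumentals track, with the results cached inside song_dir.
vocals_path, instrumentals_path = separate_vocals(song_path, song_dir)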
761
+
762
+ def separate_main_vocals(
763
+ vocals_path: str,
764
+ song_dir: str,
765
+ stereofy: bool = True,
766
+ progress_bar: gr.Progress | None = None,
767
+ percentages: tuple[float, float] = (0.0, 0.5),
768
+ ) -> tuple[str, str]:
769
+ """
770
+ Separate a vocals track into main vocals and backup vocals.
771
+
772
+ Parameters
773
+ ----------
774
+ vocals_path : str
775
+ The path to the vocals track to separate.
776
+ song_dir : str
777
+ The path to the directory where the separated main vocals
778
+ and backup vocals will be saved.
779
+ stereofy : bool, default=True
780
+ Whether to convert the vocals track to stereo
781
+ before separating its main vocals and backup vocals.
782
+ progress_bar : gr.Progress, optional
783
+ Gradio progress bar to update.
784
+ percentages : tuple[float,float], default=(0.0, 0.5)
785
+ Percentages to display in the progress bar.
786
+
787
+ Returns
788
+ -------
789
+ main_vocals_path : str
790
+ The path to the separated main vocals.
791
+ backup_vocals_path : str
792
+ The path to the separated backup vocals.
793
+
794
+ Raises
795
+ ------
796
+ InputMissingError
797
+ If no vocals track path or song directory path is provided.
798
+ PathNotFoundError
799
+ If the provided vocals path or song directory path does not point
800
+ to an existing file or directory.
801
+ """
802
+ if not vocals_path:
803
+ raise InputMissingError("Vocals missing!")
804
+ if not os.path.isfile(vocals_path):
805
+ raise PathNotFoundError("Vocals do not exist!")
806
+ if not song_dir:
807
+ raise InputMissingError("Song directory missing!")
808
+ if not os.path.isdir(song_dir):
809
+ raise PathNotFoundError("Song directory does not exist!")
810
+
811
+ vocals_path = (
812
+ convert_to_stereo(vocals_path, song_dir, progress_bar, percentages[0])
813
+ if stereofy
814
+ else vocals_path
815
+ )
816
+
817
+ arg_dict = {
818
+ "input-files": [
819
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
820
+ ],
821
+ }
822
+
823
+ main_vocals_path_base = _get_unique_base_path(
824
+ song_dir, "2_Vocals_Main", arg_dict, progress_bar, percentages[1]
825
+ )
826
+
827
+ backup_vocals_path_base = _get_unique_base_path(
828
+ song_dir, "2_Vocals_Backup", arg_dict, progress_bar, percentages[1]
829
+ )
830
+
831
+ main_vocals_path = f"{main_vocals_path_base}.wav"
832
+ main_vocals_json_path = f"{main_vocals_path_base}.json"
833
+ backup_vocals_path = f"{backup_vocals_path_base}.wav"
834
+ backup_vocals_json_path = f"{backup_vocals_path_base}.json"
835
+
836
+ if not (
837
+ os.path.exists(main_vocals_path)
838
+ and os.path.exists(main_vocals_json_path)
839
+ and os.path.exists(backup_vocals_path)
840
+ and os.path.exists(backup_vocals_json_path)
841
+ ):
842
+ display_progress(
843
+ "[~] Separating main vocals from backup vocals...",
844
+ percentages[1],
845
+ progress_bar,
846
+ )
847
+ SEPARATOR.arch_specific_params["MDX"]["segment_size"] = 512
848
+ SEPARATOR.load_model("UVR_MDXNET_KARA_2.onnx")
849
+ temp_main_vocals_name, temp_backup_vocals_name = SEPARATOR.separate(vocals_path)
850
+ shutil.move(
851
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_main_vocals_name),
852
+ main_vocals_path,
853
+ )
854
+ shutil.move(
855
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_backup_vocals_name),
856
+ backup_vocals_path,
857
+ )
858
+ json_dump(arg_dict, main_vocals_json_path)
859
+ json_dump(arg_dict, backup_vocals_json_path)
860
+ return main_vocals_path, backup_vocals_path
861
+
862
+
863
+ def dereverb_vocals(
864
+ vocals_path: str,
865
+ song_dir: str,
866
+ stereofy: bool = True,
867
+ progress_bar: gr.Progress | None = None,
868
+ percentages: tuple[float, float] = (0.0, 0.5),
869
+ ) -> tuple[str, str]:
870
+ """
871
+ De-reverb a vocals track.
872
+
873
+ Parameters
874
+ ----------
875
+ vocals_path : str
876
+ The path to the vocals track to de-reverb.
877
+ song_dir : str
878
+ The path to the directory where the de-reverbed vocals will be saved.
879
+ stereofy : bool, default=True
880
+ Whether to convert the vocals track to stereo before de-reverbing it.
881
+ progress_bar : gr.Progress, optional
882
+ Gradio progress bar to update.
883
+ percentages : tuple[float,float], default=(0.0, 0.5)
884
+ Percentages to display in the progress bar.
885
+
886
+ Returns
887
+ -------
888
+ vocals_dereverb_path : str
889
+ The path to the de-reverbed vocals.
890
+ vocals_reverb_path : str
891
+ The path to the reverb of the vocals.
892
+
893
+ Raises
894
+ ------
895
+ InputMissingError
896
+ If no vocals track path or song directory path is provided.
897
+ PathNotFoundError
898
+ If the provided vocals path or song directory path does not point
899
+ to an existing file or directory.
900
+ """
901
+ if not vocals_path:
902
+ raise InputMissingError("Vocals missing!")
903
+ if not os.path.isfile(vocals_path):
904
+ raise PathNotFoundError("Vocals do not exist!")
905
+ if not song_dir:
906
+ raise InputMissingError("Song directory missing!")
907
+ if not os.path.isdir(song_dir):
908
+ raise PathNotFoundError("Song directory does not exist!")
909
+
910
+ vocals_path = (
911
+ convert_to_stereo(vocals_path, song_dir, progress_bar, percentages[0])
912
+ if stereofy
913
+ else vocals_path
914
+ )
915
+
916
+ arg_dict = {
917
+ "input-files": [
918
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
919
+ ],
920
+ }
921
+
922
+ vocals_dereverb_path_base = _get_unique_base_path(
923
+ song_dir, "3_Vocals_DeReverb", arg_dict, progress_bar, percentages[1]
924
+ )
925
+ vocals_reverb_path_base = _get_unique_base_path(
926
+ song_dir, "3_Vocals_Reverb", arg_dict, progress_bar, percentages[1]
927
+ )
928
+
929
+ vocals_dereverb_path = f"{vocals_dereverb_path_base}.wav"
930
+ vocals_dereverb_json_path = f"{vocals_dereverb_path_base}.json"
931
+
932
+ vocals_reverb_path = f"{vocals_reverb_path_base}.wav"
933
+ vocals_reverb_json_path = f"{vocals_reverb_path_base}.json"
934
+
935
+ if not (
936
+ os.path.exists(vocals_dereverb_path)
937
+ and os.path.exists(vocals_dereverb_json_path)
938
+ and os.path.exists(vocals_reverb_path)
939
+ and os.path.exists(vocals_reverb_json_path)
940
+ ):
941
+ display_progress("[~] De-reverbing vocals...", percentages[1], progress_bar)
942
+ SEPARATOR.arch_specific_params["MDX"]["segment_size"] = 256
943
+ SEPARATOR.load_model("Reverb_HQ_By_FoxJoy.onnx")
944
+ temp_vocals_dereverb_name, temp_vocals_reverb_name = SEPARATOR.separate(
945
+ vocals_path
946
+ )
947
+ shutil.move(
948
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_vocals_dereverb_name),
949
+ vocals_dereverb_path,
950
+ )
951
+ shutil.move(
952
+ os.path.join(INTERMEDIATE_AUDIO_DIR, temp_vocals_reverb_name),
953
+ vocals_reverb_path,
954
+ )
955
+ json_dump(arg_dict, vocals_dereverb_json_path)
956
+ json_dump(arg_dict, vocals_reverb_json_path)
957
+ return vocals_dereverb_path, vocals_reverb_path
958
+
959
+
960
+ def convert_vocals(
961
+ vocals_path: str,
962
+ song_dir: str,
963
+ voice_model: str,
964
+ pitch_change_octaves: int = 0,
965
+ pitch_change_semi_tones: int = 0,
966
+ index_rate: float = 0.5,
967
+ filter_radius: int = 3,
968
+ rms_mix_rate: float = 0.25,
969
+ protect: float = 0.33,
970
+ f0_method: F0Method = "rmvpe",
971
+ crepe_hop_length: int = 128,
972
+ progress_bar: gr.Progress | None = None,
973
+ percentage: float = 0.0,
974
+ ) -> str:
975
+ """
976
+ Convert a vocals track using a voice model.
977
+
978
+ Parameters
979
+ ----------
980
+ vocals_path : str
981
+ The path to the vocals track to convert.
982
+ song_dir : str
983
+ The path to the directory where the converted vocals will be saved.
984
+ voice_model : str
985
+ The name of the voice model to use.
986
+ pitch_change_octaves : int, default=0
987
+ The number of octaves to pitch-shift the converted vocals by.
988
+ pitch_change_semi_tones : int, default=0
989
+ The number of semi-tones to pitch-shift the converted vocals by.
990
+ index_rate : float, default=0.5
991
+ The influence of the index file on the vocal conversion.
992
+ filter_radius : int, default=3
993
+ The filter radius to use for the vocal conversion.
994
+ rms_mix_rate : float, default=0.25
995
+ The blending rate of the volume envelope of the converted vocals.
996
+ protect : float, default=0.33
997
+ The protection rate for consonants and breathing sounds.
998
+ f0_method : F0Method, default="rmvpe"
999
+ The method to use for pitch extraction.
1000
+ crepe_hop_length : int, default=128
1001
+ The hop length to use for crepe-based pitch extraction.
1002
+ progress_bar : gr.Progress, optional
1003
+ Gradio progress bar to update.
1004
+ percentage : float, default=0.0
1005
+ Percentage to display in the progress bar.
1006
+
1007
+ Returns
1008
+ -------
1009
+ str
1010
+ The path to the converted vocals.
1011
+
1012
+ Raises
1013
+ ------
1014
+ InputMissingError
1015
+ If no vocals track path, song directory path or voice model name is provided.
1016
+ PathNotFoundError
1017
+ If the provided vocals path, song directory path or voice model name
1018
+ does not point to an existing file or directory.
1019
+ """
1020
+ if not vocals_path:
1021
+ raise InputMissingError("Vocals missing!")
1022
+ if not os.path.isfile(vocals_path):
1023
+ raise PathNotFoundError("Vocals do not exist!")
1024
+ if not song_dir:
1025
+ raise InputMissingError("Song directory missing!")
1026
+ if not os.path.isdir(song_dir):
1027
+ raise PathNotFoundError("song directory does not exist!")
1028
+ if not voice_model:
1029
+ raise InputMissingError("Voice model missing!")
1030
+ if not os.path.isdir(os.path.join(RVC_MODELS_DIR, voice_model)):
1031
+ raise PathNotFoundError("Voice model does not exist!")
1032
+
1033
+ pitch_change = pitch_change_octaves * 12 + pitch_change_semi_tones
1034
+ hop_length_suffix = "" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"
1035
+ arg_dict = {
1036
+ "input-files": [
1037
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
1038
+ ],
1039
+ "voice-model": voice_model,
1040
+ "pitch-shift": pitch_change,
1041
+ "index-rate": index_rate,
1042
+ "filter-radius": filter_radius,
1043
+ "rms-mix-rate": rms_mix_rate,
1044
+ "protect": protect,
1045
+ "f0-method": f"{f0_method}{hop_length_suffix}",
1046
+ }
1047
+
1048
+ converted_vocals_path_base = _get_unique_base_path(
1049
+ song_dir, "4_Vocals_Converted", arg_dict, progress_bar, percentage
1050
+ )
1051
+ converted_vocals_path = f"{converted_vocals_path_base}.wav"
1052
+ converted_vocals_json_path = f"{converted_vocals_path_base}.json"
1053
+
1054
+ if not (
1055
+ os.path.exists(converted_vocals_path)
1056
+ and os.path.exists(converted_vocals_json_path)
1057
+ ):
1058
+ display_progress("[~] Converting vocals using RVC...", percentage, progress_bar)
1059
+ _convert_voice(
1060
+ voice_model,
1061
+ vocals_path,
1062
+ converted_vocals_path,
1063
+ pitch_change,
1064
+ f0_method,
1065
+ index_rate,
1066
+ filter_radius,
1067
+ rms_mix_rate,
1068
+ protect,
1069
+ crepe_hop_length,
1070
+ 44100,
1071
+ )
1072
+ json_dump(arg_dict, converted_vocals_json_path)
1073
+ return converted_vocals_path
1074
+
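# Continuing the sketch: run the RVC conversion step. "MyVoice" is a
# hypothetical model under models/rvc; the octave and semitone arguments are
# combined internally as octaves * 12 + semitones.
converted_path = convert_vocals(
    vocals_path,
    song_dir,
    "MyVoice",
    pitch_change_octaves=1,
    pitch_change_semi_tones=0,
    f0_method="rmvpe",
)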
1075
+
1076
+ def postprocess_vocals(
1077
+ vocals_path: str,
1078
+ song_dir: str,
1079
+ reverb_rm_size: float = 0.15,
1080
+ reverb_wet: float = 0.2,
1081
+ reverb_dry: float = 0.8,
1082
+ reverb_damping: float = 0.7,
1083
+ progress_bar: gr.Progress | None = None,
1084
+ percentage: float = 0.0,
1085
+ ) -> str:
1086
+ """
1087
+ Apply high-pass filter, compressor and reverb effects to a vocals track.
1088
+
1089
+ Parameters
1090
+ ----------
1091
+ vocals_path : str
1092
+ The path to the vocals track to add effects to.
1093
+ song_dir : str
1094
+ The path to the directory where the effected vocals will be saved.
1095
+ reverb_rm_size : float, default=0.15
1096
+ The room size of the reverb effect.
1097
+ reverb_wet : float, default=0.2
1098
+ The wet level of the reverb effect.
1099
+ reverb_dry : float, default=0.8
1100
+ The dry level of the reverb effect.
1101
+ reverb_damping : float, default=0.7
1102
+ The damping of the reverb effect.
1103
+ progress_bar : gr.Progress, optional
1104
+ Gradio progress bar to update.
1105
+ percentage : float, default=0.0
1106
+ Percentage to display in the progress bar.
1107
+
1108
+ Returns
1109
+ -------
1110
+ str
1111
+ The path to the effected vocals.
1112
+
1113
+ Raises
1114
+ ------
1115
+ InputMissingError
1116
+ If no vocals track path or song directory path is provided.
1117
+ PathNotFoundError
1118
+ If the provided vocals path or song directory path does not point
1119
+ to an existing file or directory.
1120
+ """
1121
+ if not vocals_path:
1122
+ raise InputMissingError("Vocals missing!")
1123
+ if not os.path.isfile(vocals_path):
1124
+ raise PathNotFoundError("Vocals do not exist!")
1125
+ if not song_dir:
1126
+ raise InputMissingError("Song directory missing!")
1127
+ if not os.path.isdir(song_dir):
1128
+ raise PathNotFoundError("Song directory does not exist!")
1129
+
1130
+ arg_dict = {
1131
+ "input-files": [
1132
+ {"name": os.path.basename(vocals_path), "hash": get_file_hash(vocals_path)}
1133
+ ],
1134
+ "reverb-room-size": reverb_rm_size,
1135
+ "reverb-wet": reverb_wet,
1136
+ "reverb-dry": reverb_dry,
1137
+ "reverb-damping": reverb_damping,
1138
+ }
1139
+
1140
+ vocals_mixed_path_base = _get_unique_base_path(
1141
+ song_dir, "5_Vocals_Postprocessed", arg_dict, progress_bar, percentage
1142
+ )
1143
+
1144
+ vocals_mixed_path = f"{vocals_mixed_path_base}.wav"
1145
+ vocals_mixed_json_path = f"{vocals_mixed_path_base}.json"
1146
+
1147
+ if not (
1148
+ os.path.exists(vocals_mixed_path) and os.path.exists(vocals_mixed_json_path)
1149
+ ):
1150
+ display_progress(
1151
+ "[~] Applying audio effects to vocals...", percentage, progress_bar
1152
+ )
1153
+ _add_audio_effects(
1154
+ vocals_path,
1155
+ vocals_mixed_path,
1156
+ reverb_rm_size,
1157
+ reverb_wet,
1158
+ reverb_dry,
1159
+ reverb_damping,
1160
+ )
1161
+ json_dump(arg_dict, vocals_mixed_json_path)
1162
+ return vocals_mixed_path
1163
+
1164
+
1165
+ def pitch_shift_background(
1166
+ instrumentals_path: str,
1167
+ backup_vocals_path: str,
1168
+ song_dir: str,
1169
+ pitch_change: int = 0,
1170
+ progress_bar: gr.Progress | None = None,
1171
+ percentages: tuple[float, float] = (0.0, 0.5),
1172
+ ) -> tuple[str, str]:
1173
+ """
1174
+ Pitch shift instrumentals and backup vocals by a given number of semi-tones.
1175
+
1176
+ Parameters
1177
+ ----------
1178
+ instrumentals_path : str
1179
+ The path to the instrumentals to pitch shift.
1180
+ backup_vocals_path : str
1181
+ The path to the backup vocals to pitch shift.
1182
+ song_dir : str
1183
+ The path to the directory where the pitch-shifted instrumentals
1184
+ and backup vocals will be saved.
1185
+ pitch_change : int, default=0
1186
+ The number of semi-tones to pitch-shift the instrumentals
1187
+ and backup vocals by.
1188
+ progress_bar : gr.Progress, optional
1189
+ Gradio progress bar to update.
1190
+ percentages : tuple[float,float], default=(0.0, 0.5)
1191
+ Percentages to display in the progress bar.
1192
+
1193
+ Returns
1194
+ -------
1195
+ instrumentals_shifted_path : str
1196
+ The path to the pitch-shifted instrumentals.
1197
+ backup_vocals_shifted_path : str
1198
+ The path to the pitch-shifted backup vocals.
1199
+
1200
+ Raises
1201
+ ------
1202
+ InputMissingError
1203
+ If no instrumentals path, backup vocals path or song directory path is provided.
1204
+ PathNotFoundError
1205
+ If the provided instrumentals path, backup vocals path or song directory path
1206
+ does not point to an existing file or directory.
1207
+ """
1208
+ if not instrumentals_path:
1209
+ raise InputMissingError("Instrumentals missing!")
1210
+ if not os.path.isfile(instrumentals_path):
1211
+ raise PathNotFoundError("Instrumentals do not exist!")
1212
+ if not backup_vocals_path:
1213
+ raise InputMissingError("Backup vocals missing!")
1214
+ if not os.path.isfile(backup_vocals_path):
1215
+ raise PathNotFoundError("Backup vocals do not exist!")
1216
+ if not song_dir:
1217
+ raise InputMissingError("Song directory missing!")
1218
+ if not os.path.isdir(song_dir):
1219
+ raise PathNotFoundError("Song directory does not exist!")
1220
+
1221
+ instrumentals_shifted_path = instrumentals_path
1222
+ backup_vocals_shifted_path = backup_vocals_path
1223
+
1224
+ if pitch_change != 0:
1225
+ instrumentals_dict = {
1226
+ "input-files": [
1227
+ {
1228
+ "name": os.path.basename(instrumentals_path),
1229
+ "hash": get_file_hash(instrumentals_path),
1230
+ }
1231
+ ],
1232
+ "pitch-shift": pitch_change,
1233
+ }
1234
+
1235
+ instrumentals_shifted_path_base = _get_unique_base_path(
1236
+ song_dir,
1237
+ "6_Instrumental_Shifted",
1238
+ instrumentals_dict,
1239
+ progress_bar,
1240
+ percentages[0],
1241
+ )
1242
+
1243
+ instrumentals_shifted_path = f"{instrumentals_shifted_path_base}.wav"
1244
+ instrumentals_shifted_json_path = f"{instrumentals_shifted_path_base}.json"
1245
+
1246
+ if not (
1247
+ os.path.exists(instrumentals_shifted_path)
1248
+ and os.path.exists(instrumentals_shifted_json_path)
1249
+ ):
1250
+ display_progress(
1251
+ "[~] Applying pitch shift to instrumentals",
1252
+ percentages[0],
1253
+ progress_bar,
1254
+ )
1255
+ _pitch_shift(instrumentals_path, instrumentals_shifted_path, pitch_change)
1256
+ json_dump(instrumentals_dict, instrumentals_shifted_json_path)
1257
+
1258
+ backup_vocals_dict = {
1259
+ "input-files": [
1260
+ {
1261
+ "name": os.path.basename(backup_vocals_path),
1262
+ "hash": get_file_hash(backup_vocals_path),
1263
+ }
1264
+ ],
1265
+ "pitch-shift": pitch_change,
1266
+ }
1267
+
1268
+ backup_vocals_shifted_path_base = _get_unique_base_path(
1269
+ song_dir,
1270
+ "6_Vocals_Backup_Shifted",
1271
+ backup_vocals_dict,
1272
+ progress_bar,
1273
+ percentages[1],
1274
+ )
1275
+ backup_vocals_shifted_path = f"{backup_vocals_shifted_path_base}.wav"
1276
+ backup_vocals_shifted_json_path = f"{backup_vocals_shifted_path_base}.json"
1277
+ if not (
1278
+ os.path.exists(backup_vocals_shifted_path)
1279
+ and os.path.exists(backup_vocals_shifted_json_path)
1280
+ ):
1281
+ display_progress(
1282
+ "[~] Applying pitch shift to backup vocals",
1283
+ percentages[1],
1284
+ progress_bar,
1285
+ )
1286
+ _pitch_shift(backup_vocals_path, backup_vocals_shifted_path, pitch_change)
1287
+ json_dump(backup_vocals_dict, backup_vocals_shifted_json_path)
1288
+ return instrumentals_shifted_path, backup_vocals_shifted_path
1289
+
1290
+
1291
+ def _get_voice_model(
1292
+ mixed_vocals_path: str | None = None, song_dir: str | None = None
1293
+ ) -> str:
1294
+ """
1295
+ Infer the voice model used for vocal conversion from a
1296
+ mixed vocals file in a given song directory.
1297
+
1298
+ If the voice model cannot be inferred, "Unknown" is returned.
1299
+
1300
+ Parameters
1301
+ ----------
1302
+ mixed_vocals_path : str, optional
1303
+ The path to a mixed vocals file.
1304
+ song_dir : str, optional
1305
+ The path to a song directory.
1306
+
1307
+ Returns
1308
+ -------
1309
+ str
1310
+ The voice model used for vocal conversion.
1311
+ """
1312
+ voice_model = "Unknown"
1313
+ if not (mixed_vocals_path and song_dir):
1314
+ return voice_model
1315
+ mixed_vocals_stem = get_path_stem(mixed_vocals_path)
1316
+ mixed_vocals_json_path = os.path.join(song_dir, f"{mixed_vocals_stem}.json")
1317
+ if not os.path.isfile(mixed_vocals_json_path):
1318
+ return voice_model
1319
+ mixed_vocals_json_dict = json_load(mixed_vocals_json_path)
1320
+ input_files = mixed_vocals_json_dict.get("input-files")
1321
+ input_path = input_files[0].get("name") if input_files else None
1322
+ if not input_path:
1323
+ return voice_model
1324
+ input_stem = get_path_stem(input_path)
1325
+ converted_vocals_json_path = os.path.join(song_dir, f"{input_stem}.json")
1326
+ if not os.path.isfile(converted_vocals_json_path):
1327
+ return voice_model
1328
+ converted_vocals_dict = json_load(converted_vocals_json_path)
1329
+ return converted_vocals_dict.get("voice-model", voice_model)
1330
+
1331
+
1332
+ def get_song_cover_name(
1333
+ mixed_vocals_path: str | None = None,
1334
+ song_dir: str | None = None,
1335
+ voice_model: str | None = None,
1336
+ progress_bar: gr.Progress | None = None,
1337
+ percentage: float = 0.0,
1338
+ ) -> str:
1339
+ """
1340
+ Generate a suitable name for a cover of a song based on that song's
1341
+ original name and the voice model used for vocal conversion.
1342
+
1343
+ If the path of an existing song directory is provided, the original song
1344
+ name is inferred from that directory. If a voice model is not provided but
1345
+ the path of an existing song directory and the path of a mixed
1346
+ vocals file in that directory are provided, then the voice model is
1347
+ inferred from the mixed vocals file.
1348
+
1349
+ Parameters
1350
+ ----------
1351
+ mixed_vocals_path : str, optional
1352
+ The path to a mixed vocals file.
1353
+ song_dir : str, optional
1354
+ The path to a song directory.
1355
+ voice_model : str, optional
1356
+ A voice model name.
1357
+ progress_bar : gr.Progress, optional
1358
+ Gradio progress bar to update.
1359
+ percentage : float, default=0.0
1360
+ Percentage to display in the progress bar.
1361
+
1362
+ Returns
1363
+ -------
1364
+ str
1365
+ The song cover name
1366
+ """
1367
+ display_progress("[~] Getting song cover name...", percentage, progress_bar)
1368
+
1369
+ orig_song_path = _get_input_audio_path(song_dir) if song_dir else None
1370
+ orig_song_name = (
1371
+ (get_path_stem(orig_song_path).removeprefix("0_").removesuffix("_Original"))
1372
+ if orig_song_path
1373
+ else "Unknown"
1374
+ )
1375
+
1376
+ voice_model = voice_model or _get_voice_model(mixed_vocals_path, song_dir)
1377
+
1378
+ return f"{orig_song_name} ({voice_model} Ver)"
1379
+
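# Illustrative naming: for a hypothetical cached directory whose original song
# is "Never Gonna Give You Up" and a hypothetical model "MyVoice", this yields
# "Never Gonna Give You Up (MyVoice Ver)".
name = get_song_cover_name(
    song_dir="audio/intermediate/dQw4w9WgXcQ", voice_model="MyVoice"
)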
1380
+
1381
+ def mix_song_cover(
1382
+ main_vocals_path: str,
1383
+ instrumentals_path: str,
1384
+ backup_vocals_path: str,
1385
+ song_dir: str,
1386
+ main_gain: int = 0,
1387
+ inst_gain: int = 0,
1388
+ backup_gain: int = 0,
1389
+ output_sr: int = 44100,
1390
+ output_format: InputAudioExt = "mp3",
1391
+ output_name: str | None = None,
1392
+ progress_bar: gr.Progress | None = None,
1393
+ percentages: tuple[float, float] = (0.0, 0.5),
1394
+ ) -> str:
1395
+ """
1396
+ Mix main vocals, instrumentals, and backup vocals to create a song cover.
1397
+
1398
+ Parameters
1399
+ ----------
1400
+ main_vocals_path : str
1401
+ The path to the main vocals to mix.
1402
+ instrumentals_path : str
1403
+ The path to the instrumentals to mix.
1404
+ backup_vocals_path : str
1405
+ The path to the backup vocals to mix.
1406
+ song_dir : str
1407
+ The path to the song directory where the song cover will be saved.
1408
+ main_gain : int, default=0
1409
+ The gain to apply to the main vocals.
1410
+ inst_gain : int, default=0
1411
+ The gain to apply to the instrumentals.
1412
+ backup_gain : int, default=0
1413
+ The gain to apply to the backup vocals.
1414
+ output_sr : int, default=44100
1415
+ The sample rate of the song cover.
1416
+ output_format : InputAudioExt, default="mp3"
1417
+ The audio format of the song cover.
1418
+ output_name : str, optional
1419
+ The name of the song cover.
1420
+ progress_bar : gr.Progress, optional
1421
+ Gradio progress bar to update.
1422
+ percentages : tuple[float,float], default=(0.0, 0.5)
1423
+ Percentages to display in the progress bar.
1424
+
1425
+ Returns
1426
+ -------
1427
+ str
1428
+ The path to the song cover.
1429
+
1430
+ Raises
1431
+ ------
1432
+ InputMissingError
1433
+ If no main vocals, instrumentals, backup vocals or song directory path is provided.
1434
+ PathNotFoundError
1435
+ If the provided main vocals, instrumentals, backup vocals or song directory path
1436
+ does not point to an existing file or directory.
1437
+ """
1438
+ if not main_vocals_path:
1439
+ raise InputMissingError("Main vocals missing!")
1440
+ if not os.path.isfile(main_vocals_path):
1441
+ raise PathNotFoundError("Main vocals do not exist!")
1442
+ if not instrumentals_path:
1443
+ raise InputMissingError("Instrumentals missing!")
1444
+ if not os.path.isfile(instrumentals_path):
1445
+ raise PathNotFoundError("Instrumentals do not exist!")
1446
+ if not backup_vocals_path:
1447
+ raise InputMissingError("Backup vocals missing!")
1448
+ if not os.path.isfile(backup_vocals_path):
1449
+ raise PathNotFoundError("Backup vocals do not exist!")
1450
+ if not song_dir:
1451
+ raise InputMissingError("Song directory missing!")
1452
+ if not os.path.isdir(song_dir):
1453
+ raise PathNotFoundError("song directory does not exist!")
1454
+
1455
+ arg_dict = {
1456
+ "input-files": [
1457
+ {
1458
+ "name": os.path.basename(main_vocals_path),
1459
+ "hash": get_file_hash(main_vocals_path),
1460
+ },
1461
+ {
1462
+ "name": os.path.basename(instrumentals_path),
1463
+ "hash": get_file_hash(instrumentals_path),
1464
+ },
1465
+ {
1466
+ "name": os.path.basename(backup_vocals_path),
1467
+ "hash": get_file_hash(backup_vocals_path),
1468
+ },
1469
+ ],
1470
+ "main-gain": main_gain,
1471
+ "instrument-gain": inst_gain,
1472
+ "backup-gain": backup_gain,
1473
+ "sample-rate": output_sr,
1474
+ }
1475
+
1476
+ mixdown_path_base = _get_unique_base_path(
1477
+ song_dir, "7_Mixdown", arg_dict, progress_bar, percentages[0]
1478
+ )
1479
+ mixdown_path = f"{mixdown_path_base}.{output_format}"
1480
+ mixdown_json_path = f"{mixdown_path_base}.json"
1481
+
1482
+ if not (os.path.exists(mixdown_path) and os.path.exists(mixdown_json_path)):
1483
+ display_progress(
1484
+ "[~] Mixing main vocals, instrumentals, and backup vocals...",
1485
+ percentages[0],
1486
+ progress_bar,
1487
+ )
1488
+
1489
+ _mix_audio(
1490
+ main_vocals_path,
1491
+ backup_vocals_path,
1492
+ instrumentals_path,
1493
+ main_gain,
1494
+ backup_gain,
1495
+ inst_gain,
1496
+ output_format,
1497
+ output_sr,
1498
+ mixdown_path,
1499
+ )
1500
+ json_dump(arg_dict, mixdown_json_path)
1501
+
1502
+ output_name = output_name or get_song_cover_name(
1503
+ main_vocals_path, song_dir, None, progress_bar, percentages[1]
1504
+ )
1505
+ song_cover_path = os.path.join(OUTPUT_AUDIO_DIR, f"{output_name}.{output_format}")
1506
+ os.makedirs(OUTPUT_AUDIO_DIR, exist_ok=True)
1507
+ shutil.copyfile(mixdown_path, song_cover_path)
1508
+
1509
+ return song_cover_path
1510
+
1511
+
1512
+ def run_pipeline(
1513
+ song_input: str,
1514
+ voice_model: str,
1515
+ pitch_change_vocals: int = 0,
1516
+ pitch_change_all: int = 0,
1517
+ index_rate: float = 0.5,
1518
+ filter_radius: int = 3,
1519
+ rms_mix_rate: float = 0.25,
1520
+ protect: float = 0.33,
1521
+ f0_method: F0Method = "rmvpe",
1522
+ crepe_hop_length: int = 128,
1523
+ reverb_rm_size: float = 0.15,
1524
+ reverb_wet: float = 0.2,
1525
+ reverb_dry: float = 0.8,
1526
+ reverb_damping: float = 0.7,
1527
+ main_gain: int = 0,
1528
+ inst_gain: int = 0,
1529
+ backup_gain: int = 0,
1530
+ output_sr: int = 44100,
1531
+ output_format: InputAudioExt = "mp3",
1532
+ output_name: str | None = None,
1533
+ return_files: bool = False,
1534
+ progress_bar: gr.Progress | None = None,
1535
+ ) -> str | tuple[str, ...]:
1536
+ """
1537
+ Run the song cover generation pipeline.
1538
+
1539
+ Parameters
1540
+ ----------
1541
+ song_input : str
1542
+ A YouTube URL, the path of a local audio file, or the path of a song directory.
1543
+ voice_model : str
1544
+ The name of the voice model to use for vocal conversion.
1545
+ pitch_change_vocals : int, default=0
1546
+ The number of octaves to pitch-shift the converted vocals by.
1547
+ pitch_change_all : int, default=0
1548
+ The number of semi-tones to pitch-shift the converted vocals,
1549
+ instrumentals, and backup vocals by.
1550
+ index_rate : float, default=0.5
1551
+ The influence of the index file on the vocal conversion.
1552
+ filter_radius : int, default=3
1553
+ The filter radius to use for the vocal conversion.
1554
+ rms_mix_rate : float, default=0.25
1555
+ The blending rate of the volume envelope of the converted vocals.
1556
+ protect : float, default=0.33
1557
+ The protection rate for consonants and breathing sounds in the vocal conversion.
1558
+ f0_method : F0Method, default="rmvpe"
1559
+ The method to use for pitch extraction in the vocal conversion.
1560
+ crepe_hop_length : int, default=128
1561
+ The hop length to use for crepe-based pitch extraction.
1562
+ reverb_rm_size : float, default=0.15
1563
+ The room size of the reverb effect to apply to the converted vocals.
1564
+ reverb_wet : float, default=0.2
1565
+ The wet level of the reverb effect to apply to the converted vocals.
1566
+ reverb_dry : float, default=0.8
1567
+ The dry level of the reverb effect to apply to the converted vocals.
1568
+ reverb_damping : float, default=0.7
1569
+ The damping of the reverb effect to apply to the converted vocals.
1570
+ main_gain : int, default=0
1571
+ The gain to apply to the post-processed vocals.
1572
+ inst_gain : int, default=0
1573
+ The gain to apply to the pitch-shifted instrumentals.
1574
+ backup_gain : int, default=0
1575
+ The gain to apply to the pitch-shifted backup vocals.
1576
+ output_sr : int, default=44100
1577
+ The sample rate of the song cover.
1578
+ output_format : InputAudioExt, default="mp3"
1579
+ The audio format of the song cover.
1580
+ output_name : str, optional
1581
+ The name of the song cover.
1582
+ return_files : bool, default=False
1583
+ Whether to return the paths of the generated intermediate audio files.
1584
+ progress_bar : gr.Progress, optional
1585
+ Gradio progress bar to update.
1586
+
1587
+ Returns
1588
+ -------
1589
+ str | tuple[str, ...]
+ The path to the generated song cover and, if `return_files=True`,
+ also the paths of any generated intermediate audio files.
+
+ Raises
+ ------
+ InputMissingError
+ If no song input or no voice model name is provided.
+ PathNotFoundError
+ If the voice model directory does not exist.
+ """
1593
+ if not song_input:
1594
+ raise InputMissingError(
1595
+ "Song input missing! Please provide a valid YouTube url, local audio file"
1596
+ " path or cached song directory path."
1597
+ )
1598
+ if not voice_model:
1599
+ raise InputMissingError("Voice model missing!")
1600
+ if not os.path.isdir(os.path.join(RVC_MODELS_DIR, voice_model)):
1601
+ raise PathNotFoundError("Voice model does not exist!")
1602
+ display_progress("[~] Starting song cover generation pipeline...", 0, progress_bar)
1603
+ orig_song_path, song_dir = retrieve_song(
1604
+ song_input, progress_bar, (0 / 15, 1 / 15, 2 / 15)
1605
+ )
1606
+ vocals_path, instrumentals_path = separate_vocals(
1607
+ orig_song_path, song_dir, False, progress_bar, (3 / 15, 4 / 15)
1608
+ )
1609
+ main_vocals_path, backup_vocals_path = separate_main_vocals(
1610
+ vocals_path, song_dir, False, progress_bar, (5 / 15, 6 / 15)
1611
+ )
1612
+ vocals_dereverb_path, reverb_path = dereverb_vocals(
1613
+ main_vocals_path, song_dir, False, progress_bar, (7 / 15, 8 / 15)
1614
+ )
1615
+ converted_vocals_path = convert_vocals(
1616
+ vocals_dereverb_path,
1617
+ song_dir,
1618
+ voice_model,
1619
+ pitch_change_vocals,
1620
+ pitch_change_all,
1621
+ index_rate,
1622
+ filter_radius,
1623
+ rms_mix_rate,
1624
+ protect,
1625
+ f0_method,
1626
+ crepe_hop_length,
1627
+ progress_bar,
1628
+ 9 / 15,
1629
+ )
1630
+ vocals_mixed_path = postprocess_vocals(
1631
+ converted_vocals_path,
1632
+ song_dir,
1633
+ reverb_rm_size,
1634
+ reverb_wet,
1635
+ reverb_dry,
1636
+ reverb_damping,
1637
+ progress_bar,
1638
+ 10 / 15,
1639
+ )
1640
+ instrumentals_shifted_path, backup_vocals_shifted_path = pitch_shift_background(
1641
+ instrumentals_path,
1642
+ backup_vocals_path,
1643
+ song_dir,
1644
+ pitch_change_all,
1645
+ progress_bar,
1646
+ (11 / 15, 12 / 15),
1647
+ )
1648
+
1649
+ song_cover_path = mix_song_cover(
1650
+ vocals_mixed_path,
1651
+ instrumentals_shifted_path or instrumentals_path,
1652
+ backup_vocals_shifted_path or backup_vocals_path,
1653
+ song_dir,
1654
+ main_gain,
1655
+ inst_gain,
1656
+ backup_gain,
1657
+ output_sr,
1658
+ output_format,
1659
+ output_name,
1660
+ progress_bar,
1661
+ (13 / 15, 14 / 15),
1662
+ )
1663
+ if return_files:
1664
+ return (
1665
+ orig_song_path,
1666
+ vocals_path,
1667
+ instrumentals_path,
1668
+ main_vocals_path,
1669
+ backup_vocals_path,
1670
+ vocals_dereverb_path,
1671
+ reverb_path,
1672
+ converted_vocals_path,
1673
+ vocals_mixed_path,
1674
+ instrumentals_shifted_path,
1675
+ backup_vocals_shifted_path,
1676
+ song_cover_path,
1677
+ )
1678
+ else:
1679
+ return song_cover_path
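
A minimal usage sketch of the pipeline above; the voice model name "MyModel" and the YouTube URL are placeholders rather than values from this repository:

    from backend.generate_song_cover import run_pipeline

    # Generate a song cover with default conversion and mixing settings,
    # shifting the converted vocals up by one octave. Assumes a voice model
    # folder named "MyModel" exists under models/rvc.
    cover_path = run_pipeline(
        song_input="https://www.youtube.com/watch?v=<video-id>",  # placeholder URL
        voice_model="MyModel",
        pitch_change_vocals=1,
    )
    print(f"Cover saved to {cover_path}")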
src/backend/manage_audio.py ADDED
@@ -0,0 +1,225 @@
1
+ """
2
+ This module contains functions to manage audio files.
3
+ """
4
+
5
+ import os
6
+ import shutil
7
+ from pathlib import PurePath
8
+
9
+ import gradio as gr
10
+
11
+ from backend.common import INTERMEDIATE_AUDIO_DIR, OUTPUT_AUDIO_DIR, display_progress
12
+ from backend.exceptions import InputMissingError, InvalidPathError, PathNotFoundError
13
+
14
+ from common import GRADIO_TEMP_DIR
15
+
16
+
17
+ def get_output_audio() -> list[tuple[str, str]]:
18
+ """
19
+ Get the name and path of all output audio files.
20
+
21
+ Returns
22
+ -------
23
+ list[tuple[str, str]]
24
+ A list of tuples containing the name and path of each output audio file.
25
+ """
26
+ if os.path.isdir(OUTPUT_AUDIO_DIR):
27
+ named_output_files = [
28
+ (file_name, os.path.join(OUTPUT_AUDIO_DIR, file_name))
29
+ for file_name in os.listdir(OUTPUT_AUDIO_DIR)
30
+ ]
31
+ return sorted(named_output_files, key=lambda x: x[0])
32
+ return []
33
+
34
+
35
+ def delete_intermediate_audio(
36
+ song_dirs: list[str],
37
+ progress_bar: gr.Progress | None = None,
38
+ percentage: float = 0.0,
39
+ ) -> str:
40
+ """
41
+ Delete intermediate audio files in provided song directories.
42
+
43
+ Parameters
44
+ ----------
45
+ song_dirs : list[str]
46
+ Paths of song directories to delete intermediate audio files for.
47
+ progress_bar : gr.Progress, optional
48
+ Gradio progress bar to update.
49
+ percentage : float, default=0.0
50
+ Percentage to display in the progress bar.
51
+
52
+ Returns
53
+ -------
54
+ str
55
+ Success message.
56
+
57
+ Raises
58
+ ------
59
+ InputMissingError
60
+ If no song directories are provided.
61
+ PathNotFoundError
62
+ If a song directory does not exist.
63
+ InvalidPathError
64
+ If a song directory is not located in the root of the intermediate audio directory.
65
+ """
66
+ if not song_dirs:
67
+ raise InputMissingError(
68
+ "Song directories missing! Please provide a non-empty list of song"
69
+ " directories."
70
+ )
71
+ display_progress(
72
+ "[~] Deleting intermediate audio files for selected songs...",
73
+ percentage,
74
+ progress_bar,
75
+ )
76
+ for song_dir in song_dirs:
77
+ if not os.path.isdir(song_dir):
78
+ raise PathNotFoundError(f"Song directory '{song_dir}' does not exist.")
79
+
80
+ if not PurePath(song_dir).parent == PurePath(INTERMEDIATE_AUDIO_DIR):
81
+ raise InvalidPathError(
82
+ f"Song directory '{song_dir}' is not located in the root of the"
83
+ " intermediate audio directory."
84
+ )
85
+ shutil.rmtree(song_dir)
86
+ return "[+] Successfully deleted intermediate audio files for selected songs!"
87
+
88
+
89
+ def delete_all_intermediate_audio(
90
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
91
+ ) -> str:
92
+ """
93
+ Delete all intermediate audio files.
94
+
95
+ Parameters
96
+ ----------
97
+ progress_bar : gr.Progress, optional
98
+ Gradio progress bar to update.
99
+ percentage : float, default=0.0
100
+
101
+ Returns
102
+ -------
103
+ str
104
+ Success message.
105
+ """
106
+ display_progress(
107
+ "[~] Deleting all intermediate audio files...", percentage, progress_bar
108
+ )
109
+ if os.path.isdir(INTERMEDIATE_AUDIO_DIR):
110
+ shutil.rmtree(INTERMEDIATE_AUDIO_DIR)
111
+
112
+ return "[+] All intermediate audio files successfully deleted!"
113
+
114
+
115
+ def delete_output_audio(
116
+ output_audio_files: list[str],
117
+ progress_bar: gr.Progress | None = None,
118
+ percentage: float = 0.0,
119
+ ) -> str:
120
+ """
121
+ Delete selected output audio files.
122
+
123
+ Parameters
124
+ ----------
125
+ output_audio_files : list[str]
126
+ Paths of output audio files to delete.
127
+ progress_bar : gr.Progress, optional
128
+ Gradio progress bar to update.
129
+ percentage : float, default=0.0
130
+ Percentage to display in the progress bar.
131
+
132
+ Returns
133
+ -------
134
+ str
135
+ Success message.
136
+
137
+ Raises
138
+ ------
139
+ InputMissingError
140
+ If no output audio files are provided.
141
+ PathNotFoundError
142
+ If an output audio file does not exist.
143
+ InvalidPathError
144
+ If an output audio file is not located in the root of the output audio directory.
145
+ """
146
+ if not output_audio_files:
147
+ raise InputMissingError(
148
+ "Output audio files missing! Please provide a non-empty list of output"
149
+ " audio files."
150
+ )
151
+ display_progress(
152
+ "[~] Deleting selected output audio files...", percentage, progress_bar
153
+ )
154
+ for output_audio_file in output_audio_files:
155
+ if not os.path.isfile(output_audio_file):
156
+ raise PathNotFoundError(
157
+ f"Output audio file '{output_audio_file}' does not exist."
158
+ )
159
+ if not PurePath(output_audio_file).parent == PurePath(OUTPUT_AUDIO_DIR):
160
+ raise InvalidPathError(
161
+ f"Output audio file '{output_audio_file}' is not located in the root of"
162
+ " the output audio directory."
163
+ )
164
+ os.remove(output_audio_file)
165
+ return "[+] Successfully deleted selected output audio files!"
166
+
167
+
168
+ def delete_all_output_audio(
169
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
170
+ ) -> str:
171
+ """
172
+ Delete all output audio files.
173
+
174
+ Parameters
175
+ ----------
176
+ progress_bar : gr.Progress, optional
177
+ Gradio progress bar to update.
178
+ percentage : float, default=0.0
179
+ Percentage to display in the progress bar.
180
+
181
+ Returns
182
+ -------
183
+ str
184
+ Success message.
185
+ """
186
+ display_progress("[~] Deleting all output audio files...", percentage, progress_bar)
187
+ if os.path.isdir(OUTPUT_AUDIO_DIR):
188
+ shutil.rmtree(OUTPUT_AUDIO_DIR)
189
+
190
+ return "[+] All output audio files successfully deleted!"
191
+
192
+
193
+ def delete_all_audio(
194
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
195
+ ) -> str:
196
+ """
197
+ Delete all audio files.
198
+
199
+ Parameters
200
+ ----------
201
+ progress_bar : gr.Progress, optional
202
+ Gradio progress bar to update.
203
+ percentage : float, default=0.0
204
+ Percentage to display in the progress bar.
205
+
206
+ Returns
207
+ -------
208
+ str
209
+ Success message.
210
+ """
211
+ display_progress("[~] Deleting all audio files...", percentage, progress_bar)
212
+ if os.path.isdir(INTERMEDIATE_AUDIO_DIR):
213
+ shutil.rmtree(INTERMEDIATE_AUDIO_DIR)
214
+ if os.path.isdir(OUTPUT_AUDIO_DIR):
215
+ shutil.rmtree(OUTPUT_AUDIO_DIR)
216
+
217
+ return "[+] All audio files successfully deleted!"
218
+
219
+
220
+ def delete_gradio_temp_dir() -> None:
221
+ """
222
+ Delete the directory where Gradio stores temporary files.
223
+ """
224
+ if os.path.isdir(GRADIO_TEMP_DIR):
225
+ shutil.rmtree(GRADIO_TEMP_DIR)
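
A minimal usage sketch of the audio-management helpers above; the commented-out deletion call uses a hypothetical file name:

    from backend.manage_audio import get_output_audio, delete_output_audio

    # List all generated output audio files by name and path.
    for name, path in get_output_audio():
        print(name, "->", path)

    # Delete a specific output file; it must sit directly in the output audio directory.
    # delete_output_audio(["<output audio dir>/Song (MyModel Ver).mp3"])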
src/backend/manage_voice_models.py ADDED
@@ -0,0 +1,426 @@
1
+ """
2
+ This module contains functions to manage voice models.
3
+ """
4
+
5
+ from typings.extra import ModelsTable, ModelsTablePredicate
6
+
7
+ import os
8
+ import re
9
+ import shutil
10
+ import urllib.request
11
+ import zipfile
12
+
13
+ import gradio as gr
14
+
15
+ from backend.common import copy_files_to_new_folder, display_progress, json_load
16
+ from backend.exceptions import (
17
+ FileTypeError,
18
+ InputMissingError,
19
+ PathExistsError,
20
+ PathNotFoundError,
21
+ )
22
+
23
+ from common import RVC_MODELS_DIR
24
+
25
+ PUBLIC_MODELS = json_load(os.path.join(RVC_MODELS_DIR, "public_models.json"))
26
+
27
+
28
+ def get_current_models() -> list[str]:
29
+ """
30
+ Get the names of all saved voice models.
31
+
32
+ Returns
33
+ -------
34
+ list[str]
35
+ A list of names of all saved voice models.
36
+ """
37
+ models_list = os.listdir(RVC_MODELS_DIR)
38
+ items_to_remove = ["hubert_base.pt", "MODELS.txt", "public_models.json", "rmvpe.pt"]
39
+ return [item for item in models_list if item not in items_to_remove]
40
+
41
+
42
+ def load_public_models_table(
43
+ predicates: list[ModelsTablePredicate],
44
+ progress_bar: gr.Progress | None = None,
45
+ percentage: float = 0.0,
46
+ ) -> ModelsTable:
47
+ """
48
+ Load the public models table and filter it by the given predicates.
49
+
50
+ Parameters
51
+ ----------
52
+ predicates : list[ModelsTablePredicate]
53
+ List of predicates to filter the models table by.
54
+ progress_bar : gr.Progress, optional
55
+ Gradio progress bar to update.
56
+ percentage : float, default=0.0
57
+ Percentage to display in the progress bar.
58
+
59
+ Returns
60
+ -------
61
+ ModelsTable
62
+ The public models table, filtered by the given predicates.
63
+ """
64
+ models_table: ModelsTable = []
65
+ keys = ["name", "description", "tags", "credit", "added", "url"]
66
+ display_progress("[~] Loading public models table ...", percentage, progress_bar)
67
+ for model in PUBLIC_MODELS["voice_models"]:
68
+ if all(predicate(model) for predicate in predicates):
69
+ models_table.append([model[key] for key in keys])
70
+
71
+ return models_table
72
+
73
+
74
+ def load_public_model_tags() -> list[str]:
75
+ """
76
+ Load the tags of all public voice models.
77
+
78
+ Returns
79
+ -------
80
+ list[str]
81
+ A list of all tags of public voice models.
82
+ """
83
+ return list(PUBLIC_MODELS["tags"].keys())
84
+
85
+
86
+ def filter_public_models_table(
87
+ tags: list[str],
88
+ query: str,
89
+ progress_bar: gr.Progress | None = None,
90
+ percentage: float = 0.0,
91
+ ) -> ModelsTable:
92
+ """
93
+ Filter the public models table by a set of tags and a search query.
94
+
95
+ The search query is matched against the name, description, tags, credit,
96
+ and added date of each model in the public models table.
97
+ Case insensitive search is performed.
98
+ If the search query is empty, the models table is filtered only by the tags.
99
+
100
+ Parameters
101
+ ----------
102
+ tags : list[str]
103
+ List of tags to filter the models table by.
104
+ query : str
105
+ Search query to filter the models table by.
106
+ progress_bar : gr.Progress, optional
107
+ Gradio progress bar to update.
108
+ percentage : float, default=0.0
109
+ Percentage to display in the progress bar.
110
+
111
+ Returns
112
+ -------
113
+ ModelsTable
114
+ The public models table, filtered by the given tags and the given query.
115
+ """
116
+ tags_predicate: ModelsTablePredicate = lambda model: all(
117
+ tag in model["tags"] for tag in tags
118
+ )
119
+ query_predicate: ModelsTablePredicate = lambda model: (
120
+ query.lower()
121
+ in f"{model['name']} {model['description']} {' '.join(model['tags'])} {model['credit']} {model['added']}"
122
+ .lower()
123
+ if query
124
+ else True
125
+ )
126
+
127
+ filter_fns = [tags_predicate, query_predicate]
128
+
129
+ return load_public_models_table(filter_fns, progress_bar, percentage)
130
+
131
+
132
+ def _extract_model_zip(extraction_folder: str, zip_name: str, remove_zip: bool) -> None:
133
+ """
134
+ Extract a voice model zip file to a directory.
135
+
136
+ Parameters
137
+ ----------
138
+ extraction_folder : str
139
+ The directory to extract the voice model to.
140
+ zip_name : str
141
+ The name of the zip file to extract.
142
+ remove_zip : bool
143
+ Whether to remove the zip file after extraction.
144
+
145
+ Raises
146
+ ------
147
+ PathNotFoundError
148
+ If no .pth model file is found in the extracted zip folder.
149
+ """
150
+ try:
151
+ os.makedirs(extraction_folder)
152
+ with zipfile.ZipFile(zip_name, "r") as zip_ref:
153
+ zip_ref.extractall(extraction_folder)
154
+
155
+ index_filepath, model_filepath = None, None
156
+ for root, _, files in os.walk(extraction_folder):
157
+ for name in files:
158
+ if (
159
+ name.endswith(".index")
160
+ and os.stat(os.path.join(root, name)).st_size > 1024 * 100
161
+ ):
162
+ index_filepath = os.path.join(root, name)
163
+
164
+ if (
165
+ name.endswith(".pth")
166
+ and os.stat(os.path.join(root, name)).st_size > 1024 * 1024 * 40
167
+ ):
168
+ model_filepath = os.path.join(root, name)
169
+
170
+ if not model_filepath:
171
+ raise PathNotFoundError(
172
+ "No .pth model file was found in the extracted zip folder."
173
+ )
174
+ # move model and index file to extraction folder
175
+
176
+ os.rename(
177
+ model_filepath,
178
+ os.path.join(extraction_folder, os.path.basename(model_filepath)),
179
+ )
180
+ if index_filepath:
181
+ os.rename(
182
+ index_filepath,
183
+ os.path.join(extraction_folder, os.path.basename(index_filepath)),
184
+ )
185
+
186
+ # remove any unnecessary nested folders
187
+ for filepath in os.listdir(extraction_folder):
188
+ if os.path.isdir(os.path.join(extraction_folder, filepath)):
189
+ shutil.rmtree(os.path.join(extraction_folder, filepath))
190
+
191
+ except Exception as e:
192
+ if os.path.isdir(extraction_folder):
193
+ shutil.rmtree(extraction_folder)
194
+ raise e
195
+ finally:
196
+ if remove_zip and os.path.exists(zip_name):
197
+ os.remove(zip_name)
198
+
199
+
200
+ def download_online_model(
201
+ url: str,
202
+ dir_name: str,
203
+ progress_bar: gr.Progress | None = None,
204
+ percentages: tuple[float, float] = (0.0, 0.5),
205
+ ) -> str:
206
+ """
207
+ Download a voice model from a given URL and extract it to a directory.
208
+
209
+ Parameters
210
+ ----------
211
+ url : str
212
+ The URL of the voice model to download.
213
+ dir_name : str
214
+ The name of the directory to extract the voice model to.
215
+ progress_bar : gr.Progress, optional
216
+ Gradio progress bar to update.
217
+ percentages : tuple[float, float], default=(0.0, 0.5)
218
+ Percentages to display in the progress bar.
219
+
220
+ Returns
221
+ -------
222
+ str
223
+ Success message.
224
+
225
+ Raises
226
+ ------
227
+ InputMissingError
228
+ If an URL or a voice model directory name is not given.
229
+ PathExistsError
230
+ If the voice model directory already exists.
231
+ """
232
+ if not url:
233
+ raise InputMissingError("Download link to model missing!")
234
+ if not dir_name:
235
+ raise InputMissingError("Model name missing!")
236
+ extraction_folder = os.path.join(RVC_MODELS_DIR, dir_name)
237
+ if os.path.exists(extraction_folder):
238
+ raise PathExistsError(
239
+ f'Voice model directory "{dir_name}" already exists! Choose a different'
240
+ " name for your voice model."
241
+ )
242
+ zip_name = url.split("/")[-1].split("?")[0]
243
+
244
+ # NOTE in case huggingface link is a direct link rather
245
+ # than a resolve link then convert it to a resolve link
246
+ url = re.sub(
247
+ r"https://huggingface.co/([^/]+)/([^/]+)/blob/(.*)",
248
+ r"https://huggingface.co/\1/\2/resolve/\3",
249
+ url,
250
+ )
251
+ if "pixeldrain.com" in url:
252
+ url = f"https://pixeldrain.com/api/file/{zip_name}"
253
+
254
+ display_progress(
255
+ f"[~] Downloading voice model with name '{dir_name}'...",
256
+ percentages[0],
257
+ progress_bar,
258
+ )
259
+
260
+ urllib.request.urlretrieve(url, zip_name)
261
+
262
+ display_progress("[~] Extracting zip file...", percentages[1], progress_bar)
263
+
264
+ _extract_model_zip(extraction_folder, zip_name, remove_zip=True)
265
+ return f"[+] Model with name '{dir_name}' successfully downloaded!"
266
+
267
+
268
+ def upload_local_model(
269
+ input_paths: list[str],
270
+ dir_name: str,
271
+ progress_bar: gr.Progress | None = None,
272
+ percentage: float = 0.0,
273
+ ) -> str:
274
+ """
275
+ Upload a voice model from either a local zip file or a local .pth file
276
+ and an optional index file.
277
+
278
+ Parameters
279
+ ----------
280
+ input_paths : list[str]
281
+ Paths of the local files to upload.
282
+ dir_name : str
283
+ The name of the directory to save the voice model files in.
284
+ progress_bar : gr.Progress, optional
285
+ Gradio progress bar to update.
286
+ percentage : float, default=0.0
287
+ Percentage to display in the progress bar.
288
+
289
+ Returns
290
+ -------
291
+ str
292
+ Success message.
293
+
294
+ Raises
295
+ ------
296
+ InputMissingError
297
+ If no file paths or no voice model directory name is given.
298
+ ValueError
299
+ If more than two file paths are given.
300
+ PathExistsError
301
+ If a voice model directory by the given name already exists.
302
+ FileTypeError
303
+ If a single uploaded file is not a .pth file or a .zip file.
304
+ If two uploaded files are not a .pth file and an .index file.
305
+ """
306
+ if not input_paths:
307
+ raise InputMissingError("No files selected!")
308
+ if len(input_paths) > 2:
309
+ raise ValueError("At most two files can be uploaded!")
310
+ if not dir_name:
311
+ raise InputMissingError("Model name missing!")
312
+ output_folder = os.path.join(RVC_MODELS_DIR, dir_name)
313
+ if os.path.exists(output_folder):
314
+ raise PathExistsError(
315
+ f'Voice model directory "{dir_name}" already exists! Choose a different'
316
+ " name for your voice model."
317
+ )
318
+ if len(input_paths) == 1:
319
+ input_path = input_paths[0]
320
+ if os.path.splitext(input_path)[1] == ".pth":
321
+ display_progress("[~] Copying .pth file ...", percentage, progress_bar)
322
+ copy_files_to_new_folder(input_paths, output_folder)
323
+ # NOTE a .pth file is actually itself a zip file
324
+ elif zipfile.is_zipfile(input_path):
325
+ display_progress("[~] Extracting zip file...", percentage, progress_bar)
326
+ _extract_model_zip(output_folder, input_path, remove_zip=False)
327
+ else:
328
+ raise FileTypeError(
329
+ "Only a .pth file or a .zip file can be uploaded by itself!"
330
+ )
331
+ else:
332
+ # sort two input files by extension type
333
+ input_names_sorted = sorted(input_paths, key=lambda f: os.path.splitext(f)[1])
334
+ index_name, pth_name = input_names_sorted
335
+ if (
336
+ os.path.splitext(pth_name)[1] == ".pth"
337
+ and os.path.splitext(index_name)[1] == ".index"
338
+ ):
339
+ display_progress(
340
+ "[~] Copying .pth file and index file ...", percentage, progress_bar
341
+ )
342
+ copy_files_to_new_folder(input_paths, output_folder)
343
+ else:
344
+ raise FileTypeError(
345
+ "Only a .pth file and an .index file can be uploaded together!"
346
+ )
347
+
348
+ return f"[+] Model with name '{dir_name}' successfully uploaded!"
349
+
350
+
351
+ def delete_models(
352
+ model_names: list[str],
353
+ progress_bar: gr.Progress | None = None,
354
+ percentage: float = 0.0,
355
+ ) -> str:
356
+ """
357
+ Delete one or more voice models.
358
+
359
+ Parameters
360
+ ----------
361
+ model_names : list[str]
362
+ Names of the models to delete.
363
+ progress_bar : gr.Progress, optional
364
+ Gradio progress bar to update.
365
+ percentage : float, default=0.0
366
+ Percentage to display in the progress bar.
367
+
368
+ Returns
369
+ -------
370
+ str
371
+ Success message.
372
+
373
+ Raises
374
+ ------
375
+ InputMissingError
376
+ If no model names are given.
377
+ PathNotFoundError
378
+ If a voice model directory does not exist.
379
+ """
380
+ if not model_names:
381
+ raise InputMissingError("No models selected!")
382
+ display_progress("[~] Deleting selected models ...", percentage, progress_bar)
383
+ for model_name in model_names:
384
+ model_dir = os.path.join(RVC_MODELS_DIR, model_name)
385
+ if not os.path.isdir(model_dir):
386
+ raise PathNotFoundError(
387
+ f'Voice model directory "{model_name}" does not exist!'
388
+ )
389
+ shutil.rmtree(model_dir)
390
+ models_names_formatted = [f"'{w}'" for w in model_names]
391
+ if len(model_names) == 1:
392
+ return f"[+] Model with name {models_names_formatted[0]} successfully deleted!"
393
+ else:
394
+ first_models = ", ".join(models_names_formatted[:-1])
395
+ last_model = models_names_formatted[-1]
396
+ return (
397
+ f"[+] Models with names {first_models} and {last_model} successfully"
398
+ " deleted!"
399
+ )
400
+
401
+
402
+ def delete_all_models(
403
+ progress_bar: gr.Progress | None = None, percentage: float = 0.0
404
+ ) -> str:
405
+ """
406
+ Delete all voice models.
407
+
408
+ Parameters
409
+ ----------
410
+ progress_bar : gr.Progress, optional
411
+ Gradio progress bar to update.
412
+ percentage : float, default=0.0
413
+ Percentage to display in the progress bar.
414
+
415
+ Returns
416
+ -------
417
+ str
418
+ Success message.
419
+ """
420
+ all_models = get_current_models()
421
+ display_progress("[~] Deleting all models ...", percentage, progress_bar)
422
+ for model_name in all_models:
423
+ model_dir = os.path.join(RVC_MODELS_DIR, model_name)
424
+ if os.path.isdir(model_dir):
425
+ shutil.rmtree(model_dir)
426
+ return "[+] All models successfully deleted!"
src/cli.py ADDED
@@ -0,0 +1,219 @@
1
+ import argparse
2
+
3
+ from backend.generate_song_cover import run_pipeline
4
+
5
+ if __name__ == "__main__":
6
+ parser = argparse.ArgumentParser(
7
+ description="Generate a cover song in the song_output/id directory.",
8
+ add_help=True,
9
+ )
10
+ parser.add_argument(
11
+ "-i",
12
+ "--song-input",
13
+ type=str,
14
+ required=True,
15
+ help=(
16
+ "Link to a song on YouTube, the full path of a local audio file or a cached"
17
+ " input song"
18
+ ),
19
+ )
20
+ parser.add_argument(
21
+ "-dir",
22
+ "--rvc-dirname",
23
+ type=str,
24
+ required=True,
25
+ help=(
26
+ "Name of the folder in the models/rvc directory containing the RVC model"
27
+ " file and optional index file to use"
28
+ ),
29
+ )
30
+ parser.add_argument(
31
+ "-pv",
32
+ "--pitch-change-vocals",
33
+ type=int,
34
+ required=True,
35
+ help=(
36
+ "Shift the pitch of converted vocals only. Measured in octaves. Generally,"
37
+ " use 1 for male to female and -1 for vice-versa."
38
+ ),
39
+ )
40
+ parser.add_argument(
41
+ "-pall",
42
+ "--pitch-change-all",
43
+ type=int,
44
+ default=0,
45
+ help=(
46
+ "Shift pitch of converted vocals, backup vocals and instrumentals. Measured"
47
+ " in semi-tones. Altering this slightly reduces sound quality"
48
+ ),
49
+ )
50
+ parser.add_argument(
51
+ "-ir",
52
+ "--index-rate",
53
+ type=float,
54
+ default=0.5,
55
+ help=(
56
+ "A decimal number e.g. 0.5, used to reduce/resolve the timbre leakage"
57
+ " problem. If set to 1, more biased towards the timbre quality of the"
58
+ " training dataset"
59
+ ),
60
+ )
61
+ parser.add_argument(
62
+ "-fr",
63
+ "--filter-radius",
64
+ type=int,
65
+ default=3,
66
+ help=(
67
+ "A number between 0 and 7. If >=3: apply median filtering to the harvested"
68
+ " pitch results. The value represents the filter radius and can reduce"
69
+ " breathiness."
70
+ ),
71
+ )
72
+ parser.add_argument(
73
+ "-rms",
74
+ "--rms-mix-rate",
75
+ type=float,
76
+ default=0.25,
77
+ help=(
78
+ "A decimal number e.g. 0.25. Control how much to use the loudness of the"
79
+ " input vocals (0) or a fixed loudness (1)."
80
+ ),
81
+ )
82
+ parser.add_argument(
83
+ "-pro",
84
+ "--protect",
85
+ type=float,
86
+ default=0.33,
87
+ help=(
88
+ "A decimal number e.g. 0.33. Protect voiceless consonants and breath sounds"
89
+ " to prevent artifacts such as tearing in electronic music. Set to 0.5 to"
90
+ " disable. Decrease the value to increase protection, but it may reduce"
91
+ " indexing accuracy."
92
+ ),
93
+ )
94
+ parser.add_argument(
95
+ "-palgo",
96
+ "--pitch-detection-algo",
97
+ type=str,
98
+ default="rmvpe",
99
+ help=(
100
+ "Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother"
101
+ " vocals)."
102
+ ),
103
+ )
104
+ parser.add_argument(
105
+ "-hop",
106
+ "--crepe-hop-length",
107
+ type=int,
108
+ default=128,
109
+ help=(
110
+ "If pitch detection algo is mangio-crepe, controls how often it checks for"
111
+ " pitch changes in milliseconds. The higher the value, the faster the"
112
+ " conversion and less risk of voice cracks, but there is less pitch"
113
+ " accuracy. Recommended: 128."
114
+ ),
115
+ )
116
+ parser.add_argument(
117
+ "-rsize",
118
+ "--reverb-size",
119
+ type=float,
120
+ default=0.15,
121
+ help="Reverb room size between 0 and 1",
122
+ )
123
+ parser.add_argument(
124
+ "-rwet",
125
+ "--reverb-wetness",
126
+ type=float,
127
+ default=0.2,
128
+ help="Reverb wet level between 0 and 1",
129
+ )
130
+ parser.add_argument(
131
+ "-rdry",
132
+ "--reverb-dryness",
133
+ type=float,
134
+ default=0.8,
135
+ help="Reverb dry level between 0 and 1",
136
+ )
137
+ parser.add_argument(
138
+ "-rdamp",
139
+ "--reverb-damping",
140
+ type=float,
141
+ default=0.7,
142
+ help="Reverb damping between 0 and 1",
143
+ )
144
+ parser.add_argument(
145
+ "-mv",
146
+ "--main-vol",
147
+ type=int,
148
+ default=0,
149
+ help=(
150
+ "Volume change for converted main vocals. Measured in dB. Use -3 to"
151
+ " decrease by 3 dB and 3 to increase by 3 dB"
152
+ ),
153
+ )
154
+ parser.add_argument(
155
+ "-bv",
156
+ "--backup-vol",
157
+ type=int,
158
+ default=0,
159
+ help="Volume change for backup vocals. Measured in dB",
160
+ )
161
+ parser.add_argument(
162
+ "-iv",
163
+ "--inst-vol",
164
+ type=int,
165
+ default=0,
166
+ help="Volume change for instrumentals. Measured in dB",
167
+ )
168
+ parser.add_argument(
169
+ "-osr",
170
+ "--output-sr",
171
+ type=int,
172
+ default=44100,
173
+ help="Sample rate of output audio file.",
174
+ )
175
+ parser.add_argument(
176
+ "-oformat",
177
+ "--output-format",
178
+ type=str,
179
+ default="mp3",
180
+ help="format of output audio file",
181
+ )
182
+ parser.add_argument(
183
+ "-k",
184
+ "--keep-files",
185
+ action=argparse.BooleanOptionalAction,
186
+ default=True,
187
+ help=(
188
+ "Whether to keep song directory with intermediate audio files generated"
189
+ " during song cover generation."
190
+ ),
191
+ )
192
+ args = parser.parse_args()
193
+
194
+ rvc_dirname = args.rvc_dirname
195
+
196
+ song_cover_path = run_pipeline(
197
+ song_input=args.song_input,
198
+ voice_model=rvc_dirname,
199
+ pitch_change_vocals=args.pitch_change_vocals,
200
+ pitch_change_all=args.pitch_change_all,
201
+ index_rate=args.index_rate,
202
+ filter_radius=args.filter_radius,
203
+ rms_mix_rate=args.rms_mix_rate,
204
+ protect=args.protect,
205
+ f0_method=args.pitch_detection_algo,
206
+ crepe_hop_length=args.crepe_hop_length,
207
+ reverb_rm_size=args.reverb_size,
208
+ reverb_wet=args.reverb_wetness,
209
+ reverb_dry=args.reverb_dryness,
210
+ reverb_damping=args.reverb_damping,
211
+ main_gain=args.main_vol,
212
+ backup_gain=args.backup_vol,
213
+ inst_gain=args.inst_vol,
214
+ output_sr=args.output_sr,
215
+ output_format=args.output_format,
216
+ return_files=False,
217
+ progress_bar=None,
218
+ )
219
+ print(f"[+] Cover generated at {song_cover_path}")
src/common.py ADDED
@@ -0,0 +1,10 @@
1
+ """Common variables used in the Ultimate-RVC project."""
2
+
3
+ import os
4
+
5
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
6
+ MODELS_DIR = os.path.join(BASE_DIR, "models")
7
+ RVC_MODELS_DIR = os.path.join(MODELS_DIR, "rvc")
8
+ SEPARATOR_MODELS_DIR = os.path.join(MODELS_DIR, "audio_separator")
9
+ AUDIO_DIR = os.path.join(BASE_DIR, "audio")
10
+ GRADIO_TEMP_DIR = os.path.join(AUDIO_DIR, "gradio_temp")
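
A small sketch of how the constants above resolve relative to the project root:

    from common import BASE_DIR, RVC_MODELS_DIR, GRADIO_TEMP_DIR

    print(BASE_DIR)         # <project root>
    print(RVC_MODELS_DIR)   # <project root>/models/rvc
    print(GRADIO_TEMP_DIR)  # <project root>/audio/gradio_temp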
src/frontend/common.py ADDED
@@ -0,0 +1,466 @@
1
+ """
2
+ Module containing common utility functions and classes for the frontend.
3
+ """
4
+
5
+ from typing import Any, Callable, Concatenate, Literal, Sequence
6
+ from typings.extra import (
7
+ ComponentVisibilityKwArgs,
8
+ DropdownChoices,
9
+ DropdownValue,
10
+ F0Method,
11
+ P,
12
+ T,
13
+ TextBoxArgs,
14
+ UpdateDropdownArgs,
15
+ )
16
+
17
+ from dataclasses import dataclass
18
+ from functools import partial
19
+
20
+ import gradio as gr
21
+ from gradio.components.base import Component
22
+ from gradio.events import Dependency
23
+
24
+ from backend.generate_song_cover import get_named_song_dirs, get_song_cover_name
25
+ from backend.manage_audio import get_output_audio
26
+
27
+ PROGRESS_BAR = gr.Progress()
28
+
29
+
30
+ def exception_harness(fun: Callable[P, T]) -> Callable[P, T]:
31
+ """
32
+ Wrap a function in a harness that catches exceptions
33
+ and re-raises them as instances of `gradio.Error`.
34
+
35
+ Parameters
36
+ ----------
37
+ fun : Callable[P, T]
38
+ The function to wrap.
39
+
40
+ Returns
41
+ -------
42
+ Callable[P, T]
43
+ The wrapped function.
44
+ """
45
+
46
+ def _wrapped_fun(*args: P.args, **kwargs: P.kwargs) -> T:
47
+ try:
48
+ return fun(*args, **kwargs)
49
+ except Exception as e:
50
+ raise gr.Error(str(e))
51
+
52
+ return _wrapped_fun
53
+
54
+
55
+ def confirmation_harness(fun: Callable[P, T]) -> Callable[Concatenate[bool, P], T]:
56
+ """
57
+ Wrap a function in a harness that requires a confirmation
58
+ before executing and catches exceptions,
59
+ re-raising them as instances of `gradio.Error`.
60
+
61
+ Parameters
62
+ ----------
63
+ fun : Callable[P, T]
64
+ The function to wrap.
65
+
66
+ Returns
67
+ -------
68
+ Callable[Concatenate[bool, P], T]
69
+ The wrapped function.
70
+ """
71
+
72
+ def _wrapped_fun(confirm: bool, *args: P.args, **kwargs: P.kwargs) -> T:
73
+ if confirm:
74
+ return exception_harness(fun)(*args, **kwargs)
75
+ else:
76
+ raise gr.Error("Confirmation missing!")
77
+
78
+ return _wrapped_fun
79
+
80
+
81
+ def confirm_box_js(msg: str) -> str:
82
+ """
83
+ Generate JavaScript code for a confirmation box.
84
+
85
+ Parameters
86
+ ----------
87
+ msg : str
88
+ Message to display in the confirmation box.
89
+
90
+ Returns
91
+ -------
92
+ str
93
+ JavaScript code for the confirmation box.
94
+ """
95
+ formatted_msg = f"'{msg}'"
96
+ return f"(x) => confirm({formatted_msg})"
97
+
98
+
99
+ def identity(x: T) -> T:
100
+ """
101
+ Identity function.
102
+
103
+ Parameters
104
+ ----------
105
+ x : T
106
+ Value to return.
107
+
108
+ Returns
109
+ -------
110
+ T
111
+ The value.
112
+ """
113
+ return x
114
+
115
+
116
+ def update_value(x: Any) -> dict[str, Any]:
117
+ """
118
+ Update the value of a component.
119
+
120
+ Parameters
121
+ ----------
122
+ x : Any
123
+ New value for the component.
124
+
125
+ Returns
126
+ -------
127
+ dict[str, Any]
128
+ Dictionary which updates the value of the component.
129
+ """
130
+ return gr.update(value=x)
131
+
132
+
133
+ def update_dropdowns(
134
+ fn: Callable[P, DropdownChoices],
135
+ num_components: int,
136
+ value: DropdownValue = None,
137
+ value_indices: Sequence[int] = [],
138
+ *args: P.args,
139
+ **kwargs: P.kwargs,
140
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
141
+ """
142
+ Update the choices and optionally the value of one or more dropdown components.
143
+
144
+ Parameters
145
+ ----------
146
+ fn : Callable[P, DropdownChoices]
147
+ Function to get updated choices for the dropdown components.
148
+ num_components : int
149
+ Number of dropdown components to update.
150
+ value : DropdownValue, optional
151
+ New value for dropdown components.
152
+ value_indices : Sequence[int], default=[]
153
+ Indices of dropdown components to update the value for.
154
+ args : P.args
155
+ Positional arguments to pass to the function used to update choices.
156
+ kwargs : P.kwargs
157
+ Keyword arguments to pass to the function used to update choices.
158
+
159
+ Returns
160
+ -------
161
+ gr.Dropdown|tuple[gr.Dropdown,...]
162
+ Updated dropdown component or components.
163
+
164
+ Raises
165
+ ------
166
+ ValueError
167
+ If value indices are not unique or if an index exceeds the number of components.
168
+ """
169
+ if len(value_indices) != len(set(value_indices)):
170
+ raise ValueError("Value indices must be unique.")
171
+ if value_indices and max(value_indices) >= num_components:
172
+ raise ValueError(
173
+ "Index of a component to update value for exceeds number of components."
174
+ )
175
+ updated_choices = fn(*args, **kwargs)
176
+ update_args: list[UpdateDropdownArgs] = [
177
+ {"choices": updated_choices} for _ in range(num_components)
178
+ ]
179
+ for index in value_indices:
180
+ update_args[index]["value"] = value
181
+ if len(update_args) == 1:
182
+ # NOTE This is a workaround as gradio does not support
183
+ # singleton tuples for components.
184
+ return gr.Dropdown(**update_args[0])
185
+ return tuple(gr.Dropdown(**update_arg) for update_arg in update_args)
186
+
187
+
188
+ def update_cached_input_songs(
189
+ num_components: int, value: DropdownValue = None, value_indices: Sequence[int] = []
190
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
191
+ """
192
+ Update the choices of one or more dropdown components
193
+ to the current set of cached input songs.
194
+
195
+ Optionally updates the default value of one or more of these components.
196
+
197
+ Parameters
198
+ ----------
199
+ num_components : int
200
+ Number of dropdown components to update.
201
+ value : DropdownValue, optional
202
+ New value for dropdown components.
203
+ value_indices : Sequence[int], default=[]
204
+ Indices of dropdown components to update the value for.
205
+
206
+ Returns
207
+ -------
208
+ gr.Dropdown|tuple[gr.Dropdown,...]
209
+ Updated dropdown component or components.
210
+ """
211
+ return update_dropdowns(get_named_song_dirs, num_components, value, value_indices)
212
+
213
+
214
+ def update_output_audio(
215
+ num_components: int, value: DropdownValue = None, value_indices: Sequence[int] = []
216
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
217
+ """
218
+ Updates the choices of one or more dropdown
219
+ components to the current set of output audio files.
220
+
221
+ Optionally updates the default value of one or more of these components.
222
+
223
+ Parameters
224
+ ----------
225
+ num_components : int
226
+ Number of dropdown components to update.
227
+ value : DropdownValue, optional
228
+ New value for dropdown components.
229
+ value_indices : Sequence[int], default=[]
230
+ Indices of dropdown components to update the value for.
231
+
232
+ Returns
233
+ -------
234
+ gr.Dropdown|tuple[gr.Dropdown,...]
235
+ Updated dropdown component or components.
236
+ """
237
+ return update_dropdowns(get_output_audio, num_components, value, value_indices)
238
+
239
+
240
+ def toggle_visible_component(
241
+ num_components: int, visible_index: int
242
+ ) -> dict[str, Any] | tuple[dict[str, Any], ...]:
243
+ """
244
+ Reveal a single component from a set of components.
245
+ All other components are hidden.
246
+
247
+ Parameters
248
+ ----------
249
+ num_components : int
250
+ Number of components to set visibility for.
251
+ visible_index : int
252
+ Index of the component to reveal.
253
+
254
+ Returns
255
+ -------
256
+ dict|tuple[dict,...]
257
+ A single dictionary or a tuple of dictionaries
258
+ that update the visibility of the components.
259
+ """
260
+ if visible_index >= num_components:
261
+ raise ValueError("Visible index must be less than number of components.")
262
+ update_args: list[ComponentVisibilityKwArgs] = [
263
+ {"visible": False, "value": None} for _ in range(num_components)
264
+ ]
265
+ update_args[visible_index]["visible"] = True
266
+ if num_components == 1:
267
+ return gr.update(**update_args[0])
268
+ return tuple(gr.update(**update_arg) for update_arg in update_args)
269
+
270
+
271
+ def _toggle_component_interactivity(
272
+ num_components: int, interactive: bool
273
+ ) -> dict[str, Any] | tuple[dict[str, Any], ...]:
274
+ """
275
+ Toggle interactivity of one or more components.
276
+
277
+ Parameters
278
+ ----------
279
+ num_components : int
280
+ Number of components to toggle interactivity for.
281
+ interactive : bool
282
+ Whether to make the components interactive or not.
283
+
284
+ Returns
285
+ -------
286
+ dict|tuple[dict,...]
287
+ A single dictionary or a tuple of dictionaries
288
+ that update the interactivity of the components.
289
+ """
290
+ if num_components == 1:
291
+ return gr.update(interactive=interactive)
292
+ return tuple(gr.update(interactive=interactive) for _ in range(num_components))
293
+
294
+
295
+ def show_hop_slider(pitch_detection_algo: F0Method) -> gr.Slider:
296
+ """
297
+ Show or hide a slider component based on the given pitch extraction algorithm.
298
+
299
+ Parameters
300
+ ----------
301
+ pitch_detection_algo : F0Method
302
+ Pitch detection algorithm to determine visibility of the slider.
303
+
304
+ Returns
305
+ -------
306
+ gr.Slider
307
+ Slider component with visibility set accordingly.
308
+ """
309
+ if pitch_detection_algo == "mangio-crepe":
310
+ return gr.Slider(visible=True)
311
+ else:
312
+ return gr.Slider(visible=False)
313
+
314
+
315
+ def update_song_cover_name(
316
+ mixed_vocals: str | None = None,
317
+ song_dir: str | None = None,
318
+ voice_model: str | None = None,
319
+ update_placeholder: bool = False,
320
+ ) -> gr.Textbox:
321
+ """
322
+ Update a textbox component so that it displays a suitable name for a cover of
323
+ a given song.
324
+
325
+ If the path of an existing song directory is provided, the original song
326
+ name is inferred from that directory. If a voice model is not provided
327
+ but the path of an existing song directory and the path of a mixed vocals file
328
+ in that directory are provided, then the voice model is inferred from
329
+ the mixed vocals file.
330
+
331
+
332
+ Parameters
333
+ ----------
334
+ mixed_vocals : str, optional
335
+ The path to a mixed vocals file.
336
+ song_dir : str, optional
337
+ The path to a song directory.
338
+ voice_model : str, optional
339
+ The name of a voice model.
340
+ update_placeholder : bool, default=False
341
+ Whether to update the placeholder text of the textbox component.
342
+
343
+ Returns
344
+ -------
345
+ gr.Textbox
346
+ Updated textbox component.
347
+ """
348
+ update_args: TextBoxArgs = {}
349
+ update_key = "placeholder" if update_placeholder else "value"
350
+ if mixed_vocals or song_dir or voice_model:
351
+ name = exception_harness(get_song_cover_name)(
352
+ mixed_vocals, song_dir, voice_model, progress_bar=PROGRESS_BAR
353
+ )
354
+ update_args[update_key] = name
355
+ else:
356
+ update_args[update_key] = None
357
+ return gr.Textbox(**update_args)
358
+
359
+
360
+ @dataclass
361
+ class EventArgs:
362
+ """
363
+ Data class to store arguments for setting up event listeners.
364
+
365
+ Attributes
366
+ ----------
367
+ fn : Callable[..., Any]
368
+ Function to call when an event is triggered.
369
+ inputs : Sequence[Component], optional
370
+ Components to serve as inputs to the function.
371
+ outputs : Sequence[Component], optional
372
+ Components where to store the outputs of the function.
373
+ name : Literal["click", "success", "then"], default="success"
374
+ Name of the event to listen for.
375
+ show_progress : Literal["full", "minimal", "hidden"], default="full"
376
+ Level of progress bar to show when the event is triggered.
377
+ """
378
+
379
+ fn: Callable[..., Any]
380
+ inputs: Sequence[Component] | None = None
381
+ outputs: Sequence[Component] | None = None
382
+ name: Literal["click", "success", "then"] = "success"
383
+ show_progress: Literal["full", "minimal", "hidden"] = "full"
384
+
385
+
386
+ def setup_consecutive_event_listeners(
387
+ component: Component, event_args_list: list[EventArgs]
388
+ ) -> Dependency | Component:
389
+ """
390
+ Set up a chain of event listeners on a component.
391
+
392
+ Parameters
393
+ ----------
394
+ component : Component
395
+ The component to set up event listeners on.
396
+ event_args_list : list[EventArgs]
397
+ List of event arguments to set up event listeners with.
398
+
399
+ Returns
400
+ -------
401
+ Dependency | Component
402
+ The last dependency in the chain of event listeners.
403
+ """
404
+ if len(event_args_list) == 0:
405
+ raise ValueError("Event args list must not be empty.")
406
+ dependency = component
407
+ for event_args in event_args_list:
408
+ event_listener = getattr(dependency, event_args.name)
409
+ dependency = event_listener(
410
+ event_args.fn,
411
+ inputs=event_args.inputs,
412
+ outputs=event_args.outputs,
413
+ show_progress=event_args.show_progress,
414
+ )
415
+ return dependency
416
+
417
+
418
+ def setup_consecutive_event_listeners_with_toggled_interactivity(
419
+ component: Component,
420
+ event_args_list: list[EventArgs],
421
+ toggled_components: Sequence[Component],
422
+ ) -> Dependency | Component:
423
+ """
424
+ Set up a chain of event listeners on a component
425
+ with interactivity toggled for a set of other components.
426
+
427
+ While the chain of event listeners is being executed,
428
+ the other components are made non-interactive.
429
+ When the chain of event listeners is completed,
430
+ the other components are made interactive again.
431
+
432
+ Parameters
433
+ ----------
434
+ component : Component
435
+ The component to set up event listeners on.
436
+
437
+ event_args_list : list[EventArgs]
438
+ List of event arguments to set up event listeners with.
439
+
440
+ toggled_components : Sequence[Component]
441
+ Components to toggle interactivity for.
442
+
443
+ Returns
444
+ -------
445
+ Dependency | Component
446
+ The last dependency in the chain of event listeners.
447
+ """
448
+ if len(event_args_list) == 0:
449
+ raise ValueError("Event args list must not be empty.")
450
+
451
+ disable_event_args = EventArgs(
452
+ partial(_toggle_component_interactivity, len(toggled_components), False),
453
+ outputs=toggled_components,
454
+ name="click",
455
+ show_progress="hidden",
456
+ )
457
+ enable_event_args = EventArgs(
458
+ partial(_toggle_component_interactivity, len(toggled_components), True),
459
+ outputs=toggled_components,
460
+ name="then",
461
+ show_progress="hidden",
462
+ )
463
+ event_args_list_augmented = (
464
+ [disable_event_args] + event_args_list + [enable_event_args]
465
+ )
466
+ return setup_consecutive_event_listeners(component, event_args_list_augmented)
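
A minimal usage sketch of the event-chaining helpers above; the components and handler functions are illustrative and not taken from the app:

    import gradio as gr
    from frontend.common import (
        EventArgs,
        setup_consecutive_event_listeners_with_toggled_interactivity,
    )

    with gr.Blocks() as demo:
        name_box = gr.Textbox(label="Name")
        msg_box = gr.Textbox(label="Message", interactive=False)
        btn = gr.Button("Run")

        # Two chained events: greet, then upper-case the greeting. While the
        # chain runs, the name textbox is disabled and re-enabled afterwards.
        event_args_list = [
            EventArgs(lambda name: f"Hello, {name}!", inputs=[name_box], outputs=[msg_box]),
            EventArgs(lambda msg: msg.upper(), inputs=[msg_box], outputs=[msg_box]),
        ]
        setup_consecutive_event_listeners_with_toggled_interactivity(
            btn, event_args_list, [name_box]
        )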
src/frontend/tabs/manage_audio.py ADDED
@@ -0,0 +1,216 @@
1
+ """
2
+ This module contains the code for the "Delete audio" tab.
3
+ """
4
+
5
+ from functools import partial
6
+
7
+ import gradio as gr
8
+
9
+ from backend.manage_audio import (
10
+ delete_all_audio,
11
+ delete_all_intermediate_audio,
12
+ delete_all_output_audio,
13
+ delete_intermediate_audio,
14
+ delete_output_audio,
15
+ )
16
+
17
+ from frontend.common import (
18
+ PROGRESS_BAR,
19
+ confirm_box_js,
20
+ confirmation_harness,
21
+ identity,
22
+ update_cached_input_songs,
23
+ update_output_audio,
24
+ )
25
+
26
+
27
+ def render(
28
+ dummy_deletion_checkbox: gr.Checkbox,
29
+ delete_confirmation: gr.State,
30
+ song_dir_dropdowns: list[gr.Dropdown],
31
+ cached_input_songs_dropdown_1click: gr.Dropdown,
32
+ cached_input_songs_dropdown_multi: gr.Dropdown,
33
+ intermediate_audio_to_delete: gr.Dropdown,
34
+ output_audio_to_delete: gr.Dropdown,
35
+ ) -> None:
36
+ """
37
+ Render "Delete audio" tab.
38
+
39
+ Parameters
40
+ ----------
41
+ dummy_deletion_checkbox : gr.Checkbox
42
+ Dummy component needed for deletion confirmation in the
43
+ "Delete audio" tab and the "Manage models" tab.
44
+ delete_confirmation : gr.State
45
+ Component storing deletion confirmation status in the
46
+ "Delete audio" tab and the "Manage models" tab.
47
+ song_dir_dropdowns : list[gr.Dropdown]
48
+ Dropdowns for selecting song directories in the
49
+ "Multi-step generation" tab.
50
+ cached_input_songs_dropdown_1click : gr.Dropdown
51
+ Dropdown for selecting cached input songs in the
52
+ "One-click generation" tab
53
+ cached_input_songs_dropdown_multi : gr.Dropdown
54
+ Dropdown for selecting cached input songs in the
55
+ "Multi-step generation" tab
56
+ intermediate_audio_to_delete : gr.Dropdown
57
+ Dropdown for selecting intermediate audio files to delete in the
58
+ "Delete audio" tab.
59
+ output_audio_to_delete : gr.Dropdown
60
+ Dropdown for selecting output audio files to delete in the
61
+ "Delete audio" tab.
62
+ """
63
+ with gr.Tab("Delete audio"):
64
+ with gr.Accordion("Intermediate audio", open=False):
65
+ with gr.Row():
66
+ with gr.Column():
67
+ intermediate_audio_to_delete.render()
68
+ delete_intermediate_audio_btn = gr.Button(
69
+ "Delete selected", variant="secondary"
70
+ )
71
+ delete_all_intermediate_audio_btn = gr.Button(
72
+ "Delete all", variant="primary"
73
+ )
74
+ with gr.Row():
75
+ intermediate_audio_delete_msg = gr.Textbox(
76
+ label="Output message", interactive=False
77
+ )
78
+ with gr.Accordion("Output audio", open=False):
79
+ with gr.Row():
80
+ with gr.Column():
81
+ output_audio_to_delete.render()
82
+ delete_output_audio_btn = gr.Button(
83
+ "Delete selected", variant="secondary"
84
+ )
85
+ delete_all_output_audio_btn = gr.Button(
86
+ "Delete all", variant="primary"
87
+ )
88
+ with gr.Row():
89
+ output_audio_delete_msg = gr.Textbox(
90
+ label="Output message", interactive=False
91
+ )
92
+ with gr.Accordion("All audio", open=True):
93
+ with gr.Row():
94
+ delete_all_audio_btn = gr.Button("Delete", variant="primary")
95
+ delete_all_audio_msg = gr.Textbox(
96
+ label="Output message", interactive=False
97
+ )
98
+
99
+ delete_intermediate_audio_click = delete_intermediate_audio_btn.click(
100
+ identity,
101
+ inputs=dummy_deletion_checkbox,
102
+ outputs=delete_confirmation,
103
+ js=confirm_box_js(
104
+ "Are you sure you want to delete intermediate audio files for the"
105
+ " selected songs?"
106
+ ),
107
+ show_progress="hidden",
108
+ ).then(
109
+ partial(
110
+ confirmation_harness(delete_intermediate_audio),
111
+ progress_bar=PROGRESS_BAR,
112
+ ),
113
+ inputs=[delete_confirmation, intermediate_audio_to_delete],
114
+ outputs=intermediate_audio_delete_msg,
115
+ )
116
+
117
+ delete_all_intermediate_audio_click = delete_all_intermediate_audio_btn.click(
118
+ identity,
119
+ inputs=dummy_deletion_checkbox,
120
+ outputs=delete_confirmation,
121
+ js=confirm_box_js(
122
+ "Are you sure you want to delete all intermediate audio files?"
123
+ ),
124
+ show_progress="hidden",
125
+ ).then(
126
+ partial(
127
+ confirmation_harness(delete_all_intermediate_audio),
128
+ progress_bar=PROGRESS_BAR,
129
+ ),
130
+ inputs=delete_confirmation,
131
+ outputs=intermediate_audio_delete_msg,
132
+ )
133
+
134
+ delete_output_audio_click = delete_output_audio_btn.click(
135
+ identity,
136
+ inputs=dummy_deletion_checkbox,
137
+ outputs=delete_confirmation,
138
+ js=confirm_box_js(
139
+ "Are you sure you want to delete the selected output audio files?"
140
+ ),
141
+ show_progress="hidden",
142
+ ).then(
143
+ partial(
144
+ confirmation_harness(delete_output_audio),
145
+ progress_bar=PROGRESS_BAR,
146
+ ),
147
+ inputs=[delete_confirmation, output_audio_to_delete],
148
+ outputs=output_audio_delete_msg,
149
+ )
150
+
151
+ delete_all_output_audio_click = delete_all_output_audio_btn.click(
152
+ identity,
153
+ inputs=dummy_deletion_checkbox,
154
+ outputs=delete_confirmation,
155
+ js=confirm_box_js(
156
+ "Are you sure you want to delete all output audio files?"
157
+ ),
158
+ show_progress="hidden",
159
+ ).then(
160
+ partial(
161
+ confirmation_harness(delete_all_output_audio), progress_bar=PROGRESS_BAR
162
+ ),
163
+ inputs=delete_confirmation,
164
+ outputs=output_audio_delete_msg,
165
+ )
166
+
167
+ delete_all_audio_click = delete_all_audio_btn.click(
168
+ identity,
169
+ inputs=dummy_deletion_checkbox,
170
+ outputs=delete_confirmation,
171
+ js=confirm_box_js("Are you sure you want to delete all audio files?"),
172
+ show_progress="hidden",
173
+ ).then(
174
+ partial(confirmation_harness(delete_all_audio), progress_bar=PROGRESS_BAR),
175
+ inputs=delete_confirmation,
176
+ outputs=delete_all_audio_msg,
177
+ )
178
+
179
+ for click_event in [
180
+ delete_intermediate_audio_click,
181
+ delete_all_intermediate_audio_click,
182
+ ]:
183
+ click_event.success(
184
+ partial(
185
+ update_cached_input_songs, 3 + len(song_dir_dropdowns), [], [0]
186
+ ),
187
+ outputs=[
188
+ intermediate_audio_to_delete,
189
+ cached_input_songs_dropdown_1click,
190
+ cached_input_songs_dropdown_multi,
191
+ *song_dir_dropdowns,
192
+ ],
193
+ show_progress="hidden",
194
+ )
195
+
196
+ for click_event in [delete_output_audio_click, delete_all_output_audio_click]:
197
+ click_event.success(
198
+ partial(update_output_audio, 1, [], [0]),
199
+ outputs=[output_audio_to_delete],
200
+ show_progress="hidden",
201
+ )
202
+
203
+ delete_all_audio_click.success(
204
+ partial(update_output_audio, 1, [], [0]),
205
+ outputs=[output_audio_to_delete],
206
+ show_progress="hidden",
207
+ ).then(
208
+ partial(update_cached_input_songs, 3 + len(song_dir_dropdowns), [], [0]),
209
+ outputs=[
210
+ intermediate_audio_to_delete,
211
+ cached_input_songs_dropdown_1click,
212
+ cached_input_songs_dropdown_multi,
213
+ *song_dir_dropdowns,
214
+ ],
215
+ show_progress="hidden",
216
+ )
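
The "Delete audio" tab above wires every destructive button through the same two-step pattern: the click first runs a browser confirm dialog via confirm_box_js, routes the result through identity into the delete_confirmation state, and only then calls the backend deletion function wrapped in confirmation_harness. Below is a minimal, self-contained sketch of that pattern; the bodies of confirm_box_js and confirmation_harness are assumptions made for illustration (the real implementations live in frontend/common.py and are not shown in this part of the diff), and delete_everything is a hypothetical stand-in for the backend deletion functions.

import gradio as gr


def confirm_box_js(msg: str) -> str:
    # Assumed shape: a JS function string whose return value (the result of
    # window.confirm) replaces the dummy checkbox value before the Python
    # function runs.
    return f"(x) => confirm('{msg}')"


def confirmation_harness(fn):
    # Assumed shape: run fn only if the user pressed OK in the dialog.
    def wrapped(confirmed: bool, *args):
        if not confirmed:
            raise gr.Error("Deletion was not confirmed.")
        return fn(*args)

    return wrapped


def delete_everything() -> str:
    return "All audio deleted."  # hypothetical stand-in for the backend call


with gr.Blocks() as demo:
    dummy_checkbox = gr.Checkbox(value=True, visible=False)
    confirmation = gr.State(False)
    delete_btn = gr.Button("Delete all", variant="primary")
    msg_box = gr.Textbox(label="Output message", interactive=False)

    delete_btn.click(
        lambda x: x,  # identity: moves the JS confirm result into gr.State
        inputs=dummy_checkbox,
        outputs=confirmation,
        js=confirm_box_js("Are you sure you want to delete all audio files?"),
        show_progress="hidden",
    ).then(
        confirmation_harness(delete_everything),
        inputs=confirmation,
        outputs=msg_box,
    )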
src/frontend/tabs/manage_models.py ADDED
@@ -0,0 +1,302 @@
1
+ """
2
+ This module contains the code for the "Manage models" tab.
3
+ """
4
+
5
+ from typings.extra import DropdownValue
6
+
7
+ from functools import partial
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+
12
+ from backend.manage_voice_models import (
13
+ delete_all_models,
14
+ delete_models,
15
+ download_online_model,
16
+ filter_public_models_table,
17
+ get_current_models,
18
+ load_public_model_tags,
19
+ load_public_models_table,
20
+ upload_local_model,
21
+ )
22
+
23
+ from frontend.common import (
24
+ PROGRESS_BAR,
25
+ confirm_box_js,
26
+ confirmation_harness,
27
+ exception_harness,
28
+ identity,
29
+ update_dropdowns,
30
+ )
31
+
32
+
33
+ def _update_model_lists(
34
+ num_components: int, value: DropdownValue = None, value_indices: list[int] = []
35
+ ) -> gr.Dropdown | tuple[gr.Dropdown, ...]:
36
+ """
37
+ Updates the choices of one or more dropdown
38
+ components to the current set of voice models.
39
+
40
+ Optionally updates the default value of one or more of these components.
41
+
42
+ Parameters
43
+ ----------
44
+ num_components : int
45
+ Number of dropdown components to update.
46
+ value : DropdownValue, optional
47
+ New value for dropdown components.
48
+ value_indices : list[int], default=[]
49
+ Indices of dropdown components to update the value for.
50
+
51
+ Returns
52
+ -------
53
+ gr.Dropdown | tuple[gr.Dropdown, ...]
54
+ Updated dropdown component or components.
55
+ """
56
+ return update_dropdowns(get_current_models, num_components, value, value_indices)
57
+
58
+
59
+ def _filter_public_models_table_harness(
60
+ tags: list[str], query: str, progress_bar: gr.Progress
61
+ ) -> gr.Dataframe:
62
+ """
63
+ Filter the public models table based on tags and search query.
64
+
65
+ Parameters
66
+ ----------
67
+ tags : list[str]
68
+ Tags to filter the table by.
69
+ query : str
70
+ Search query to filter the table by.
71
+ progress_bar : gr.Progress
72
+ Progress bar to display progress.
73
+
74
+ Returns
75
+ -------
76
+ gr.Dataframe
77
+ The filtered public models table rendered in a Gradio dataframe.
78
+ """
79
+ models_table = filter_public_models_table(tags, query, progress_bar)
80
+ return gr.Dataframe(value=models_table)
81
+
82
+
83
+ def _pub_dl_autofill(
84
+ pub_models: pd.DataFrame, event: gr.SelectData
85
+ ) -> tuple[gr.Textbox, gr.Textbox]:
86
+ """
87
+ Autofill download link and model name based on selected row in public models table.
88
+
89
+ Parameters
90
+ ----------
91
+ pub_models : pd.DataFrame
92
+ Public models table.
93
+ event : gr.SelectData
94
+ Event containing the selected row.
95
+
96
+ Returns
97
+ -------
98
+ download_link : gr.Textbox
99
+ Autofilled download link.
100
+ model_name : gr.Textbox
101
+ Autofilled model name.
102
+ """
103
+ event_index = event.index[0]
104
+ url_str = pub_models.loc[event_index, "URL"]
105
+ model_str = pub_models.loc[event_index, "Model Name"]
106
+
107
+ return gr.Textbox(value=url_str), gr.Textbox(value=model_str)
108
+
109
+
110
+ def render(
111
+ dummy_deletion_checkbox: gr.Checkbox,
112
+ delete_confirmation: gr.State,
113
+ rvc_models_to_delete: gr.Dropdown,
114
+ rvc_model_1click: gr.Dropdown,
115
+ rvc_model_multi: gr.Dropdown,
116
+ ) -> None:
117
+ """
118
+ Render "Manage models" tab.
119
+
120
+ Parameters
121
+ ----------
122
+ dummy_deletion_checkbox : gr.Checkbox
123
+ Dummy component needed for deletion confirmation in the
124
+ "Manage audio" tab and the "Manage models" tab.
125
+ delete_confirmation : gr.State
126
+ Component storing deletion confirmation status in the
127
+ "Manage audio" tab and the "Manage models" tab.
128
+ rvc_models_to_delete : gr.Dropdown
129
+ Dropdown for selecting models to delete in the
130
+ "Manage models" tab.
131
+ rvc_model_1click : gr.Dropdown
132
+ Dropdown for selecting models in the "One-click generation" tab.
133
+ rvc_model_multi : gr.Dropdown
134
+ Dropdown for selecting models in the "Multi-step generation" tab.
135
+ """
136
+
137
+ # Download tab
138
+ with gr.Tab("Download model"):
139
+
140
+ with gr.Accordion("View public models table", open=False):
141
+
142
+ gr.Markdown("")
143
+ gr.Markdown("HOW TO USE")
144
+ gr.Markdown("- Filter models using tags or search bar")
145
+ gr.Markdown("- Select a row to autofill the download link and model name")
146
+
147
+ filter_tags = gr.CheckboxGroup(
148
+ value=[],
149
+ label="Show voice models with tags",
150
+ choices=load_public_model_tags(),
151
+ )
152
+ search_query = gr.Textbox(label="Search")
153
+
154
+ public_models_table = gr.DataFrame(
155
+ value=load_public_models_table([]),
156
+ headers=["Model Name", "Description", "Tags", "Credit", "Added", "URL"],
157
+ label="Available Public Models",
158
+ interactive=False,
159
+ )
160
+
161
+ with gr.Row():
162
+ model_zip_link = gr.Textbox(
163
+ label="Download link to model",
164
+ info=(
165
+ "Should point to a zip file containing a .pth model file and an"
166
+ " optional .index file."
167
+ ),
168
+ )
169
+ model_name = gr.Textbox(
170
+ label="Model name", info="Enter a unique name for the model."
171
+ )
172
+
173
+ with gr.Row():
174
+ download_btn = gr.Button("Download 🌐", variant="primary", scale=19)
175
+ dl_output_message = gr.Textbox(
176
+ label="Output message", interactive=False, scale=20
177
+ )
178
+
179
+ download_button_click = download_btn.click(
180
+ partial(
181
+ exception_harness(download_online_model), progress_bar=PROGRESS_BAR
182
+ ),
183
+ inputs=[model_zip_link, model_name],
184
+ outputs=dl_output_message,
185
+ )
186
+
187
+ public_models_table.select(
188
+ _pub_dl_autofill,
189
+ inputs=public_models_table,
190
+ outputs=[model_zip_link, model_name],
191
+ show_progress="hidden",
192
+ )
193
+ search_query.change(
194
+ partial(
195
+ exception_harness(_filter_public_models_table_harness),
196
+ progress_bar=PROGRESS_BAR,
197
+ ),
198
+ inputs=[filter_tags, search_query],
199
+ outputs=public_models_table,
200
+ show_progress="hidden",
201
+ )
202
+ filter_tags.select(
203
+ partial(
204
+ exception_harness(_filter_public_models_table_harness),
205
+ progress_bar=PROGRESS_BAR,
206
+ ),
207
+ inputs=[filter_tags, search_query],
208
+ outputs=public_models_table,
209
+ show_progress="hidden",
210
+ )
211
+
212
+ # Upload tab
213
+ with gr.Tab("Upload model"):
214
+ with gr.Accordion("HOW TO USE"):
215
+ gr.Markdown(
216
+ "- Find locally trained RVC v2 model file (weights folder) and optional"
217
+ " index file (logs/[name] folder)"
218
+ )
219
+ gr.Markdown(
220
+ "- Upload model file and optional index file directly or compress into"
221
+ " a zip file and upload that"
222
+ )
223
+ gr.Markdown("- Enter a unique name for the model")
224
+ gr.Markdown("- Click 'Upload model'")
225
+
226
+ with gr.Row():
227
+ with gr.Column():
228
+ model_files = gr.File(label="Files", file_count="multiple")
229
+
230
+ local_model_name = gr.Textbox(label="Model name")
231
+
232
+ with gr.Row():
233
+ model_upload_button = gr.Button("Upload model", variant="primary", scale=19)
234
+ local_upload_output_message = gr.Textbox(
235
+ label="Output message", interactive=False, scale=20
236
+ )
237
+ model_upload_button_click = model_upload_button.click(
238
+ partial(
239
+ exception_harness(upload_local_model), progress_bar=PROGRESS_BAR
240
+ ),
241
+ inputs=[model_files, local_model_name],
242
+ outputs=local_upload_output_message,
243
+ )
244
+
245
+ with gr.Tab("Delete models"):
246
+ with gr.Row():
247
+ with gr.Column():
248
+ rvc_models_to_delete.render()
249
+ with gr.Column():
250
+ rvc_models_deleted_message = gr.Textbox(
251
+ label="Output message", interactive=False
252
+ )
253
+
254
+ with gr.Row():
255
+ with gr.Column():
256
+ delete_models_button = gr.Button(
257
+ "Delete selected models", variant="secondary"
258
+ )
259
+ delete_all_models_button = gr.Button(
260
+ "Delete all models", variant="primary"
261
+ )
262
+ with gr.Column():
263
+ pass
264
+ delete_models_button_click = delete_models_button.click(
265
+ # NOTE not sure why, but in order for subsequent event listener
266
+ # to trigger, changes coming from the js code
267
+ # have to be routed through an identity function which takes as
268
+ # input some dummy component of type bool.
269
+ identity,
270
+ inputs=dummy_deletion_checkbox,
271
+ outputs=delete_confirmation,
272
+ js=confirm_box_js("Are you sure you want to delete the selected models?"),
273
+ show_progress="hidden",
274
+ ).then(
275
+ partial(confirmation_harness(delete_models), progress_bar=PROGRESS_BAR),
276
+ inputs=[delete_confirmation, rvc_models_to_delete],
277
+ outputs=rvc_models_deleted_message,
278
+ )
279
+
280
+ delete_all_models_btn_click = delete_all_models_button.click(
281
+ identity,
282
+ inputs=dummy_deletion_checkbox,
283
+ outputs=delete_confirmation,
284
+ js=confirm_box_js("Are you sure you want to delete all models?"),
285
+ show_progress="hidden",
286
+ ).then(
287
+ partial(confirmation_harness(delete_all_models), progress_bar=PROGRESS_BAR),
288
+ inputs=delete_confirmation,
289
+ outputs=rvc_models_deleted_message,
290
+ )
291
+
292
+ for click_event in [
293
+ download_button_click,
294
+ model_upload_button_click,
295
+ delete_models_button_click,
296
+ delete_all_models_btn_click,
297
+ ]:
298
+ click_event.success(
299
+ partial(_update_model_lists, 3, [], [2]),
300
+ outputs=[rvc_model_1click, rvc_model_multi, rvc_models_to_delete],
301
+ show_progress="hidden",
302
+ )
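
Both tab modules shown so far receive their shared dropdowns as arguments and only place them in the layout with .render(). A short sketch of the underlying Gradio pattern follows, with made-up component and function names; the actual wiring is presumably done in src/app.py, which is part of this commit but not shown in this excerpt.

import gradio as gr


def render_tab(shared_models_dropdown: gr.Dropdown) -> None:
    # Place the shared, pre-built component inside this tab's layout.
    with gr.Tab("Example tab"):
        shared_models_dropdown.render()


with gr.Blocks() as app:
    # Built with render=False so several tabs and event listeners can share it.
    models_dropdown = gr.Dropdown(
        choices=["model A", "model B"], label="Voice model", render=False
    )
    render_tab(models_dropdown)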
src/frontend/tabs/multi_step_generation.py ADDED
@@ -0,0 +1,991 @@
1
+ """
2
+ This module contains the code for the "Multi-step generation" tab.
3
+ """
4
+
5
+ from typings.extra import TransferUpdateArgs
6
+
7
+ from functools import partial
8
+
9
+ import gradio as gr
10
+
11
+ from backend.generate_song_cover import (
12
+ convert_vocals,
13
+ dereverb_vocals,
14
+ mix_song_cover,
15
+ pitch_shift_background,
16
+ postprocess_vocals,
17
+ retrieve_song,
18
+ separate_main_vocals,
19
+ separate_vocals,
20
+ )
21
+
22
+ from frontend.common import (
23
+ PROGRESS_BAR,
24
+ EventArgs,
25
+ exception_harness,
26
+ setup_consecutive_event_listeners_with_toggled_interactivity,
27
+ show_hop_slider,
28
+ toggle_visible_component,
29
+ update_cached_input_songs,
30
+ update_output_audio,
31
+ update_song_cover_name,
32
+ update_value,
33
+ )
34
+
35
+
36
+ def _update_audio(
37
+ num_components: int, output_indices: list[int], file_path: str
38
+ ) -> gr.Audio | tuple[gr.Audio, ...]:
39
+ """
40
+ Update the value of a subset of `Audio` components to the given audio file path.
41
+
42
+ Parameters
43
+ ----------
44
+ num_components : int
45
+ The total number of `Audio` components under consideration.
46
+ output_indices : list[int]
47
+ Indices of `Audio` components to update the value for.
48
+ file_path : str
49
+ Path pointing to an audio track to update the indexed `Audio` components with.
50
+
51
+ Returns
52
+ -------
53
+ gr.Audio | tuple[gr.Audio, ...]
54
+ Each `Audio` component under consideration
55
+ with indexed components updated to the given audio file path.
56
+ """
57
+ update_args: list[TransferUpdateArgs] = [{} for _ in range(num_components)]
58
+ for index in output_indices:
59
+ update_args[index]["value"] = file_path
60
+ if num_components == 1:
61
+ return gr.Audio(**update_args[0])
62
+ return tuple(gr.Audio(**update_arg) for update_arg in update_args)
63
+
64
+
65
+ def render(
66
+ generate_buttons: list[gr.Button],
67
+ song_dir_dropdowns: list[gr.Dropdown],
68
+ cached_input_songs_dropdown_1click: gr.Dropdown,
69
+ cached_input_songs_dropdown_multi: gr.Dropdown,
70
+ rvc_model: gr.Dropdown,
71
+ intermediate_audio_to_delete: gr.Dropdown,
72
+ output_audio_to_remove: gr.Dropdown,
73
+ ) -> None:
74
+ """
75
+ Render "Multi-step generation" tab.
76
+
77
+ Parameters
78
+ ----------
79
+ generate_buttons : list[gr.Button]
80
+ Buttons used for audio generation in the
81
+ "One-click generation" tab and the "Multi-step generation" tab.
82
+ song_dir_dropdowns : list[gr.Dropdown]
83
+ Dropdowns for selecting song directories in the
84
+ "Multi-step generation" tab.
85
+ cached_input_songs_dropdown_1click : gr.Dropdown
86
+ Dropdown for selecting cached input songs in the
87
+ "One-click generation" tab.
88
+ cached_input_songs_dropdown_multi : gr.Dropdown
89
+ Dropdown for selecting cached input songs in the
90
+ "Multi-step generation" tab.
91
+ rvc_model : gr.Dropdown
92
+ Dropdown for selecting voice models in the
93
+ "Multi-step generation" tab.
94
+ intermediate_audio_to_delete : gr.Dropdown
95
+ Dropdown for selecting intermediate audio files to delete in the
96
+ "Delete audio" tab.
97
+ output_audio_to_remove : gr.Dropdown
98
+ Dropdown for selecting output audio files to delete in the
99
+ "Delete audio" tab.
100
+ """
101
+ with gr.Tab("Multi-step generation"):
102
+ (
103
+ retrieve_song_btn,
104
+ separate_vocals_btn,
105
+ separate_main_vocals_btn,
106
+ dereverb_vocals_btn,
107
+ convert_vocals_btn,
108
+ postprocess_vocals_btn,
109
+ pitch_shift_background_btn,
110
+ mix_btn,
111
+ _,
112
+ ) = generate_buttons
113
+ (
114
+ separate_vocals_dir,
115
+ separate_main_vocals_dir,
116
+ dereverb_vocals_dir,
117
+ convert_vocals_dir,
118
+ postprocess_vocals_dir,
119
+ pitch_shift_background_dir,
120
+ mix_dir,
121
+ ) = song_dir_dropdowns
122
+ current_song_dir = gr.State(None)
123
+
124
+ (
125
+ original_track_output,
126
+ vocals_track_output,
127
+ instrumentals_track_output,
128
+ main_vocals_track_output,
129
+ backup_vocals_track_output,
130
+ dereverbed_vocals_track_output,
131
+ reverb_track_output,
132
+ converted_vocals_track_output,
133
+ postprocessed_vocals_track_output,
134
+ shifted_instrumentals_track_output,
135
+ shifted_backup_vocals_track_output,
136
+ song_cover_track,
137
+ ) = [
138
+ gr.Audio(label=label, type="filepath", interactive=False, render=False)
139
+ for label in [
140
+ "Input song",
141
+ "Vocals",
142
+ "Instrumentals",
143
+ "Main vocals",
144
+ "Backup vocals",
145
+ "De-reverbed vocals",
146
+ "Reverb",
147
+ "Converted vocals",
148
+ "Post-processed vocals",
149
+ "Pitch-shifted instrumentals",
150
+ "Pitch-shifted backup vocals",
151
+ "Song cover",
152
+ ]
153
+ ]
154
+ input_tracks = [
155
+ gr.Audio(label=label, type="filepath", render=False)
156
+ for label in [
157
+ "Input song",
158
+ "Vocals",
159
+ "Vocals",
160
+ "Vocals",
161
+ "Vocals",
162
+ "Instrumentals",
163
+ "Backup vocals",
164
+ "Main vocals",
165
+ "Instrumentals",
166
+ "Backup vocals",
167
+ ]
168
+ ]
169
+ (
170
+ original_track_input,
171
+ vocals_track_input,
172
+ main_vocals_track_input,
173
+ dereverbed_vocals_track_input,
174
+ converted_vocals_track_input,
175
+ instrumentals_track_input,
176
+ backup_vocals_track_input,
177
+ postprocessed_vocals_track_input,
178
+ shifted_instrumentals_track_input,
179
+ shifted_backup_vocals_track_input,
180
+ ) = input_tracks
181
+
182
+ transfer_defaults = [
183
+ ["Step 1: input song"],
184
+ ["Step 2: vocals"],
185
+ ["Step 6: instrumentals"],
186
+ ["Step 3: vocals"],
187
+ ["Step 6: backup vocals"],
188
+ ["Step 4: vocals"],
189
+ [],
190
+ ["Step 5: vocals"],
191
+ ["Step 7: main vocals"],
192
+ ["Step 7: instrumentals"],
193
+ ["Step 7: backup vocals"],
194
+ [],
195
+ ]
196
+
197
+ (
198
+ original_track_transfer_default,
199
+ vocals_track_transfer_default,
200
+ instrumentals_track_transfer_default,
201
+ main_vocals_track_transfer_default,
202
+ backup_vocals_track_transfer_default,
203
+ dereverbed_vocals_track_transfer_default,
204
+ reverb_track_transfer_default,
205
+ converted_vocals_track_transfer_default,
206
+ postprocessed_vocals_track_transfer_default,
207
+ shifted_instrumentals_track_transfer_default,
208
+ shifted_backup_vocals_track_transfer_default,
209
+ song_cover_track_transfer_default,
210
+ ) = transfer_defaults
211
+
212
+ transfer_output_track_dropdowns = [
213
+ gr.Dropdown(
214
+ [
215
+ "Step 1: input song",
216
+ "Step 2: vocals",
217
+ "Step 3: vocals",
218
+ "Step 4: vocals",
219
+ "Step 5: vocals",
220
+ "Step 6: instrumentals",
221
+ "Step 6: backup vocals",
222
+ "Step 7: main vocals",
223
+ "Step 7: instrumentals",
224
+ "Step 7: backup vocals",
225
+ ],
226
+ label="Transfer to",
227
+ info=(
228
+ "Select the input track(s) to transfer the output track to once"
229
+ " generation completes."
230
+ ),
231
+ render=False,
232
+ type="index",
233
+ multiselect=True,
234
+ value=value,
235
+ )
236
+ for value in transfer_defaults
237
+ ]
238
+
239
+ (
240
+ original_track_transfer_dropdown,
241
+ vocals_track_transfer_dropdown,
242
+ instrumentals_track_transfer_dropdown,
243
+ main_vocals_track_transfer_dropdown,
244
+ backup_vocals_track_transfer_dropdown,
245
+ dereverbed_vocals_track_transfer_dropdown,
246
+ reverb_track_transfer_dropdown,
247
+ converted_vocals_track_transfer_dropdown,
248
+ postprocessed_vocals_track_transfer_dropdown,
249
+ shifted_instrumentals_track_transfer_dropdown,
250
+ shifted_backup_vocals_track_transfer_dropdown,
251
+ song_cover_track_transfer_dropdown,
252
+ ) = transfer_output_track_dropdowns
253
+
254
+ clear_btns = [gr.Button(value="Reset settings", render=False) for _ in range(8)]
255
+ (
256
+ retrieve_song_clear_btn,
257
+ separate_vocals_clear_btn,
258
+ separate_main_vocals_clear_btn,
259
+ dereverb_vocals_clear_btn,
260
+ convert_vocals_clear_btn,
261
+ postprocess_vocals_clear_btn,
262
+ pitch_shift_background_clear_btn,
263
+ mix_clear_btn,
264
+ ) = clear_btns
265
+
266
+ with gr.Accordion("Step 0: song retrieval", open=True):
267
+ gr.Markdown("")
268
+ gr.Markdown("**Inputs**")
269
+ with gr.Row():
270
+ with gr.Column():
271
+ song_input_type_dropdown = gr.Dropdown(
272
+ [
273
+ "YouTube link/local path",
274
+ "Local file/microphone",
275
+ "Cached song",
276
+ ],
277
+ value="YouTube link/local path",
278
+ label="Song input type",
279
+ type="index",
280
+ )
281
+ with gr.Column():
282
+ song_input = gr.Textbox(
283
+ label="Song input",
284
+ info=(
285
+ "Link to a song on YouTube or the full path of a local"
286
+ " audio file."
287
+ ),
288
+ )
289
+ local_file = gr.Audio(
290
+ label="Song input", type="filepath", visible=False
291
+ )
292
+ cached_input_songs_dropdown_multi.render()
293
+
294
+ song_input_type_dropdown.input(
295
+ partial(toggle_visible_component, 3),
296
+ inputs=song_input_type_dropdown,
297
+ outputs=[song_input, local_file, cached_input_songs_dropdown_multi],
298
+ show_progress="hidden",
299
+ )
300
+
301
+ local_file.change(
302
+ update_value,
303
+ inputs=local_file,
304
+ outputs=song_input,
305
+ show_progress="hidden",
306
+ )
307
+ cached_input_songs_dropdown_multi.input(
308
+ update_value,
309
+ inputs=cached_input_songs_dropdown_multi,
310
+ outputs=song_input,
311
+ show_progress="hidden",
312
+ )
313
+ gr.Markdown("**Outputs**")
314
+ original_track_output.render()
315
+ original_track_transfer_dropdown.render()
316
+ retrieve_song_clear_btn.render()
317
+ retrieve_song_clear_btn.click(
318
+ lambda: gr.Dropdown(value=original_track_transfer_default),
319
+ outputs=[original_track_transfer_dropdown],
320
+ show_progress="hidden",
321
+ )
322
+
323
+ retrieve_song_btn.render()
324
+
325
+ retrieve_song_event_args_list = [
326
+ EventArgs(
327
+ partial(
328
+ exception_harness(retrieve_song), progress_bar=PROGRESS_BAR
329
+ ),
330
+ inputs=[song_input],
331
+ outputs=[original_track_output, current_song_dir],
332
+ ),
333
+ EventArgs(
334
+ partial(
335
+ update_cached_input_songs,
336
+ len(song_dir_dropdowns) + 2,
337
+ value_indices=range(len(song_dir_dropdowns) + 1),
338
+ ),
339
+ inputs=[current_song_dir],
340
+ outputs=(
341
+ song_dir_dropdowns
342
+ + [
343
+ cached_input_songs_dropdown_multi,
344
+ cached_input_songs_dropdown_1click,
345
+ ]
346
+ ),
347
+ name="then",
348
+ show_progress="hidden",
349
+ ),
350
+ EventArgs(
351
+ partial(update_cached_input_songs, 1, [], [0]),
352
+ outputs=[intermediate_audio_to_delete],
353
+ name="then",
354
+ show_progress="hidden",
355
+ ),
356
+ EventArgs(
357
+ partial(_update_audio, len(input_tracks)),
358
+ inputs=[original_track_transfer_dropdown, original_track_output],
359
+ outputs=input_tracks,
360
+ name="then",
361
+ show_progress="hidden",
362
+ ),
363
+ ]
364
+ setup_consecutive_event_listeners_with_toggled_interactivity(
365
+ retrieve_song_btn,
366
+ retrieve_song_event_args_list,
367
+ generate_buttons,
368
+ )
369
+ with gr.Accordion("Step 1: vocals/instrumentals separation", open=False):
370
+ gr.Markdown("")
371
+ gr.Markdown("**Inputs**")
372
+ original_track_input.render()
373
+ separate_vocals_dir.render()
374
+ gr.Markdown("**Outputs**")
375
+ with gr.Row():
376
+ with gr.Column():
377
+ vocals_track_output.render()
378
+ vocals_track_transfer_dropdown.render()
379
+
380
+ with gr.Column():
381
+ instrumentals_track_output.render()
382
+ instrumentals_track_transfer_dropdown.render()
383
+
384
+ separate_vocals_clear_btn.render()
385
+ separate_vocals_clear_btn.click(
386
+ lambda: tuple(
387
+ gr.Dropdown(value=value)
388
+ for value in [
389
+ vocals_track_transfer_default,
390
+ instrumentals_track_transfer_default,
391
+ ]
392
+ ),
393
+ outputs=[
394
+ vocals_track_transfer_dropdown,
395
+ instrumentals_track_transfer_dropdown,
396
+ ],
397
+ show_progress="hidden",
398
+ )
399
+ separate_vocals_btn.render()
400
+
401
+ separate_vocals_event_args_list = [
402
+ EventArgs(
403
+ partial(
404
+ exception_harness(separate_vocals), progress_bar=PROGRESS_BAR
405
+ ),
406
+ inputs=[original_track_input, separate_vocals_dir],
407
+ outputs=[vocals_track_output, instrumentals_track_output],
408
+ )
409
+ ] + [
410
+ EventArgs(
411
+ partial(_update_audio, len(input_tracks)),
412
+ inputs=[transfer_dropdown, output_track],
413
+ outputs=input_tracks,
414
+ name="then",
415
+ show_progress="hidden",
416
+ )
417
+ for transfer_dropdown, output_track in zip(
418
+ [
419
+ vocals_track_transfer_dropdown,
420
+ instrumentals_track_transfer_dropdown,
421
+ ],
422
+ [vocals_track_output, instrumentals_track_output],
423
+ )
424
+ ]
425
+ setup_consecutive_event_listeners_with_toggled_interactivity(
426
+ separate_vocals_btn,
427
+ separate_vocals_event_args_list,
428
+ generate_buttons,
429
+ )
430
+
431
+ with gr.Accordion("Step 2: main vocals/backup vocals separation", open=False):
432
+ gr.Markdown("")
433
+ gr.Markdown("**Inputs**")
434
+ vocals_track_input.render()
435
+ separate_main_vocals_dir.render()
436
+ gr.Markdown("**Outputs**")
437
+ with gr.Row():
438
+ with gr.Column():
439
+ main_vocals_track_output.render()
440
+ main_vocals_track_transfer_dropdown.render()
441
+ with gr.Column():
442
+ backup_vocals_track_output.render()
443
+ backup_vocals_track_transfer_dropdown.render()
444
+
445
+ separate_main_vocals_clear_btn.render()
446
+ separate_main_vocals_clear_btn.click(
447
+ lambda: tuple(
448
+ gr.Dropdown(value=value)
449
+ for value in [
450
+ main_vocals_track_transfer_default,
451
+ backup_vocals_track_transfer_default,
452
+ ]
453
+ ),
454
+ outputs=[
455
+ main_vocals_track_transfer_dropdown,
456
+ backup_vocals_track_transfer_dropdown,
457
+ ],
458
+ show_progress="hidden",
459
+ )
460
+ separate_main_vocals_btn.render()
461
+
462
+ separate_main_vocals_event_args_list = [
463
+ EventArgs(
464
+ partial(
465
+ exception_harness(separate_main_vocals),
466
+ progress_bar=PROGRESS_BAR,
467
+ ),
468
+ inputs=[vocals_track_input, separate_main_vocals_dir],
469
+ outputs=[main_vocals_track_output, backup_vocals_track_output],
470
+ )
471
+ ] + [
472
+ EventArgs(
473
+ partial(_update_audio, len(input_tracks)),
474
+ inputs=[transfer_dropdown, output_track],
475
+ outputs=input_tracks,
476
+ name="then",
477
+ show_progress="hidden",
478
+ )
479
+ for transfer_dropdown, output_track in zip(
480
+ [
481
+ main_vocals_track_transfer_dropdown,
482
+ backup_vocals_track_transfer_dropdown,
483
+ ],
484
+ [main_vocals_track_output, backup_vocals_track_output],
485
+ )
486
+ ]
487
+
488
+ setup_consecutive_event_listeners_with_toggled_interactivity(
489
+ separate_main_vocals_btn,
490
+ separate_main_vocals_event_args_list,
491
+ generate_buttons,
492
+ )
493
+
494
+ with gr.Accordion("Step 3: vocal cleanup", open=False):
495
+ gr.Markdown("")
496
+ gr.Markdown("**Inputs**")
497
+ main_vocals_track_input.render()
498
+ dereverb_vocals_dir.render()
499
+ gr.Markdown("**Outputs**")
500
+ with gr.Row():
501
+ with gr.Column():
502
+ dereverbed_vocals_track_output.render()
503
+ dereverbed_vocals_track_transfer_dropdown.render()
504
+ with gr.Column():
505
+ reverb_track_output.render()
506
+ reverb_track_transfer_dropdown.render()
507
+
508
+ dereverb_vocals_clear_btn.render()
509
+ dereverb_vocals_clear_btn.click(
510
+ lambda: tuple(
511
+ gr.Dropdown(value=value)
512
+ for value in [
513
+ dereverbed_vocals_track_transfer_default,
514
+ reverb_track_transfer_default,
515
+ ]
516
+ ),
517
+ outputs=[
518
+ dereverbed_vocals_track_transfer_dropdown,
519
+ reverb_track_transfer_dropdown,
520
+ ],
521
+ show_progress="hidden",
522
+ )
523
+ dereverb_vocals_btn.render()
524
+ dereverb_vocals_event_args_list = [
525
+ EventArgs(
526
+ partial(
527
+ exception_harness(dereverb_vocals), progress_bar=PROGRESS_BAR
528
+ ),
529
+ inputs=[main_vocals_track_input, dereverb_vocals_dir],
530
+ outputs=[dereverbed_vocals_track_output, reverb_track_output],
531
+ )
532
+ ] + [
533
+ EventArgs(
534
+ partial(_update_audio, len(input_tracks)),
535
+ inputs=[transfer_dropdown, output_track],
536
+ outputs=input_tracks,
537
+ name="then",
538
+ show_progress="hidden",
539
+ )
540
+ for transfer_dropdown, output_track in zip(
541
+ [
542
+ dereverbed_vocals_track_transfer_dropdown,
543
+ reverb_track_transfer_dropdown,
544
+ ],
545
+ [dereverbed_vocals_track_output, reverb_track_output],
546
+ )
547
+ ]
548
+
549
+ setup_consecutive_event_listeners_with_toggled_interactivity(
550
+ dereverb_vocals_btn, dereverb_vocals_event_args_list, generate_buttons
551
+ )
552
+ with gr.Accordion("Step 4: vocal conversion", open=False):
553
+ gr.Markdown("")
554
+ gr.Markdown("**Inputs**")
555
+ dereverbed_vocals_track_input.render()
556
+ convert_vocals_dir.render()
557
+ with gr.Row():
558
+ rvc_model.render()
559
+ pitch_change_octaves = gr.Slider(
560
+ -3,
561
+ 3,
562
+ value=0,
563
+ step=1,
564
+ label="Pitch shift (octaves)",
565
+ info=(
566
+ "Shift pitch of converted vocals by number of octaves."
567
+ " Generally, use 1 for male-to-female conversions and -1 for"
568
+ " vice-versa."
569
+ ),
570
+ )
571
+ pitch_change_semitones = gr.Slider(
572
+ -12,
573
+ 12,
574
+ value=0,
575
+ step=1,
576
+ label="Pitch shift (semi-tones)",
577
+ info=(
578
+ "Shift pitch of converted vocals by number of semi-tones."
579
+ " Altering this slightly reduces sound quality."
580
+ ),
581
+ )
582
+ with gr.Row():
583
+ index_rate = gr.Slider(
584
+ 0,
585
+ 1,
586
+ value=0.5,
587
+ label="Index rate",
588
+ info=(
589
+ "Controls how much of the accent in the voice model to keep in"
590
+ " the converted vocals"
591
+ ),
592
+ )
593
+ filter_radius = gr.Slider(
594
+ 0,
595
+ 7,
596
+ value=3,
597
+ step=1,
598
+ label="Filter radius",
599
+ info=(
600
+ "If >=3: apply median filtering to the harvested pitch results."
601
+ " Can reduce breathiness"
602
+ ),
603
+ )
604
+ rms_mix_rate = gr.Slider(
605
+ 0,
606
+ 1,
607
+ value=0.25,
608
+ label="RMS mix rate",
609
+ info=(
610
+ "Control how much to mimic the loudness (0) of the input vocals"
611
+ " or a fixed loudness (1)"
612
+ ),
613
+ )
614
+ protect = gr.Slider(
615
+ 0,
616
+ 0.5,
617
+ value=0.33,
618
+ label="Protect rate",
619
+ info=(
620
+ "Protect voiceless consonants and breath sounds. Set to 0.5 to"
621
+ " disable."
622
+ ),
623
+ )
624
+ with gr.Column():
625
+ f0_method = gr.Dropdown(
626
+ ["rmvpe", "mangio-crepe"],
627
+ value="rmvpe",
628
+ label="Pitch detection algorithm",
629
+ info=(
630
+ "Best option is rmvpe (clarity in vocals), then"
631
+ " mangio-crepe (smoother vocals)"
632
+ ),
633
+ )
634
+ crepe_hop_length = gr.Slider(
635
+ 32,
636
+ 320,
637
+ value=128,
638
+ step=1,
639
+ visible=False,
640
+ label="Crepe hop length",
641
+ info=(
642
+ "Lower values lead to longer conversions and higher risk"
643
+ " of voice cracks, but better pitch accuracy."
644
+ ),
645
+ )
646
+ f0_method.change(
647
+ show_hop_slider,
648
+ inputs=f0_method,
649
+ outputs=crepe_hop_length,
650
+ show_progress="hidden",
651
+ )
652
+
653
+ gr.Markdown("**Outputs**")
654
+ converted_vocals_track_output.render()
655
+ converted_vocals_track_transfer_dropdown.render()
656
+ convert_vocals_clear_btn.render()
657
+ convert_vocals_clear_btn.click(
658
+ lambda: [
659
+ 0,
660
+ 0,
661
+ 0.5,
662
+ 3,
663
+ 0.25,
664
+ 0.33,
665
+ "rmvpe",
666
+ 128,
667
+ gr.Dropdown(value=converted_vocals_track_transfer_default),
668
+ ],
669
+ outputs=[
670
+ pitch_change_octaves,
671
+ pitch_change_semitones,
672
+ index_rate,
673
+ filter_radius,
674
+ rms_mix_rate,
675
+ protect,
676
+ f0_method,
677
+ crepe_hop_length,
678
+ converted_vocals_track_transfer_dropdown,
679
+ ],
680
+ show_progress="hidden",
681
+ )
682
+ convert_vocals_btn.render()
683
+ convert_vocals_event_args_list = [
684
+ EventArgs(
685
+ partial(
686
+ exception_harness(convert_vocals), progress_bar=PROGRESS_BAR
687
+ ),
688
+ inputs=[
689
+ dereverbed_vocals_track_input,
690
+ convert_vocals_dir,
691
+ rvc_model,
692
+ pitch_change_octaves,
693
+ pitch_change_semitones,
694
+ index_rate,
695
+ filter_radius,
696
+ rms_mix_rate,
697
+ protect,
698
+ f0_method,
699
+ crepe_hop_length,
700
+ ],
701
+ outputs=[converted_vocals_track_output],
702
+ ),
703
+ EventArgs(
704
+ partial(_update_audio, len(input_tracks)),
705
+ inputs=[
706
+ converted_vocals_track_transfer_dropdown,
707
+ converted_vocals_track_output,
708
+ ],
709
+ outputs=input_tracks,
710
+ name="then",
711
+ show_progress="hidden",
712
+ ),
713
+ ]
714
+ setup_consecutive_event_listeners_with_toggled_interactivity(
715
+ convert_vocals_btn, convert_vocals_event_args_list, generate_buttons
716
+ )
717
+ with gr.Accordion("Step 5: post-processing of vocals", open=False):
718
+ gr.Markdown("")
719
+ gr.Markdown("**Inputs**")
720
+ converted_vocals_track_input.render()
721
+ postprocess_vocals_dir.render()
722
+ with gr.Row():
723
+ reverb_rm_size = gr.Slider(
724
+ 0,
725
+ 1,
726
+ value=0.15,
727
+ label="Room size",
728
+ info="The larger the room, the longer the reverb time",
729
+ )
730
+ reverb_wet = gr.Slider(
731
+ 0,
732
+ 1,
733
+ value=0.2,
734
+ label="Wetness level",
735
+ info="Loudness level of converted vocals with reverb",
736
+ )
737
+ reverb_dry = gr.Slider(
738
+ 0,
739
+ 1,
740
+ value=0.8,
741
+ label="Dryness level",
742
+ info="Loudness level of converted vocals without reverb",
743
+ )
744
+ reverb_damping = gr.Slider(
745
+ 0,
746
+ 1,
747
+ value=0.7,
748
+ label="Damping level",
749
+ info="Absorption of high frequencies in the reverb",
750
+ )
751
+ gr.Markdown("**Outputs**")
752
+
753
+ postprocessed_vocals_track_output.render()
754
+ postprocessed_vocals_track_transfer_dropdown.render()
755
+
756
+ postprocess_vocals_clear_btn.render()
757
+ postprocess_vocals_clear_btn.click(
758
+ lambda: [
759
+ 0.15,
760
+ 0.2,
761
+ 0.8,
762
+ 0.7,
763
+ gr.Dropdown(value=postprocessed_vocals_track_transfer_default),
764
+ ],
765
+ outputs=[
766
+ reverb_rm_size,
767
+ reverb_wet,
768
+ reverb_dry,
769
+ reverb_damping,
770
+ postprocessed_vocals_track_transfer_dropdown,
771
+ ],
772
+ show_progress="hidden",
773
+ )
774
+ postprocess_vocals_btn.render()
775
+ postprocess_vocals_event_args_list = [
776
+ EventArgs(
777
+ partial(
778
+ exception_harness(postprocess_vocals),
779
+ progress_bar=PROGRESS_BAR,
780
+ ),
781
+ inputs=[
782
+ converted_vocals_track_input,
783
+ postprocess_vocals_dir,
784
+ reverb_rm_size,
785
+ reverb_wet,
786
+ reverb_dry,
787
+ reverb_damping,
788
+ ],
789
+ outputs=[postprocessed_vocals_track_output],
790
+ ),
791
+ EventArgs(
792
+ partial(_update_audio, len(input_tracks)),
793
+ inputs=[
794
+ postprocessed_vocals_track_transfer_dropdown,
795
+ postprocessed_vocals_track_output,
796
+ ],
797
+ outputs=input_tracks,
798
+ name="then",
799
+ show_progress="hidden",
800
+ ),
801
+ ]
802
+ setup_consecutive_event_listeners_with_toggled_interactivity(
803
+ postprocess_vocals_btn,
804
+ postprocess_vocals_event_args_list,
805
+ generate_buttons,
806
+ )
807
+ with gr.Accordion("Step 6: pitch shift of background tracks", open=False):
808
+ gr.Markdown("")
809
+ gr.Markdown("**Inputs**")
810
+ with gr.Row():
811
+ instrumentals_track_input.render()
812
+ backup_vocals_track_input.render()
813
+ pitch_shift_background_dir.render()
814
+ pitch_change_semitones_background = gr.Slider(
815
+ -12,
816
+ 12,
817
+ value=0,
818
+ step=1,
819
+ label="Pitch shift",
820
+ info=(
821
+ "Shift pitch of instrumentals and backup vocals. Measured in"
822
+ " semi-tones."
823
+ ),
824
+ )
825
+ gr.Markdown("**Outputs**")
826
+ with gr.Row():
827
+ with gr.Column():
828
+ shifted_instrumentals_track_output.render()
829
+ shifted_instrumentals_track_transfer_dropdown.render()
830
+ with gr.Column():
831
+ shifted_backup_vocals_track_output.render()
832
+ shifted_backup_vocals_track_transfer_dropdown.render()
833
+
834
+ pitch_shift_background_clear_btn.render()
835
+ pitch_shift_background_clear_btn.click(
836
+ lambda: [
837
+ 0,
838
+ gr.Dropdown(value=shifted_instrumentals_track_transfer_default),
839
+ gr.Dropdown(value=shifted_backup_vocals_track_transfer_default),
840
+ ],
841
+ outputs=[
842
+ pitch_change_semitones_background,
843
+ shifted_instrumentals_track_transfer_dropdown,
844
+ shifted_backup_vocals_track_transfer_dropdown,
845
+ ],
846
+ show_progress="hidden",
847
+ )
848
+ pitch_shift_background_btn.render()
849
+ pitch_shift_background_event_args_list = [
850
+ EventArgs(
851
+ partial(
852
+ exception_harness(pitch_shift_background),
853
+ progress_bar=PROGRESS_BAR,
854
+ ),
855
+ inputs=[
856
+ instrumentals_track_input,
857
+ backup_vocals_track_input,
858
+ pitch_shift_background_dir,
859
+ pitch_change_semitones_background,
860
+ ],
861
+ outputs=[
862
+ shifted_instrumentals_track_output,
863
+ shifted_backup_vocals_track_output,
864
+ ],
865
+ )
866
+ ] + [
867
+ EventArgs(
868
+ partial(_update_audio, len(input_tracks)),
869
+ inputs=[dropdown, output_track],
870
+ outputs=input_tracks,
871
+ name="then",
872
+ show_progress="hidden",
873
+ )
874
+ for dropdown, output_track in zip(
875
+ [
876
+ shifted_instrumentals_track_transfer_dropdown,
877
+ shifted_backup_vocals_track_transfer_dropdown,
878
+ ],
879
+ [
880
+ shifted_instrumentals_track_output,
881
+ shifted_backup_vocals_track_output,
882
+ ],
883
+ )
884
+ ]
885
+
886
+ setup_consecutive_event_listeners_with_toggled_interactivity(
887
+ pitch_shift_background_btn,
888
+ pitch_shift_background_event_args_list,
889
+ generate_buttons,
890
+ )
891
+ with gr.Accordion("Step 7: song mixing", open=False):
892
+ gr.Markdown("")
893
+ gr.Markdown("**Inputs**")
894
+ with gr.Row():
895
+ postprocessed_vocals_track_input.render()
896
+ shifted_instrumentals_track_input.render()
897
+ shifted_backup_vocals_track_input.render()
898
+ mix_dir.render()
899
+ with gr.Row():
900
+ main_gain = gr.Slider(-20, 20, value=0, step=1, label="Main vocals")
901
+ inst_gain = gr.Slider(-20, 20, value=0, step=1, label="Instrumentals")
902
+ backup_gain = gr.Slider(-20, 20, value=0, step=1, label="Backup vocals")
903
+ with gr.Row():
904
+ output_name = gr.Textbox(
905
+ label="Output file name",
906
+ placeholder="Ultimate RVC song cover",
907
+ )
908
+ output_sr = gr.Dropdown(
909
+ choices=[16000, 44100, 48000, 96000, 192000],
910
+ value=44100,
911
+ label="Output sample rate",
912
+ )
913
+ output_format = gr.Dropdown(
914
+ ["mp3", "wav", "flac", "aac", "m4a", "ogg"],
915
+ value="mp3",
916
+ label="Output file format",
917
+ )
918
+ postprocessed_vocals_track_input.change(
919
+ update_song_cover_name,
920
+ inputs=[postprocessed_vocals_track_input, mix_dir],
921
+ outputs=output_name,
922
+ show_progress="hidden",
923
+ )
924
+ mix_dir.change(
925
+ update_song_cover_name,
926
+ inputs=[postprocessed_vocals_track_input, mix_dir],
927
+ outputs=output_name,
928
+ show_progress="hidden",
929
+ )
930
+
931
+ gr.Markdown("**Outputs**")
932
+ song_cover_track.render()
933
+ song_cover_track_transfer_dropdown.render()
934
+ mix_clear_btn.render()
935
+ mix_clear_btn.click(
936
+ lambda: [
937
+ 0,
938
+ 0,
939
+ 0,
940
+ 44100,
941
+ "mp3",
942
+ gr.Dropdown(value=song_cover_track_transfer_default),
943
+ ],
944
+ outputs=[
945
+ main_gain,
946
+ inst_gain,
947
+ backup_gain,
948
+ output_sr,
949
+ output_format,
950
+ song_cover_track_transfer_dropdown,
951
+ ],
952
+ show_progress="hidden",
953
+ )
954
+ mix_btn.render()
955
+ mix_btn_event_args_list = [
956
+ EventArgs(
957
+ partial(
958
+ exception_harness(mix_song_cover), progress_bar=PROGRESS_BAR
959
+ ),
960
+ inputs=[
961
+ postprocessed_vocals_track_input,
962
+ shifted_instrumentals_track_input,
963
+ shifted_backup_vocals_track_input,
964
+ mix_dir,
965
+ main_gain,
966
+ inst_gain,
967
+ backup_gain,
968
+ output_sr,
969
+ output_format,
970
+ output_name,
971
+ ],
972
+ outputs=[song_cover_track],
973
+ ),
974
+ EventArgs(
975
+ partial(update_output_audio, 1, [], [0]),
976
+ outputs=[output_audio_to_remove],
977
+ name="then",
978
+ show_progress="hidden",
979
+ ),
980
+ EventArgs(
981
+ partial(_update_audio, len(input_tracks)),
982
+ inputs=[song_cover_track_transfer_dropdown, song_cover_track],
983
+ outputs=input_tracks,
984
+ name="then",
985
+ show_progress="hidden",
986
+ ),
987
+ ]
988
+
989
+ setup_consecutive_event_listeners_with_toggled_interactivity(
990
+ mix_btn, mix_btn_event_args_list, generate_buttons
991
+ )
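
All transfer dropdowns in this tab funnel into _update_audio, which builds one update per input track and fills in a value only for the selected indices. The same logic is restated in isolation below; gr.update is used here instead of constructing gr.Audio so the snippet runs without needing real audio files on disk.

import gradio as gr


def update_audio(num_components: int, output_indices: list[int], file_path: str):
    # One (possibly empty) update per Audio component; only the indexed
    # components receive the new file path.
    update_args = [{} for _ in range(num_components)]
    for index in output_indices:
        update_args[index]["value"] = file_path
    return tuple(gr.update(**kwargs) for kwargs in update_args)


# Transfer a newly generated file to input tracks 0 and 2 out of 3:
first, second, third = update_audio(3, [0, 2], "song_cover.mp3")
# first and third carry value="song_cover.mp3"; second is an empty update, so
# the corresponding input track keeps whatever the user already loaded.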
src/frontend/tabs/one_click_generation.py ADDED
@@ -0,0 +1,573 @@
1
+ """
2
+ This module contains the code for the "One-click generation" tab.
3
+ """
4
+
5
+ from typings.extra import RunPipelineHarnessArgs
6
+
7
+ from functools import partial
8
+
9
+ import gradio as gr
10
+
11
+ from backend.generate_song_cover import run_pipeline
12
+
13
+ from frontend.common import (
14
+ PROGRESS_BAR,
15
+ EventArgs,
16
+ exception_harness,
17
+ setup_consecutive_event_listeners_with_toggled_interactivity,
18
+ show_hop_slider,
19
+ toggle_visible_component,
20
+ update_cached_input_songs,
21
+ update_output_audio,
22
+ update_song_cover_name,
23
+ update_value,
24
+ )
25
+
26
+
27
+ def _run_pipeline_harness(*args: *RunPipelineHarnessArgs) -> tuple[str | None, ...]:
28
+ """
29
+ Run the song cover generation pipeline in a harness
30
+ which displays a progress bar, re-raises exceptions as Gradio errors,
31
+ and returns the output of the pipeline.
32
+
33
+ If the pipeline outputs only a single path,
34
+ then that output is extended with a None value for each intermediate audio file.
35
+
36
+ Parameters
37
+ ----------
38
+ *args : *RunPipelineHarnessArgs
39
+ Arguments to forward to the pipeline.
40
+
41
+ Returns
42
+ -------
43
+ tuple[str | None, ...]
44
+ The output of the pipeline, potentially extended with None values.
45
+ """
46
+
47
+ res = exception_harness(run_pipeline)(*args, progress_bar=PROGRESS_BAR)
48
+ if isinstance(res, tuple):
49
+ return res
50
+ else:
51
+ return (None,) * 11 + (res,)
52
+
53
+
54
+ def _toggle_intermediate_files_accordion(
55
+ visible: bool,
56
+ ) -> list[gr.Accordion | gr.Audio]:
57
+ """
58
+ Toggle the visibility of intermediate audio file accordions
59
+ and their associated audio components.
60
+
61
+ Parameters
62
+ ----------
63
+ visible : bool
64
+ Visibility status of the accordions and audio components.
65
+
66
+ Returns
67
+ -------
68
+ list[gr.Accordion | gr.Audio]
69
+ The accordions and audio components with updated visibility.
70
+ """
71
+ audio_components = [gr.Audio(value=None) for _ in range(11)]
72
+ accordions = [gr.Accordion(open=False) for _ in range(7)]
73
+ return [gr.Accordion(visible=visible, open=False)] + accordions + audio_components
74
+
75
+
76
+ def render(
77
+ generate_buttons: list[gr.Button],
78
+ song_dir_dropdowns: list[gr.Dropdown],
79
+ cached_input_songs_dropdown_1click: gr.Dropdown,
80
+ cached_input_songs_dropdown_multi: gr.Dropdown,
81
+ rvc_model: gr.Dropdown,
82
+ intermediate_audio_to_delete: gr.Dropdown,
83
+ output_audio_to_delete: gr.Dropdown,
84
+ ) -> None:
85
+ """
86
+ Render "One-click generation" tab.
87
+
88
+ Parameters
89
+ ----------
90
+ generate_buttons : list[gr.Button]
91
+ Buttons used for audio generation in the
92
+ "One-click generation" tab and the "Multi-step generation" tab.
93
+ song_dir_dropdowns : list[gr.Dropdown]
94
+ Dropdowns for selecting song directories in the
95
+ "Multi-step generation" tab.
96
+ cached_input_songs_dropdown_1click : gr.Dropdown
97
+ Dropdown for selecting cached input songs in the
98
+ "One-click generation" tab
99
+ cached_input_songs_dropdown_multi : gr.Dropdown
100
+ Dropdown for selecting cached input songs in the
101
+ "Multi-step generation" tab
102
+ rvc_model : gr.Dropdown
103
+ Dropdown for selecting RVC model in the
104
+ "One-click generation" tab.
105
+ intermediate_audio_to_delete : gr.Dropdown
106
+ Dropdown for selecting intermediate audio files to delete in the
107
+ "Manage audio" tab.
108
+ output_audio_to_delete : gr.Dropdown
109
+ Dropdown for selecting output audio files to delete in the
110
+ "Manage audio" tab.
111
+ """
112
+
113
+ with gr.Tab("One-click generation"):
114
+ (
115
+ _,
116
+ _,
117
+ _,
118
+ _,
119
+ _,
120
+ _,
121
+ _,
122
+ _,
123
+ generate_btn,
124
+ ) = generate_buttons
125
+
126
+ with gr.Accordion("Main options"):
127
+ with gr.Row():
128
+ with gr.Column():
129
+ song_input_type_dropdown = gr.Dropdown(
130
+ ["YouTube link/local path", "Local file", "Cached song"],
131
+ value="YouTube link/local path",
132
+ label="Song input type",
133
+ type="index",
134
+ )
135
+ song_input = gr.Textbox(
136
+ label="Song input",
137
+ info=(
138
+ "Link to a song on YouTube or the full path of a local"
139
+ " audio file."
140
+ ),
141
+ )
142
+ local_file = gr.Audio(
143
+ label="Song input", type="filepath", visible=False
144
+ )
145
+ cached_input_songs_dropdown_1click.render()
146
+ song_input_type_dropdown.input(
147
+ partial(toggle_visible_component, 3),
148
+ inputs=song_input_type_dropdown,
149
+ outputs=[
150
+ song_input,
151
+ local_file,
152
+ cached_input_songs_dropdown_1click,
153
+ ],
154
+ show_progress="hidden",
155
+ )
156
+
157
+ local_file.change(
158
+ update_value,
159
+ inputs=local_file,
160
+ outputs=song_input,
161
+ show_progress="hidden",
162
+ )
163
+ cached_input_songs_dropdown_1click.input(
164
+ update_value,
165
+ inputs=cached_input_songs_dropdown_1click,
166
+ outputs=song_input,
167
+ show_progress="hidden",
168
+ )
169
+
170
+ with gr.Column():
171
+ rvc_model.render()
172
+
173
+ with gr.Column():
174
+ pitch_change_vocals = gr.Slider(
175
+ -3,
176
+ 3,
177
+ value=0,
178
+ step=1,
179
+ label="Pitch shift of vocals",
180
+ info=(
181
+ "Shift pitch of converted vocals. Measured in octaves."
182
+ " Generally, use 1 for male-to-female conversions and -1"
183
+ " for vice-versa."
184
+ ),
185
+ )
186
+ pitch_change_all = gr.Slider(
187
+ -12,
188
+ 12,
189
+ value=0,
190
+ step=1,
191
+ label="Overall pitch shift",
192
+ info=(
193
+ "Shift pitch of converted vocals, backup vocals and"
194
+ " instrumentals. Measured in semi-tones. Altering this"
195
+ " slightly reduces sound quality."
196
+ ),
197
+ )
198
+
199
+ with gr.Accordion("Vocal conversion options", open=False):
200
+ with gr.Row():
201
+ index_rate = gr.Slider(
202
+ 0,
203
+ 1,
204
+ value=0.5,
205
+ label="Index rate",
206
+ info=(
207
+ "Controls how much of the accent in the voice model to keep in"
208
+ " the converted vocals"
209
+ ),
210
+ )
211
+ filter_radius = gr.Slider(
212
+ 0,
213
+ 7,
214
+ value=3,
215
+ step=1,
216
+ label="Filter radius",
217
+ info=(
218
+ "If >=3: apply median filtering to the harvested pitch results."
219
+ " Can reduce breathiness"
220
+ ),
221
+ )
222
+ rms_mix_rate = gr.Slider(
223
+ 0,
224
+ 1,
225
+ value=0.25,
226
+ label="RMS mix rate",
227
+ info=(
228
+ "Control how much to mimic the loudness (0) of the input vocals"
229
+ " or a fixed loudness (1)"
230
+ ),
231
+ )
232
+ protect = gr.Slider(
233
+ 0,
234
+ 0.5,
235
+ value=0.33,
236
+ label="Protect rate",
237
+ info=(
238
+ "Protect voiceless consonants and breath sounds. Set to 0.5 to"
239
+ " disable."
240
+ ),
241
+ )
242
+ with gr.Column():
243
+ f0_method = gr.Dropdown(
244
+ ["rmvpe", "mangio-crepe"],
245
+ value="rmvpe",
246
+ label="Pitch detection algorithm",
247
+ info=(
248
+ "Best option is rmvpe (clarity in vocals), then"
249
+ " mangio-crepe (smoother vocals)"
250
+ ),
251
+ )
252
+ crepe_hop_length = gr.Slider(
253
+ 32,
254
+ 320,
255
+ value=128,
256
+ step=1,
257
+ visible=False,
258
+ label="Crepe hop length",
259
+ info=(
260
+ "Lower values lead to longer conversions and higher risk"
261
+ " of voice cracks, but better pitch accuracy."
262
+ ),
263
+ )
264
+ f0_method.change(
265
+ show_hop_slider,
266
+ inputs=f0_method,
267
+ outputs=crepe_hop_length,
268
+ show_progress="hidden",
269
+ )
270
+ with gr.Accordion("Audio mixing options", open=False):
271
+ gr.Markdown("")
272
+ gr.Markdown("### Reverb control on converted vocals")
273
+ with gr.Row():
274
+ reverb_rm_size = gr.Slider(
275
+ 0,
276
+ 1,
277
+ value=0.15,
278
+ label="Room size",
279
+ info="The larger the room, the longer the reverb time",
280
+ )
281
+ reverb_wet = gr.Slider(
282
+ 0,
283
+ 1,
284
+ value=0.2,
285
+ label="Wetness level",
286
+ info="Loudness level of converted vocals with reverb",
287
+ )
288
+ reverb_dry = gr.Slider(
289
+ 0,
290
+ 1,
291
+ value=0.8,
292
+ label="Dryness level",
293
+ info="Loudness level of converted vocals without reverb",
294
+ )
295
+ reverb_damping = gr.Slider(
296
+ 0,
297
+ 1,
298
+ value=0.7,
299
+ label="Damping level",
300
+ info="Absorption of high frequencies in the reverb",
301
+ )
302
+
303
+ gr.Markdown("")
304
+ gr.Markdown("### Volume controls (dB)")
305
+ with gr.Row():
306
+ main_gain = gr.Slider(-20, 20, value=0, step=1, label="Main vocals")
307
+ inst_gain = gr.Slider(-20, 20, value=0, step=1, label="Instrumentals")
308
+ backup_gain = gr.Slider(-20, 20, value=0, step=1, label="Backup vocals")
309
+ with gr.Accordion("Audio output options", open=False):
310
+ with gr.Row():
311
+ output_name = gr.Textbox(
312
+ label="Output file name",
313
+ info=(
314
+ "If no name is provided, a suitable name will be generated"
315
+ " automatically."
316
+ ),
317
+ placeholder="Ultimate RVC song cover",
318
+ )
319
+ output_sr = gr.Dropdown(
320
+ choices=[16000, 44100, 48000, 96000, 192000],
321
+ value=44100,
322
+ label="Output sample rate",
323
+ )
324
+ output_format = gr.Dropdown(
325
+ ["mp3", "wav", "flac", "aac", "m4a", "ogg"],
326
+ value="mp3",
327
+ label="Output file format",
328
+ )
329
+ with gr.Row():
330
+ show_intermediate_files = gr.Checkbox(
331
+ label="Show intermediate audio files",
332
+ value=False,
333
+ info=(
334
+ "Show generated intermediate audio files when song cover"
335
+ " generation completes. Leave unchecked to optimize"
336
+ " performance."
337
+ ),
338
+ )
339
+ rvc_model.change(
340
+ partial(update_song_cover_name, None, update_placeholder=True),
341
+ inputs=[cached_input_songs_dropdown_1click, rvc_model],
342
+ outputs=output_name,
343
+ show_progress="hidden",
344
+ )
345
+ cached_input_songs_dropdown_1click.change(
346
+ partial(update_song_cover_name, None, update_placeholder=True),
347
+ inputs=[cached_input_songs_dropdown_1click, rvc_model],
348
+ outputs=output_name,
349
+ show_progress="hidden",
350
+ )
351
+
352
+ intermediate_audio_accordions = [
353
+ gr.Accordion(label, open=False, render=False)
354
+ for label in [
355
+ "Step 0: song retrieval",
356
+ "Step 1: vocals/instrumentals separation",
357
+ "Step 2: main vocals/backup vocals separation",
358
+ "Step 3: main vocals cleanup",
359
+ "Step 4: conversion of main vocals",
360
+ "Step 5: post-processing of converted vocals",
361
+ "Step 6: pitch shift of background tracks",
362
+ ]
363
+ ]
364
+ (
365
+ song_retrieval_accordion,
366
+ vocals_separation_accordion,
367
+ main_vocals_separation_accordion,
368
+ vocal_cleanup_accordion,
369
+ vocal_conversion_accordion,
370
+ vocals_postprocessing_accordion,
371
+ pitch_shift_accordion,
372
+ ) = intermediate_audio_accordions
373
+ (
374
+ original_track,
375
+ vocals_track,
376
+ instrumentals_track,
377
+ main_vocals_track,
378
+ backup_vocals_track,
379
+ main_vocals_dereverbed_track,
380
+ main_vocals_reverb_track,
381
+ converted_vocals_track,
382
+ postprocessed_vocals_track,
383
+ instrumentals_shifted_track,
384
+ backup_vocals_shifted_track,
385
+ ) = [
386
+ gr.Audio(label=label, type="filepath", interactive=False, render=False)
387
+ for label in [
388
+ "Input song",
389
+ "Vocals",
390
+ "Instrumentals",
391
+ "Main vocals",
392
+ "Backup vocals",
393
+ "De-reverbed main vocals",
394
+ "Main vocals reverb",
395
+ "Converted vocals",
396
+ "Post-processed vocals",
397
+ "Pitch-shifted instrumentals",
398
+ "Pitch-shifted backup vocals",
399
+ ]
400
+ ]
401
+ with gr.Accordion(
402
+ "Access intermediate audio files", open=False, visible=False
403
+ ) as intermediate_files_accordion:
404
+ song_retrieval_accordion.render()
405
+ with song_retrieval_accordion:
406
+ original_track.render()
407
+ vocals_separation_accordion.render()
408
+ with vocals_separation_accordion:
409
+ with gr.Row():
410
+ vocals_track.render()
411
+ instrumentals_track.render()
412
+ main_vocals_separation_accordion.render()
413
+ with main_vocals_separation_accordion:
414
+ with gr.Row():
415
+ main_vocals_track.render()
416
+ backup_vocals_track.render()
417
+
418
+ vocal_cleanup_accordion.render()
419
+ with vocal_cleanup_accordion:
420
+ with gr.Row():
421
+ main_vocals_dereverbed_track.render()
422
+ main_vocals_reverb_track.render()
423
+ vocal_conversion_accordion.render()
424
+ with vocal_conversion_accordion:
425
+ converted_vocals_track.render()
426
+ vocals_postprocessing_accordion.render()
427
+ with vocals_postprocessing_accordion:
428
+ postprocessed_vocals_track.render()
429
+ pitch_shift_accordion.render()
430
+ with pitch_shift_accordion:
431
+ with gr.Row():
432
+ instrumentals_shifted_track.render()
433
+ backup_vocals_shifted_track.render()
434
+
435
+ with gr.Row():
436
+ clear_btn = gr.Button(value="Reset settings", scale=2)
437
+ generate_btn.render()
438
+ song_cover_track = gr.Audio(label="Song cover", scale=3)
439
+ show_intermediate_files.change(
440
+ _toggle_intermediate_files_accordion,
441
+ inputs=show_intermediate_files,
442
+ outputs=[
443
+ intermediate_files_accordion,
444
+ song_retrieval_accordion,
445
+ vocals_separation_accordion,
446
+ main_vocals_separation_accordion,
447
+ vocal_cleanup_accordion,
448
+ vocal_conversion_accordion,
449
+ vocals_postprocessing_accordion,
450
+ pitch_shift_accordion,
451
+ original_track,
452
+ vocals_track,
453
+ instrumentals_track,
454
+ main_vocals_track,
455
+ backup_vocals_track,
456
+ main_vocals_dereverbed_track,
457
+ main_vocals_reverb_track,
458
+ converted_vocals_track,
459
+ postprocessed_vocals_track,
460
+ instrumentals_shifted_track,
461
+ backup_vocals_shifted_track,
462
+ ],
463
+ show_progress="hidden",
464
+ )
465
+ generate_event_args_list = [
466
+ EventArgs(
467
+ _run_pipeline_harness,
468
+ inputs=[
469
+ song_input,
470
+ rvc_model,
471
+ pitch_change_vocals,
472
+ pitch_change_all,
473
+ index_rate,
474
+ filter_radius,
475
+ rms_mix_rate,
476
+ protect,
477
+ f0_method,
478
+ crepe_hop_length,
479
+ reverb_rm_size,
480
+ reverb_wet,
481
+ reverb_dry,
482
+ reverb_damping,
483
+ main_gain,
484
+ inst_gain,
485
+ backup_gain,
486
+ output_sr,
487
+ output_format,
488
+ output_name,
489
+ show_intermediate_files,
490
+ ],
491
+ outputs=[
492
+ original_track,
493
+ vocals_track,
494
+ instrumentals_track,
495
+ main_vocals_track,
496
+ backup_vocals_track,
497
+ main_vocals_dereverbed_track,
498
+ main_vocals_reverb_track,
499
+ converted_vocals_track,
500
+ postprocessed_vocals_track,
501
+ instrumentals_shifted_track,
502
+ backup_vocals_shifted_track,
503
+ song_cover_track,
504
+ ],
505
+ ),
506
+ EventArgs(
507
+ partial(
508
+ update_cached_input_songs, 3 + len(song_dir_dropdowns), [], [1]
509
+ ),
510
+ outputs=[
511
+ cached_input_songs_dropdown_1click,
512
+ intermediate_audio_to_delete,
513
+ cached_input_songs_dropdown_multi,
514
+ ]
515
+ + song_dir_dropdowns,
516
+ name="then",
517
+ show_progress="hidden",
518
+ ),
519
+ EventArgs(
520
+ partial(update_output_audio, 1, [], [0]),
521
+ outputs=[output_audio_to_delete],
522
+ name="then",
523
+ show_progress="hidden",
524
+ ),
525
+ ]
526
+ setup_consecutive_event_listeners_with_toggled_interactivity(
527
+ generate_btn,
528
+ generate_event_args_list,
529
+ generate_buttons + [show_intermediate_files],
530
+ )
531
+ clear_btn.click(
532
+ lambda: [
533
+ 0,
534
+ 0,
535
+ 0.5,
536
+ 3,
537
+ 0.25,
538
+ 0.33,
539
+ "rmvpe",
540
+ 128,
541
+ 0.15,
542
+ 0.2,
543
+ 0.8,
544
+ 0.7,
545
+ 0,
546
+ 0,
547
+ 0,
548
+ 44100,
549
+ "mp3",
550
+ False,
551
+ ],
552
+ outputs=[
553
+ pitch_change_vocals,
554
+ pitch_change_all,
555
+ index_rate,
556
+ filter_radius,
557
+ rms_mix_rate,
558
+ protect,
559
+ f0_method,
560
+ crepe_hop_length,
561
+ reverb_rm_size,
562
+ reverb_wet,
563
+ reverb_dry,
564
+ reverb_damping,
565
+ main_gain,
566
+ inst_gain,
567
+ backup_gain,
568
+ output_sr,
569
+ output_format,
570
+ show_intermediate_files,
571
+ ],
572
+ show_progress="hidden",
573
+ )
src/init.py ADDED
@@ -0,0 +1,41 @@
1
+ """
2
+ This script downloads the models required for running the Ultimate RVC app.
3
+ """
4
+
5
+ import os
6
+
7
+ import requests
8
+
9
+ from common import RVC_MODELS_DIR
10
+
11
+ RVC_DOWNLOAD_LINK = "https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/"
12
+
13
+
14
+ def dl_model(link: str, model_name: str, dir_name: str) -> None:
15
+ """
16
+ Download a model from a link and save it to a directory.
17
+
18
+ Parameters
19
+ ----------
20
+ link : str
21
+ The link to the site where the model is hosted.
22
+ model_name : str
23
+ The name of the model to download.
24
+ dir_name : str
25
+ The directory to save the model to.
26
+ """
27
+ with requests.get(f"{link}{model_name}") as r:
28
+ r.raise_for_status()
29
+ with open(os.path.join(dir_name, model_name), "wb") as f:
30
+ for chunk in r.iter_content(chunk_size=8192):
31
+ f.write(chunk)
32
+
33
+
34
+ if __name__ == "__main__":
35
+
36
+ rvc_model_names = ["hubert_base.pt", "rmvpe.pt"]
37
+ for model in rvc_model_names:
38
+ print(f"Downloading {model}...")
39
+ dl_model(RVC_DOWNLOAD_LINK, model, RVC_MODELS_DIR)
40
+
41
+ print("All models downloaded!")
src/typings/audio_separator/separator/__init__.pyi ADDED
@@ -0,0 +1,78 @@
1
+ from typing import TypedDict
2
+
3
+ import logging
4
+
5
+ class MDXParams(TypedDict):
6
+ hop_length: int
7
+ segment_size: int
8
+ overlap: float
9
+ batch_size: int
10
+ enable_denoise: bool
11
+
12
+ class VRParams(TypedDict):
13
+ batch_size: int
14
+ window_size: int
15
+ aggression: int
16
+ enable_tta: bool
17
+ enable_post_process: bool
18
+ post_process_threshold: float
19
+ high_end_process: bool
20
+
21
+ class DemucsParams(TypedDict):
22
+ segment_size: str
23
+ shifts: int
24
+ overlap: float
25
+ segments_enabled: bool
26
+
27
+ class MDXCParams(TypedDict):
28
+ segment_size: int
29
+ batch_size: int
30
+ overlap: int
31
+
32
+ class ArchSpecificParams(TypedDict):
33
+ MDX: MDXParams
34
+ VR: VRParams
35
+ Demucs: DemucsParams
36
+ MDXC: MDXCParams
37
+
38
+ class Separator:
39
+ arch_specific_params: ArchSpecificParams
40
+ def __init__(
41
+ self,
42
+ log_level: int = logging.INFO,
43
+ log_formatter: logging.Formatter | None = None,
44
+ model_file_dir: str = "/tmp/audio-separator-models/",
45
+ output_dir: str | None = None,
46
+ output_format: str = "WAV",
47
+ normalization_threshold: float = 0.9,
48
+ output_single_stem: str | None = None,
49
+ invert_using_spec: bool = False,
50
+ sample_rate: int = 44100,
51
+ mdx_params: MDXParams = {
52
+ "hop_length": 1024,
53
+ "segment_size": 256,
54
+ "overlap": 0.25,
55
+ "batch_size": 1,
56
+ "enable_denoise": False,
57
+ },
58
+ vr_params: VRParams = {
59
+ "batch_size": 16,
60
+ "window_size": 512,
61
+ "aggression": 5,
62
+ "enable_tta": False,
63
+ "enable_post_process": False,
64
+ "post_process_threshold": 0.2,
65
+ "high_end_process": False,
66
+ },
67
+ demucs_params: DemucsParams = {
68
+ "segment_size": "Default",
69
+ "shifts": 2,
70
+ "overlap": 0.25,
71
+ "segments_enabled": True,
72
+ },
73
+ mdxc_params: MDXCParams = {"segment_size": 256, "batch_size": 1, "overlap": 8},
74
+ ) -> None: ...
75
+ def load_model(
76
+ self, model_filename: str = "model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt"
77
+ ) -> None: ...
78
+ def separate(self, audio_file_path: str) -> list[str]: ...
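
Note: a minimal sketch of how the Separator API stubbed above is typically driven; the output directory and model filename here are illustrative assumptions, not values taken from this commit.

from audio_separator.separator import Separator

# Split a song into stems; separate() returns the paths of the written files.
separator = Separator(output_dir="audio/intermediate", output_format="WAV")
separator.load_model("UVR-MDX-NET-Voc_FT.onnx")  # assumed model filename
stems = separator.separate("audio/input_song.wav")
print(stems)
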
src/typings/extra.py ADDED
@@ -0,0 +1,71 @@
1
+ from typing import Any, Callable, Literal, ParamSpec, Sequence, TypedDict, TypeVar
2
+
3
+ from os import PathLike
4
+
5
+ P = ParamSpec("P")
6
+ T = TypeVar("T")
7
+
8
+ StrOrBytesPath = str | bytes | PathLike[str] | PathLike[bytes]
9
+
10
+ DropdownChoices = Sequence[str | int | float | tuple[str, str | int | float]] | None
11
+
12
+ DropdownValue = (
13
+ str | int | float | Sequence[str | int | float] | Callable[..., Any] | None
14
+ )
15
+
16
+ InputType = Literal["yt", "local"]
17
+
18
+ F0Method = Literal["rmvpe", "mangio-crepe"]
19
+
20
+ InputAudioExt = Literal["mp3", "wav", "flac", "aac", "m4a", "ogg"]
21
+
22
+ OutputAudioExt = Literal["mp3", "wav", "flac", "adts", "ipod", "ogg"]
23
+
24
+
25
+ ModelsTable = list[list[str]]
26
+
27
+ ModelsTablePredicate = Callable[[dict[str, str | list[str]]], bool]
28
+
29
+
30
+ class ComponentVisibilityKwArgs(TypedDict):
31
+ visible: bool
32
+ value: Any
33
+
34
+
35
+ class UpdateDropdownArgs(TypedDict, total=False):
36
+ choices: DropdownChoices | None
37
+ value: DropdownValue | None
38
+
39
+
40
+ class TextBoxArgs(TypedDict, total=False):
41
+ value: str | None
42
+ placeholder: str | None
43
+
44
+
45
+ class TransferUpdateArgs(TypedDict, total=False):
46
+ value: str | None
47
+
48
+
49
+ RunPipelineHarnessArgs = tuple[
50
+ str, # song_input
51
+ str, # voice_model
52
+ int, # pitch_change_vocals
53
+ int, # pitch_change_all
54
+ float, # index_rate
55
+ int, # filter_radius
56
+ float, # rms_mix_rate
57
+ float, # protect
58
+ F0Method, # f0_method
59
+ int, # crepe_hop_length
60
+ float, # reverb_rm_size
61
+ float, # reverb_wet
62
+ float, # reverb_dry
63
+ float, # reverb_damping
64
+ int, # main_gain
65
+ int, # inst_gain
66
+ int, # backup_gain
67
+ int, # output_sr
68
+ InputAudioExt, # output_format
69
+ str, # output_name
70
+ bool, # return_files
71
+ ]
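
Note: the P and T aliases defined at the top of this module are the usual building blocks for typing decorators that preserve a wrapped function's signature. A small self-contained sketch for illustration (the decorator itself is hypothetical, not part of this commit):

from functools import wraps
from typing import Callable, ParamSpec, TypeVar

P = ParamSpec("P")
T = TypeVar("T")

def log_call(fn: Callable[P, T]) -> Callable[P, T]:
    # The wrapper keeps the exact parameter list and return type of fn.
    @wraps(fn)
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
        print(f"calling {fn.__name__}")
        return fn(*args, **kwargs)

    return wrapper
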
src/typings/gradio/__init__.pyi ADDED
@@ -0,0 +1,238 @@
1
+ from gradio import (
2
+ _simple_templates,
3
+ components,
4
+ layouts,
5
+ processing_utils,
6
+ templates,
7
+ themes,
8
+ )
9
+ from gradio.blocks import Blocks
10
+ from gradio.chat_interface import ChatInterface
11
+ from gradio.components import (
12
+ HTML,
13
+ JSON,
14
+ AnnotatedImage,
15
+ Annotatedimage,
16
+ Audio,
17
+ BarPlot,
18
+ Button,
19
+ Chatbot,
20
+ ChatMessage,
21
+ Checkbox,
22
+ CheckboxGroup,
23
+ Checkboxgroup,
24
+ ClearButton,
25
+ Code,
26
+ ColorPicker,
27
+ DataFrame,
28
+ Dataframe,
29
+ Dataset,
30
+ DateTime,
31
+ DownloadButton,
32
+ Dropdown,
33
+ DuplicateButton,
34
+ File,
35
+ FileExplorer,
36
+ Gallery,
37
+ Highlight,
38
+ HighlightedText,
39
+ Highlightedtext,
40
+ Image,
41
+ ImageEditor,
42
+ Json,
43
+ Label,
44
+ LinePlot,
45
+ LoginButton,
46
+ LogoutButton,
47
+ Markdown,
48
+ MessageDict,
49
+ Model3D,
50
+ MultimodalTextbox,
51
+ Number,
52
+ ParamViewer,
53
+ Plot,
54
+ Radio,
55
+ ScatterPlot,
56
+ Slider,
57
+ State,
58
+ Text,
59
+ Textbox,
60
+ Timer,
61
+ UploadButton,
62
+ Video,
63
+ component,
64
+ )
65
+ from gradio.components.audio import WaveformOptions
66
+ from gradio.components.image_editor import Brush, Eraser
67
+ from gradio.data_classes import FileData
68
+ from gradio.events import (
69
+ DeletedFileData,
70
+ EventData,
71
+ KeyUpData,
72
+ LikeData,
73
+ SelectData,
74
+ on,
75
+ )
76
+ from gradio.exceptions import Error
77
+ from gradio.external import load
78
+ from gradio.flagging import (
79
+ CSVLogger,
80
+ FlaggingCallback,
81
+ HuggingFaceDatasetSaver,
82
+ SimpleCSVLogger,
83
+ )
84
+ from gradio.helpers import (
85
+ Info,
86
+ Progress,
87
+ Warning,
88
+ make_waveform,
89
+ skip,
90
+ update,
91
+ )
92
+ from gradio.helpers import create_examples as Examples # noqa: N812
93
+ from gradio.interface import Interface, TabbedInterface, close_all
94
+ from gradio.layouts import Accordion, Column, Group, Row, Tab, TabItem, Tabs
95
+ from gradio.oauth import OAuthProfile, OAuthToken
96
+ from gradio.renderable import render
97
+ from gradio.routes import Request, mount_gradio_app
98
+ from gradio.templates import (
99
+ Files,
100
+ ImageMask,
101
+ List,
102
+ Matrix,
103
+ Mic,
104
+ Microphone,
105
+ Numpy,
106
+ Paint,
107
+ PlayableVideo,
108
+ Sketchpad,
109
+ TextArea,
110
+ )
111
+ from gradio.themes import Base as Theme
112
+ from gradio.utils import NO_RELOAD, FileSize, get_package_version, set_static_paths
113
+ from gradio.wasm_utils import IS_WASM
114
+
115
+ if not IS_WASM:
116
+ from gradio.cli import deploy
117
+ from gradio.ipython_ext import load_ipython_extension
118
+
119
+ __version__ = ...
120
+ __all__ = [
121
+ "_simple_templates",
122
+ "templates",
123
+ "processing_utils",
124
+ "components",
125
+ "layouts",
126
+ "themes",
127
+ "Blocks",
128
+ "ChatInterface",
129
+ "HTML",
130
+ "JSON",
131
+ "AnnotatedImage",
132
+ "Annotatedimage",
133
+ "Audio",
134
+ "BarPlot",
135
+ "Button",
136
+ "Chatbot",
137
+ "ChatMessage",
138
+ "Checkbox",
139
+ "CheckboxGroup",
140
+ "Checkboxgroup",
141
+ "ClearButton",
142
+ "Code",
143
+ "ColorPicker",
144
+ "DataFrame",
145
+ "Dataframe",
146
+ "Dataset",
147
+ "DateTime",
148
+ "DownloadButton",
149
+ "Dropdown",
150
+ "DuplicateButton",
151
+ "File",
152
+ "FileExplorer",
153
+ "Gallery",
154
+ "Highlight",
155
+ "HighlightedText",
156
+ "Highlightedtext",
157
+ "Image",
158
+ "ImageEditor",
159
+ "Json",
160
+ "Label",
161
+ "LinePlot",
162
+ "LoginButton",
163
+ "LogoutButton",
164
+ "Markdown",
165
+ "MessageDict",
166
+ "Model3D",
167
+ "MultimodalTextbox",
168
+ "Number",
169
+ "ParamViewer",
170
+ "Plot",
171
+ "Radio",
172
+ "ScatterPlot",
173
+ "Slider",
174
+ "State",
175
+ "Text",
176
+ "Textbox",
177
+ "Timer",
178
+ "UploadButton",
179
+ "Video",
180
+ "component",
181
+ "WaveformOptions",
182
+ "Brush",
183
+ "Eraser",
184
+ "FileData",
185
+ "DeletedFileData",
186
+ "EventData",
187
+ "KeyUpData",
188
+ "LikeData",
189
+ "SelectData",
190
+ "on",
191
+ "Error",
192
+ "load",
193
+ "CSVLogger",
194
+ "FlaggingCallback",
195
+ "HuggingFaceDatasetSaver",
196
+ "SimpleCSVLogger",
197
+ "Info",
198
+ "Progress",
199
+ "Warning",
200
+ "make_waveform",
201
+ "skip",
202
+ "update",
203
+ "Examples",
204
+ "Interface",
205
+ "TabbedInterface",
206
+ "close_all",
207
+ "Accordion",
208
+ "Column",
209
+ "Group",
210
+ "Row",
211
+ "Tab",
212
+ "TabItem",
213
+ "Tabs",
214
+ "OAuthProfile",
215
+ "OAuthToken",
216
+ "render",
217
+ "Request",
218
+ "mount_gradio_app",
219
+ "Files",
220
+ "ImageMask",
221
+ "List",
222
+ "Matrix",
223
+ "Mic",
224
+ "Microphone",
225
+ "Numpy",
226
+ "Paint",
227
+ "PlayableVideo",
228
+ "Sketchpad",
229
+ "TextArea",
230
+ "Theme",
231
+ "NO_RELOAD",
232
+ "FileSize",
233
+ "get_package_version",
234
+ "set_static_paths",
235
+ "IS_WASM",
236
+ "deploy",
237
+ "load_ipython_extension",
238
+ ]
src/typings/gradio/events.pyi ADDED
@@ -0,0 +1,374 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import (
4
+ TYPE_CHECKING,
5
+ AbstractSet,
6
+ Any,
7
+ Callable,
8
+ Dict,
9
+ List,
10
+ Literal,
11
+ Self,
12
+ Sequence,
13
+ Union,
14
+ )
15
+
16
+ import dataclasses
17
+
18
+ from gradio.data_classes import FileData, FileDataDict
19
+
20
+ if TYPE_CHECKING:
21
+ from gradio.blocks import Block, BlockContext, Component
22
+ from gradio.components import Timer
23
+
24
+ def set_cancel_events(
25
+ triggers: Sequence[EventListenerMethod],
26
+ cancels: None | dict[str, Any] | list[dict[str, Any]],
27
+ ) -> None: ...
28
+
29
+ class Dependency(dict[Any, Any]):
30
+
31
+ fn: Callable[..., Any]
32
+ associated_timer: Timer | None
33
+ then: Callable[..., Any]
34
+ success: Callable[..., Any]
35
+
36
+ def __init__(
37
+ self,
38
+ trigger: Any,
39
+ key_vals: Any,
40
+ dep_index: int,
41
+ fn: Callable[..., Any],
42
+ associated_timer: Timer | None = ...,
43
+ ) -> None:
44
+ """
45
+ The Dependency object is usually not created directly but is returned when an event listener is set up. It contains the configuration
46
+ data for the event listener, and can be used to set up additional event listeners that depend on the completion of the current event
47
+ listener using .then() and .success().
48
+
49
+ Demos: chatbot_consecutive, blocks_chained_events
50
+ """
51
+ ...
52
+
53
+ def __call__(self, *args: Any, **kwargs: Any) -> Any: ...
54
+
55
+ class EventData:
56
+ """
57
+ When gr.EventData or one of its subclasses is added as a type hint to an argument of a prediction function, a gr.EventData object will automatically be passed as the value of that argument.
58
+ The attributes of this object contain information about the event that triggered the listener. The gr.EventData object itself contains a `.target` attribute that refers to the component
59
+ that triggered the event, while subclasses of gr.EventData contain additional attributes that are different for each class.
60
+
61
+ Example:
62
+ import gradio as gr
63
+ with gr.Blocks() as demo:
64
+ table = gr.Dataframe([[1, 2, 3], [4, 5, 6]])
65
+ gallery = gr.Gallery([("cat.jpg", "Cat"), ("dog.jpg", "Dog")])
66
+ textbox = gr.Textbox("Hello World!")
67
+ statement = gr.Textbox()
68
+ def on_select(value, evt: gr.EventData):
69
+ return f"The {evt.target} component was selected, and its value was {value}."
70
+ table.select(on_select, table, statement)
71
+ gallery.select(on_select, gallery, statement)
72
+ textbox.select(on_select, textbox, statement)
73
+ demo.launch()
74
+ Demos: gallery_selections, tictactoe
75
+ """
76
+
77
+ target: Block | None
78
+ _data: Any
79
+
80
+ def __init__(self, target: Block | None, _data: Any) -> None:
81
+ """
82
+ Parameters:
83
+ target: The component object that triggered the event. Can be used to distinguish multiple components bound to the same listener.
84
+ """
85
+ ...
86
+
87
+ class SelectData(EventData):
88
+ """
89
+ The gr.SelectData class is a subclass of gr.EventData that specifically carries information about the `.select()` event. When gr.SelectData
90
+ is added as a type hint to an argument of an event listener method, a gr.SelectData object will automatically be passed as the value of that argument.
91
+ The attributes of this object contain information about the event that triggered the listener.
92
+
93
+ Example:
94
+ import gradio as gr
95
+ with gr.Blocks() as demo:
96
+ table = gr.Dataframe([[1, 2, 3], [4, 5, 6]])
97
+ gallery = gr.Gallery([("cat.jpg", "Cat"), ("dog.jpg", "Dog")])
98
+ textbox = gr.Textbox("Hello World!")
99
+ statement = gr.Textbox()
100
+ def on_select(evt: gr.SelectData):
101
+ return f"You selected {evt.value} at {evt.index} from {evt.target}"
102
+ table.select(on_select, table, statement)
103
+ gallery.select(on_select, gallery, statement)
104
+ textbox.select(on_select, textbox, statement)
105
+ demo.launch()
106
+ Demos: gallery_selections, tictactoe
107
+ """
108
+
109
+ index: int | tuple[int, int]
110
+ value: Any
111
+ row_value: list[Any] | None
112
+ col_value: list[Any] | None
113
+ selected: bool
114
+
115
+ def __init__(self, target: Block | None, data: Any) -> None: ...
116
+
117
+ class KeyUpData(EventData):
118
+ """
119
+ The gr.KeyUpData class is a subclass of gr.EventData that specifically carries information about the `.key_up()` event. When gr.KeyUpData
120
+ is added as a type hint to an argument of an event listener method, a gr.KeyUpData object will automatically be passed as the value of that argument.
121
+ The attributes of this object contain information about the event that triggered the listener.
122
+
123
+ Example:
124
+ import gradio as gr
125
+ def test(value, key_up_data: gr.KeyUpData):
126
+ return {
127
+ "component value": value,
128
+ "input value": key_up_data.input_value,
129
+ "key": key_up_data.key
130
+ }
131
+ with gr.Blocks() as demo:
132
+ d = gr.Dropdown(["abc", "def"], allow_custom_value=True)
133
+ t = gr.JSON()
134
+ d.key_up(test, d, t)
135
+ demo.launch()
136
+ Demos: dropdown_key_up
137
+ """
138
+
139
+ key: str
140
+ input_value: str
141
+
142
+ def __init__(self, target: Block | None, data: Any) -> None: ...
143
+
144
+ class DeletedFileData(EventData):
145
+ """
146
+ The gr.DeletedFileData class is a subclass of gr.EventData that specifically carries information about the `.delete()` event. When gr.DeletedFileData
147
+ is added as a type hint to an argument of an event listener method, a gr.DeletedFileData object will automatically be passed as the value of that argument.
148
+ The attributes of this object contain information about the event that triggered the listener.
149
+ Example:
150
+ import gradio as gr
151
+ def test(delete_data: gr.DeletedFileData):
152
+ return delete_data.file.path
153
+ with gr.Blocks() as demo:
154
+ files = gr.File(file_count="multiple")
155
+ deleted_file = gr.File()
156
+ files.delete(test, None, deleted_file)
157
+ demo.launch()
158
+ Demos: file_component_events
159
+ """
160
+
161
+ file: FileData
162
+
163
+ def __init__(self, target: Block | None, data: FileDataDict) -> None: ...
164
+
165
+ class LikeData(EventData):
166
+ """
167
+ The gr.LikeData class is a subclass of gr.EventData that specifically carries information about the `.like()` event. When gr.LikeData
168
+ is added as a type hint to an argument of an event listener method, a gr.LikeData object will automatically be passed as the value of that argument.
169
+ The attributes of this object contain information about the event that triggered the listener.
170
+ Example:
171
+ import gradio as gr
172
+ def test(value, like_data: gr.LikeData):
173
+ return {
174
+ "chatbot_value": value,
175
+ "liked_message": like_data.value,
176
+ "liked_index": like_data.index,
177
+ "liked_or_disliked_as_bool": like_data.liked
178
+ }
179
+ with gr.Blocks() as demo:
180
+ c = gr.Chatbot([("abc", "def")])
181
+ t = gr.JSON()
182
+ c.like(test, c, t)
183
+ demo.launch()
184
+ Demos: chatbot_core_components_simple
185
+ """
186
+
187
+ index: int | tuple[int, int]
188
+ value: Any
189
+ liked: bool
190
+
191
+ def __init__(self, target: Block | None, data: Any) -> None: ...
192
+
193
+ @dataclasses.dataclass
194
+ class EventListenerMethod:
195
+ block: Block | None
196
+ event_name: str
197
+
198
+ if TYPE_CHECKING:
199
+ EventListenerCallable = Callable[
200
+ [
201
+ Union[Callable[..., Any], None],
202
+ Union[Component, Sequence[Component], None],
203
+ Union[Block, Sequence[Block], Sequence[Component], Component, None],
204
+ Union[str, None, Literal[False]],
205
+ bool,
206
+ Literal["full", "minimal", "hidden"],
207
+ Union[bool, None],
208
+ bool,
209
+ int,
210
+ bool,
211
+ bool,
212
+ Union[Dict[str, Any], List[Dict[str, Any]], None],
213
+ Union[float, None],
214
+ Union[Literal["once", "multiple", "always_last"], None],
215
+ Union[str, None],
216
+ Union[int, None, Literal["default"]],
217
+ Union[str, None],
218
+ bool,
219
+ ],
220
+ Dependency,
221
+ ]
222
+
223
+ class EventListener(str):
224
+ has_trigger: bool
225
+ config_data: Callable[..., dict[str, Any]]
226
+ event_name: str
227
+ show_progress: Literal["full", "minimal", "hidden"]
228
+ trigger_after: int | None
229
+ trigger_only_on_success: bool
230
+ callback: Callable[..., Any] | None
231
+ doc: str
232
+ listener: Callable[..., Dependency]
233
+
234
+ def __new__(cls, event_name: str, *_args: Any, **_kwargs: Any) -> Self: ...
235
+ def __init__(
236
+ self,
237
+ event_name: str,
238
+ has_trigger: bool = ...,
239
+ config_data: Callable[..., dict[str, Any]] = ...,
240
+ show_progress: Literal["full", "minimal", "hidden"] = ...,
241
+ callback: Callable[..., Any] | None = ...,
242
+ trigger_after: int | None = ...,
243
+ trigger_only_on_success: bool = ...,
244
+ doc: str = ...,
245
+ ) -> None: ...
246
+ def set_doc(self, component: str) -> None: ...
247
+ def copy(self) -> EventListener: ...
248
+ @staticmethod
249
+ def _setup(
250
+ _event_name: str,
251
+ _has_trigger: bool,
252
+ _show_progress: Literal["full", "minimal", "hidden"],
253
+ _callback: Callable[..., Any] | None,
254
+ _trigger_after: int | None,
255
+ _trigger_only_on_success: bool,
256
+ ) -> Callable[..., Dependency]: ...
257
+
258
+ def on(
259
+ triggers: Sequence[EventListenerCallable] | EventListenerCallable | None = ...,
260
+ fn: Callable[..., Any] | None | Literal["decorator"] = ...,
261
+ inputs: (
262
+ Component
263
+ | BlockContext
264
+ | Sequence[Component | BlockContext]
265
+ | AbstractSet[Component | BlockContext]
266
+ | None
267
+ ) = ...,
268
+ outputs: (
269
+ Component
270
+ | BlockContext
271
+ | Sequence[Component | BlockContext]
272
+ | AbstractSet[Component | BlockContext]
273
+ | None
274
+ ) = ...,
275
+ *,
276
+ api_name: str | None | Literal[False] = ...,
277
+ scroll_to_output: bool = ...,
278
+ show_progress: Literal["full", "minimal", "hidden"] = ...,
279
+ queue: bool = ...,
280
+ batch: bool = ...,
281
+ max_batch_size: int = ...,
282
+ preprocess: bool = ...,
283
+ postprocess: bool = ...,
284
+ cancels: dict[str, Any] | list[dict[str, Any]] | None = ...,
285
+ trigger_mode: Literal["once", "multiple", "always_last"] | None = ...,
286
+ every: float | None = ...,
287
+ js: str | None = ...,
288
+ concurrency_limit: int | None | Literal["default"] = ...,
289
+ concurrency_id: str | None = ...,
290
+ show_api: bool = ...,
291
+ ) -> Dependency:
292
+ """
293
+ Sets up an event listener that triggers a function when the specified event(s) occur. This is especially
294
+ useful when the same function should be triggered by multiple events. Only a single API endpoint is generated
295
+ for all events in the triggers list.
296
+
297
+ Parameters:
298
+ triggers: List of triggers to listen to, e.g. [btn.click, number.change]. If None, will listen to changes to any inputs.
299
+ fn: the function to call when this event is triggered. Often a machine learning model's prediction function. Each parameter of the function corresponds to one input component, and the function should return a single value or a tuple of values, with each element in the tuple corresponding to one output component.
300
+ inputs: List of gradio.components to use as inputs. If the function takes no inputs, this should be an empty list.
301
+ outputs: List of gradio.components to use as outputs. If the function returns no outputs, this should be an empty list.
302
+ api_name: Defines how the endpoint appears in the API docs. Can be a string, None, or False. If False, the endpoint will not be exposed in the api docs. If set to None, the endpoint will be exposed in the api docs as an unnamed endpoint, although this behavior will be changed in Gradio 4.0. If set to a string, the endpoint will be exposed in the api docs with the given name.
303
+ scroll_to_output: If True, will scroll to output component on completion
304
+ show_progress: how to show the progress animation while event is running: "full" shows a spinner which covers the output component area as well as a runtime display in the upper right corner, "minimal" only shows the runtime display, "hidden" shows no progress animation at all
305
+ queue: If True, will place the request on the queue, if the queue has been enabled. If False, will not put this event on the queue, even if the queue has been enabled. If None, will use the queue setting of the gradio app.
306
+ batch: If True, then the function should process a batch of inputs, meaning that it should accept a list of input values for each parameter. The lists should be of equal length (and be up to length `max_batch_size`). The function is then *required* to return a tuple of lists (even if there is only 1 output component), with each list in the tuple corresponding to one output component.
307
+ max_batch_size: Maximum number of inputs to batch together if this is called from the queue (only relevant if batch=True)
308
+ preprocess: If False, will not run preprocessing of component data before running 'fn' (e.g. leaving it as a base64 string if this method is called with the `Image` component).
309
+ postprocess: If False, will not run postprocessing of component data before returning 'fn' output to the browser.
310
+ cancels: A list of other events to cancel when this listener is triggered. For example, setting cancels=[click_event] will cancel the click_event, where click_event is the return value of another components .click method. Functions that have not yet run (or generators that are iterating) will be cancelled, but functions that are currently running will be allowed to finish.
311
+ trigger_mode: If "once" (default for all events except `.change()`) would not allow any submissions while an event is pending. If set to "multiple", unlimited submissions are allowed while pending, and "always_last" (default for `.change()` and `.key_up()` events) would allow a second submission after the pending event is complete.
312
+ every: Will be deprecated in favor of gr.Timer. Run this event 'every' number of seconds while the client connection is open. Interpreted in seconds.
313
+ js: Optional frontend js method to run before running 'fn'. Input arguments for js method are values of 'inputs', return should be a list of values for output components.
314
+ concurrency_limit: If set, this is the maximum number of this event that can be running simultaneously. Can be set to None to mean no concurrency_limit (any number of this event can be running simultaneously). Set to "default" to use the default concurrency limit (defined by the `default_concurrency_limit` parameter in `Blocks.queue()`, which itself is 1 by default).
315
+ concurrency_id: If set, this is the id of the concurrency group. Events with the same concurrency_id will be limited by the lowest set concurrency_limit.
316
+ show_api: whether to show this event in the "view API" page of the Gradio app, or in the ".view_api()" method of the Gradio clients. Unlike setting api_name to False, setting show_api to False will still allow downstream apps as well as the Clients to use this event. If fn is None, show_api will automatically be set to False.
317
+ Example:
318
+ import gradio as gr
319
+ with gr.Blocks() as demo:
320
+ with gr.Row():
321
+ input = gr.Textbox()
322
+ button = gr.Button("Submit")
323
+ output = gr.Textbox()
324
+ gr.on(
325
+ triggers=[button.click, input.submit],
326
+ fn=lambda x: x,
327
+ inputs=[input],
328
+ outputs=[output]
329
+ )
330
+ demo.launch()
331
+ """
332
+ ...
333
+
334
+ class Events:
335
+ change: EventListener
336
+ input: EventListener
337
+ click: EventListener
338
+ double_click: EventListener
339
+ submit: EventListener
340
+ edit: EventListener
341
+ clear: EventListener
342
+ play: EventListener
343
+ pause: EventListener
344
+ stop: EventListener
345
+ end: EventListener
346
+ start_recording: EventListener
347
+ pause_recording: EventListener
348
+ stop_recording: EventListener
349
+ focus: EventListener
350
+ blur: EventListener
351
+ upload: EventListener
352
+ release: EventListener
353
+ select: EventListener
354
+ stream: EventListener
355
+ like: EventListener
356
+ load: EventListener
357
+ key_up: EventListener
358
+ apply: EventListener
359
+ delete: EventListener
360
+ tick: EventListener
361
+
362
+ __all__ = [
363
+ "set_cancel_events",
364
+ "Dependency",
365
+ "EventData",
366
+ "SelectData",
367
+ "KeyUpData",
368
+ "DeletedFileData",
369
+ "LikeData",
370
+ "EventListenerMethod",
371
+ "EventListener",
372
+ "on",
373
+ "Events",
374
+ ]
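
Note: the Dependency object documented above is what makes chained listeners possible. A minimal sketch of the .then() pattern (labels and functions are illustrative, not taken from this commit):

import gradio as gr

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    step_1 = gr.Textbox(label="step 1")
    step_2 = gr.Textbox(label="step 2")
    # click() returns a Dependency; .then() queues a second step that only
    # starts once the first one has finished.
    btn.click(lambda: "vocals separated", outputs=step_1).then(
        lambda: "vocals converted", outputs=step_2
    )

# demo.launch()
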
src/typings/pedalboard_native/io/__init__.pyi ADDED
@@ -0,0 +1,39 @@
1
+ from typing import Literal, overload
2
+
3
+ import numpy as np
4
+ from numpy.typing import NDArray
5
+
6
+ class AudioFile:
7
+ @staticmethod
8
+ @overload
9
+ def __new__(
10
+ cls: object, filename: str, mode: Literal["r"] = "r"
11
+ ) -> ReadableAudioFile: ...
12
+ @staticmethod
13
+ @overload
14
+ def __new__(
15
+ cls: object,
16
+ filename: str,
17
+ mode: Literal["w"],
18
+ samplerate: float | None = None,
19
+ num_channels: int = 1,
20
+ bit_depth: int = 16,
21
+ quality: str | float | None = None,
22
+ ) -> WriteableAudioFile: ...
23
+
24
+ class ReadableAudioFile(AudioFile):
25
+ def __enter__(self) -> ReadableAudioFile: ...
26
+ def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ...
27
+ def read(self, num_frames: float | int = 0) -> NDArray[np.float32]: ...
28
+ def tell(self) -> int: ...
29
+ @property
30
+ def frames(self) -> int: ...
31
+ @property
32
+ def num_channels(self) -> int: ...
33
+ @property
34
+ def samplerate(self) -> float | int: ...
35
+
36
+ class WriteableAudioFile(AudioFile):
37
+ def __enter__(self) -> WriteableAudioFile: ...
38
+ def __exit__(self, arg0: object, arg1: object, arg2: object) -> None: ...
39
+ def write(self, samples: NDArray[...]) -> None: ...
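
Note: a short sketch of how the pedalboard AudioFile API stubbed above reads and writes audio (file names are illustrative assumptions):

from pedalboard.io import AudioFile

# Read the whole file as a (num_channels, num_frames) float32 array.
with AudioFile("vocals.wav") as f:
    audio = f.read(f.frames)
    sr = f.samplerate

# Write it back out with the same channel count and sample rate.
with AudioFile("vocals_copy.wav", "w", samplerate=sr, num_channels=audio.shape[0]) as out:
    out.write(audio)
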
src/typings/soundfile/__init__.pyi ADDED
@@ -0,0 +1,34 @@
1
+ from typing import Literal
2
+
3
+ from os import PathLike
4
+
5
+ import numpy as np
6
+ from numpy.typing import NDArray
7
+
8
+ DEFAULT_NDARRAY = NDArray[np.float64 | np.float32 | np.int32 | np.int16]
9
+
10
+ def read(
11
+ file: int | str | PathLike[str] | PathLike[bytes],
12
+ frames: int = -1,
13
+ start: int = 0,
14
+ stop: int | None = None,
15
+ dtype: Literal["float64", "float32", "int32", "int16"] = "float64",
16
+ always_2d: bool = False,
17
+ fill_value: float | None = None,
18
+ out: DEFAULT_NDARRAY | None = None,
19
+ samplerate: int | None = None,
20
+ channels: int | None = None,
21
+ format: str | None = None,
22
+ subtype: str | None = None,
23
+ endian: Literal["FILE", "LITTLE", "BIG", "CPU"] | None = None,
24
+ closefd: bool | None = True,
25
+ ) -> tuple[DEFAULT_NDARRAY, int]: ...
26
+ def write(
27
+ file: int | str | PathLike[str] | PathLike[bytes],
28
+ data: DEFAULT_NDARRAY,
29
+ samplerate: int,
30
+ subtype: str | None = None,
31
+ endian: Literal["FILE", "LITTLE", "BIG", "CPU"] | None = None,
32
+ format: str | None = None,
33
+ closefd: bool | None = True,
34
+ ) -> None: ...
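
Note: the equivalent round trip with the soundfile functions stubbed above (file names are illustrative):

import soundfile as sf

# read() returns (frames, channels)-shaped data plus the sample rate.
data, sr = sf.read("input_song.wav", dtype="float32")
sf.write("input_song_copy.wav", data, sr)
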
src/typings/sox/__init__.pyi ADDED
@@ -0,0 +1,15 @@
1
+ from typing import Self
2
+
3
+ from pathlib import Path
4
+
5
+ from numpy.typing import NDArray
6
+
7
+ class Transformer:
8
+ def pitch(self, n_semitones: float, quick: bool = False) -> Self: ...
9
+ def build_array(
10
+ self,
11
+ input_filepath: str | Path | None = None,
12
+ input_array: NDArray[...] | None = None,
13
+ sample_rate_in: float | None = None,
14
+ extra_args: list[str] | None = None,
15
+ ) -> NDArray[...]: ...
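
Note: a minimal sketch of the sox.Transformer surface stubbed above, roughly how a backup-vocals array could be pitch-shifted (file name and shift amount are illustrative assumptions):

import soundfile as sf
import sox

audio, sr = sf.read("backup_vocals.wav", dtype="float32")

# Shift up by two semitones and render the result as an in-memory array.
tfm = sox.Transformer()
tfm.pitch(2.0)
shifted = tfm.build_array(input_array=audio, sample_rate_in=sr)
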
src/typings/yt_dlp/__init__.pyi ADDED
@@ -0,0 +1,25 @@
1
+ from typing import Any, Self
2
+
3
+ class YoutubeDL:
4
+ def __init__(
5
+ self, params: dict[str, Any] | None = None, auto_init: bool = True
6
+ ) -> None: ...
7
+ def extract_info(
8
+ self,
9
+ url: str,
10
+ download: bool = True,
11
+ ie_key: str | None = None,
12
+ extra_info: dict[str, Any] | None = None,
13
+ process: bool = True,
14
+ force_generic_extractor: bool = False,
15
+ ) -> dict[str, Any]: ...
16
+ def prepare_filename(
17
+ self,
18
+ info_dict: dict[str, Any],
19
+ dir_type: str = "",
20
+ *,
21
+ outtmpl: str | None = None,
22
+ warn: bool = False,
23
+ ) -> str: ...
24
+ def __enter__(self) -> Self: ...
25
+ def __exit__(self, *args: Any) -> None: ...
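
Note: a minimal sketch of the yt_dlp surface stubbed above (URL and options are illustrative assumptions):

from yt_dlp import YoutubeDL

ydl_opts = {"format": "bestaudio", "outtmpl": "%(title)s.%(ext)s"}
with YoutubeDL(ydl_opts) as ydl:
    # download=False only resolves metadata; pass True to actually fetch audio.
    info = ydl.extract_info("https://www.youtube.com/watch?v=dQw4w9WgXcQ", download=False)
    print(ydl.prepare_filename(info))
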
src/vc/configs/32k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/32k_v2.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,8,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [20,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/40k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/48k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
src/vc/configs/48k_v2.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [12,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [24,20,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
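
Note: a quick consistency check that holds for all five configs above: the product of the generator's upsample_rates equals the STFT hop_length, so sampling_rate / hop_length is the model's frame rate. A small sketch to verify (the path assumes the repository layout shown in this commit):

import json
from math import prod

with open("src/vc/configs/48k_v2.json") as f:
    cfg = json.load(f)

hop = cfg["data"]["hop_length"]         # 480
rates = cfg["model"]["upsample_rates"]  # [12, 10, 2, 2] -> product 480
assert prod(rates) == hop
print(cfg["data"]["sampling_rate"] / hop, "frames per second")  # 100.0
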
src/vc/infer_pack/attentions.py ADDED
@@ -0,0 +1,417 @@
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack import modules
10
+ from vc.infer_pack.modules import LayerNorm
11
+
12
+
13
+ class Encoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ hidden_channels,
17
+ filter_channels,
18
+ n_heads,
19
+ n_layers,
20
+ kernel_size=1,
21
+ p_dropout=0.0,
22
+ window_size=10,
23
+ **kwargs
24
+ ):
25
+ super().__init__()
26
+ self.hidden_channels = hidden_channels
27
+ self.filter_channels = filter_channels
28
+ self.n_heads = n_heads
29
+ self.n_layers = n_layers
30
+ self.kernel_size = kernel_size
31
+ self.p_dropout = p_dropout
32
+ self.window_size = window_size
33
+
34
+ self.drop = nn.Dropout(p_dropout)
35
+ self.attn_layers = nn.ModuleList()
36
+ self.norm_layers_1 = nn.ModuleList()
37
+ self.ffn_layers = nn.ModuleList()
38
+ self.norm_layers_2 = nn.ModuleList()
39
+ for i in range(self.n_layers):
40
+ self.attn_layers.append(
41
+ MultiHeadAttention(
42
+ hidden_channels,
43
+ hidden_channels,
44
+ n_heads,
45
+ p_dropout=p_dropout,
46
+ window_size=window_size,
47
+ )
48
+ )
49
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
50
+ self.ffn_layers.append(
51
+ FFN(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ filter_channels,
55
+ kernel_size,
56
+ p_dropout=p_dropout,
57
+ )
58
+ )
59
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
60
+
61
+ def forward(self, x, x_mask):
62
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
63
+ x = x * x_mask
64
+ for i in range(self.n_layers):
65
+ y = self.attn_layers[i](x, x, attn_mask)
66
+ y = self.drop(y)
67
+ x = self.norm_layers_1[i](x + y)
68
+
69
+ y = self.ffn_layers[i](x, x_mask)
70
+ y = self.drop(y)
71
+ x = self.norm_layers_2[i](x + y)
72
+ x = x * x_mask
73
+ return x
74
+
75
+
76
+ class Decoder(nn.Module):
77
+ def __init__(
78
+ self,
79
+ hidden_channels,
80
+ filter_channels,
81
+ n_heads,
82
+ n_layers,
83
+ kernel_size=1,
84
+ p_dropout=0.0,
85
+ proximal_bias=False,
86
+ proximal_init=True,
87
+ **kwargs
88
+ ):
89
+ super().__init__()
90
+ self.hidden_channels = hidden_channels
91
+ self.filter_channels = filter_channels
92
+ self.n_heads = n_heads
93
+ self.n_layers = n_layers
94
+ self.kernel_size = kernel_size
95
+ self.p_dropout = p_dropout
96
+ self.proximal_bias = proximal_bias
97
+ self.proximal_init = proximal_init
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.self_attn_layers = nn.ModuleList()
101
+ self.norm_layers_0 = nn.ModuleList()
102
+ self.encdec_attn_layers = nn.ModuleList()
103
+ self.norm_layers_1 = nn.ModuleList()
104
+ self.ffn_layers = nn.ModuleList()
105
+ self.norm_layers_2 = nn.ModuleList()
106
+ for i in range(self.n_layers):
107
+ self.self_attn_layers.append(
108
+ MultiHeadAttention(
109
+ hidden_channels,
110
+ hidden_channels,
111
+ n_heads,
112
+ p_dropout=p_dropout,
113
+ proximal_bias=proximal_bias,
114
+ proximal_init=proximal_init,
115
+ )
116
+ )
117
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
118
+ self.encdec_attn_layers.append(
119
+ MultiHeadAttention(
120
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
121
+ )
122
+ )
123
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
124
+ self.ffn_layers.append(
125
+ FFN(
126
+ hidden_channels,
127
+ hidden_channels,
128
+ filter_channels,
129
+ kernel_size,
130
+ p_dropout=p_dropout,
131
+ causal=True,
132
+ )
133
+ )
134
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
135
+
136
+ def forward(self, x, x_mask, h, h_mask):
137
+ """
138
+ x: decoder input
139
+ h: encoder output
140
+ """
141
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
142
+ device=x.device, dtype=x.dtype
143
+ )
144
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
145
+ x = x * x_mask
146
+ for i in range(self.n_layers):
147
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
148
+ y = self.drop(y)
149
+ x = self.norm_layers_0[i](x + y)
150
+
151
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
152
+ y = self.drop(y)
153
+ x = self.norm_layers_1[i](x + y)
154
+
155
+ y = self.ffn_layers[i](x, x_mask)
156
+ y = self.drop(y)
157
+ x = self.norm_layers_2[i](x + y)
158
+ x = x * x_mask
159
+ return x
160
+
161
+
162
+ class MultiHeadAttention(nn.Module):
163
+ def __init__(
164
+ self,
165
+ channels,
166
+ out_channels,
167
+ n_heads,
168
+ p_dropout=0.0,
169
+ window_size=None,
170
+ heads_share=True,
171
+ block_length=None,
172
+ proximal_bias=False,
173
+ proximal_init=False,
174
+ ):
175
+ super().__init__()
176
+ assert channels % n_heads == 0
177
+
178
+ self.channels = channels
179
+ self.out_channels = out_channels
180
+ self.n_heads = n_heads
181
+ self.p_dropout = p_dropout
182
+ self.window_size = window_size
183
+ self.heads_share = heads_share
184
+ self.block_length = block_length
185
+ self.proximal_bias = proximal_bias
186
+ self.proximal_init = proximal_init
187
+ self.attn = None
188
+
189
+ self.k_channels = channels // n_heads
190
+ self.conv_q = nn.Conv1d(channels, channels, 1)
191
+ self.conv_k = nn.Conv1d(channels, channels, 1)
192
+ self.conv_v = nn.Conv1d(channels, channels, 1)
193
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
194
+ self.drop = nn.Dropout(p_dropout)
195
+
196
+ if window_size is not None:
197
+ n_heads_rel = 1 if heads_share else n_heads
198
+ rel_stddev = self.k_channels**-0.5
199
+ self.emb_rel_k = nn.Parameter(
200
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
201
+ * rel_stddev
202
+ )
203
+ self.emb_rel_v = nn.Parameter(
204
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
+ * rel_stddev
206
+ )
207
+
208
+ nn.init.xavier_uniform_(self.conv_q.weight)
209
+ nn.init.xavier_uniform_(self.conv_k.weight)
210
+ nn.init.xavier_uniform_(self.conv_v.weight)
211
+ if proximal_init:
212
+ with torch.no_grad():
213
+ self.conv_k.weight.copy_(self.conv_q.weight)
214
+ self.conv_k.bias.copy_(self.conv_q.bias)
215
+
216
+ def forward(self, x, c, attn_mask=None):
217
+ q = self.conv_q(x)
218
+ k = self.conv_k(c)
219
+ v = self.conv_v(c)
220
+
221
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
222
+
223
+ x = self.conv_o(x)
224
+ return x
225
+
226
+ def attention(self, query, key, value, mask=None):
227
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
228
+ b, d, t_s, t_t = (*key.size(), query.size(2))
229
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
230
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
231
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
232
+
233
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
234
+ if self.window_size is not None:
235
+ assert (
236
+ t_s == t_t
237
+ ), "Relative attention is only available for self-attention."
238
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
239
+ rel_logits = self._matmul_with_relative_keys(
240
+ query / math.sqrt(self.k_channels), key_relative_embeddings
241
+ )
242
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
243
+ scores = scores + scores_local
244
+ if self.proximal_bias:
245
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
246
+ scores = scores + self._attention_bias_proximal(t_s).to(
247
+ device=scores.device, dtype=scores.dtype
248
+ )
249
+ if mask is not None:
250
+ scores = scores.masked_fill(mask == 0, -1e4)
251
+ if self.block_length is not None:
252
+ assert (
253
+ t_s == t_t
254
+ ), "Local attention is only available for self-attention."
255
+ block_mask = (
256
+ torch.ones_like(scores)
257
+ .triu(-self.block_length)
258
+ .tril(self.block_length)
259
+ )
260
+ scores = scores.masked_fill(block_mask == 0, -1e4)
261
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
262
+ p_attn = self.drop(p_attn)
263
+ output = torch.matmul(p_attn, value)
264
+ if self.window_size is not None:
265
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
266
+ value_relative_embeddings = self._get_relative_embeddings(
267
+ self.emb_rel_v, t_s
268
+ )
269
+ output = output + self._matmul_with_relative_values(
270
+ relative_weights, value_relative_embeddings
271
+ )
272
+ output = (
273
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
274
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
275
+ return output, p_attn
276
+
277
+ def _matmul_with_relative_values(self, x, y):
278
+ """
279
+ x: [b, h, l, m]
280
+ y: [h or 1, m, d]
281
+ ret: [b, h, l, d]
282
+ """
283
+ ret = torch.matmul(x, y.unsqueeze(0))
284
+ return ret
285
+
286
+ def _matmul_with_relative_keys(self, x, y):
287
+ """
288
+ x: [b, h, l, d]
289
+ y: [h or 1, m, d]
290
+ ret: [b, h, l, m]
291
+ """
292
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
293
+ return ret
294
+
295
+ def _get_relative_embeddings(self, relative_embeddings, length):
296
+ max_relative_position = 2 * self.window_size + 1
297
+ # Pad first before slice to avoid using cond ops.
298
+ pad_length = max(length - (self.window_size + 1), 0)
299
+ slice_start_position = max((self.window_size + 1) - length, 0)
300
+ slice_end_position = slice_start_position + 2 * length - 1
301
+ if pad_length > 0:
302
+ padded_relative_embeddings = F.pad(
303
+ relative_embeddings,
304
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
305
+ )
306
+ else:
307
+ padded_relative_embeddings = relative_embeddings
308
+ used_relative_embeddings = padded_relative_embeddings[
309
+ :, slice_start_position:slice_end_position
310
+ ]
311
+ return used_relative_embeddings
312
+
313
+ def _relative_position_to_absolute_position(self, x):
314
+ """
315
+ x: [b, h, l, 2*l-1]
316
+ ret: [b, h, l, l]
317
+ """
318
+ batch, heads, length, _ = x.size()
319
+ # Concat columns of pad to shift from relative to absolute indexing.
320
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
321
+
322
+ # Concat extra elements so as to add up to shape (len+1, 2*len-1).
323
+ x_flat = x.view([batch, heads, length * 2 * length])
324
+ x_flat = F.pad(
325
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
326
+ )
327
+
328
+ # Reshape and slice out the padded elements.
329
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
330
+ :, :, :length, length - 1 :
331
+ ]
332
+ return x_final
333
+
334
+ def _absolute_position_to_relative_position(self, x):
335
+ """
336
+ x: [b, h, l, l]
337
+ ret: [b, h, l, 2*l-1]
338
+ """
339
+ batch, heads, length, _ = x.size()
340
+ # pad along column
341
+ x = F.pad(
342
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
343
+ )
344
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
345
+ # add 0's in the beginning that will skew the elements after reshape
346
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
347
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
348
+ return x_final
349
+
350
+ def _attention_bias_proximal(self, length):
351
+ """Bias for self-attention to encourage attention to close positions.
352
+ Args:
353
+ length: an integer scalar.
354
+ Returns:
355
+ a Tensor with shape [1, 1, length, length]
356
+ """
357
+ r = torch.arange(length, dtype=torch.float32)
358
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
359
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
360
+
361
+
362
+ class FFN(nn.Module):
363
+ def __init__(
364
+ self,
365
+ in_channels,
366
+ out_channels,
367
+ filter_channels,
368
+ kernel_size,
369
+ p_dropout=0.0,
370
+ activation=None,
371
+ causal=False,
372
+ ):
373
+ super().__init__()
374
+ self.in_channels = in_channels
375
+ self.out_channels = out_channels
376
+ self.filter_channels = filter_channels
377
+ self.kernel_size = kernel_size
378
+ self.p_dropout = p_dropout
379
+ self.activation = activation
380
+ self.causal = causal
381
+
382
+ if causal:
383
+ self.padding = self._causal_padding
384
+ else:
385
+ self.padding = self._same_padding
386
+
387
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
388
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
389
+ self.drop = nn.Dropout(p_dropout)
390
+
391
+ def forward(self, x, x_mask):
392
+ x = self.conv_1(self.padding(x * x_mask))
393
+ if self.activation == "gelu":
394
+ x = x * torch.sigmoid(1.702 * x)
395
+ else:
396
+ x = torch.relu(x)
397
+ x = self.drop(x)
398
+ x = self.conv_2(self.padding(x * x_mask))
399
+ return x * x_mask
400
+
401
+ def _causal_padding(self, x):
402
+ if self.kernel_size == 1:
403
+ return x
404
+ pad_l = self.kernel_size - 1
405
+ pad_r = 0
406
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
407
+ x = F.pad(x, commons.convert_pad_shape(padding))
408
+ return x
409
+
410
+ def _same_padding(self, x):
411
+ if self.kernel_size == 1:
412
+ return x
413
+ pad_l = (self.kernel_size - 1) // 2
414
+ pad_r = self.kernel_size // 2
415
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
416
+ x = F.pad(x, commons.convert_pad_shape(padding))
417
+ return x
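
Note: a small smoke test for the Encoder defined above (hyperparameters mirror the configs in this commit; batch size and sequence length are arbitrary):

import torch

from vc.infer_pack.attentions import Encoder

enc = Encoder(
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.0,
)
# Inputs are [batch, hidden_channels, time]; the mask is [batch, 1, time].
x = torch.randn(1, 192, 50)
x_mask = torch.ones(1, 1, 50)
out = enc(x, x_mask)
print(out.shape)  # torch.Size([1, 192, 50])
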
src/vc/infer_pack/commons.py ADDED
@@ -0,0 +1,166 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
+ def init_weights(m, mean=0.0, std=0.01):
9
+ classname = m.__class__.__name__
10
+ if classname.find("Conv") != -1:
11
+ m.weight.data.normal_(mean, std)
12
+
13
+
14
+ def get_padding(kernel_size, dilation=1):
15
+ return int((kernel_size * dilation - dilation) / 2)
16
+
17
+
18
+ def convert_pad_shape(pad_shape):
19
+ l = pad_shape[::-1]
20
+ pad_shape = [item for sublist in l for item in sublist]
21
+ return pad_shape
22
+
23
+
24
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
25
+ """KL(P||Q)"""
26
+ kl = (logs_q - logs_p) - 0.5
27
+ kl += (
28
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
29
+ )
30
+ return kl
31
+
32
+
33
+ def rand_gumbel(shape):
34
+ """Sample from the Gumbel distribution, protect from overflows."""
35
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
36
+ return -torch.log(-torch.log(uniform_samples))
37
+
38
+
39
+ def rand_gumbel_like(x):
40
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
41
+ return g
42
+
43
+
44
+ def slice_segments(x, ids_str, segment_size=4):
45
+ ret = torch.zeros_like(x[:, :, :segment_size])
46
+ for i in range(x.size(0)):
47
+ idx_str = ids_str[i]
48
+ idx_end = idx_str + segment_size
49
+ ret[i] = x[i, :, idx_str:idx_end]
50
+ return ret
51
+
52
+
53
+ def slice_segments2(x, ids_str, segment_size=4):
54
+ ret = torch.zeros_like(x[:, :segment_size])
55
+ for i in range(x.size(0)):
56
+ idx_str = ids_str[i]
57
+ idx_end = idx_str + segment_size
58
+ ret[i] = x[i, idx_str:idx_end]
59
+ return ret
60
+
61
+
62
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
63
+ b, d, t = x.size()
64
+ if x_lengths is None:
65
+ x_lengths = t
66
+ ids_str_max = x_lengths - segment_size + 1
67
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
68
+ ret = slice_segments(x, ids_str, segment_size)
69
+ return ret, ids_str
70
+
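# A minimal usage sketch (shapes are illustrative): rand_slice_segments picks one random
# window of segment_size frames per batch item, which is how training crops latents before decoding.
import torch
x = torch.randn(2, 192, 100)                       # [batch, channels, frames]
lengths = torch.tensor([100, 80])
seg, ids_str = rand_slice_segments(x, lengths, segment_size=32)
# seg.shape == (2, 192, 32); ids_str holds the random start frame chosen for each item.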
71
+
72
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
73
+ position = torch.arange(length, dtype=torch.float)
74
+ num_timescales = channels // 2
75
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
76
+ num_timescales - 1
77
+ )
78
+ inv_timescales = min_timescale * torch.exp(
79
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
80
+ )
81
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
82
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
83
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
84
+ signal = signal.view(1, channels, length)
85
+ return signal
86
+
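# A quick sketch of the timing signal above: it is the usual sinusoidal positional encoding,
# sines in the first channels // 2 rows and cosines in the rest, shaped [1, channels, length].
signal = get_timing_signal_1d(50, 192)             # shape (1, 192, 50)
# add_timing_signal_1d(x) adds this to a [b, 192, 50] tensor x;
# cat_timing_signal_1d(x) concatenates it along the channel axis instead.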
87
+
88
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
89
+ b, channels, length = x.size()
90
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
91
+ return x + signal.to(dtype=x.dtype, device=x.device)
92
+
93
+
94
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
95
+ b, channels, length = x.size()
96
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
97
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
98
+
99
+
100
+ def subsequent_mask(length):
101
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
102
+ return mask
103
+
104
+
105
+ @torch.jit.script
106
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
107
+ n_channels_int = n_channels[0]
108
+ in_act = input_a + input_b
109
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
110
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
111
+ acts = t_act * s_act
112
+ return acts
113
+
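# A minimal sketch of the gated activation above (WaveNet-style); note that n_channels is
# passed as a one-element IntTensor because the function is compiled with torch.jit.script.
import torch
a = torch.randn(1, 2 * 64, 100)
b = torch.randn(1, 2 * 64, 100)
acts = fused_add_tanh_sigmoid_multiply(a, b, torch.IntTensor([64]))
# acts.shape == (1, 64, 100): tanh of the first 64 channels of (a + b) times sigmoid of the rest.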
114
+
115
+ def convert_pad_shape(pad_shape):
116
+ l = pad_shape[::-1]
117
+ pad_shape = [item for sublist in l for item in sublist]
118
+ return pad_shape
119
+
120
+
121
+ def shift_1d(x):
122
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
123
+ return x
124
+
125
+
126
+ def sequence_mask(length, max_length=None):
127
+ if max_length is None:
128
+ max_length = length.max()
129
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
130
+ return x.unsqueeze(0) < length.unsqueeze(1)
131
+
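# A small worked example of sequence_mask: per-item lengths become a boolean [batch, max_len] mask.
import torch
mask = sequence_mask(torch.tensor([3, 5]), max_length=5)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True,  True]])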
132
+
133
+ def generate_path(duration, mask):
134
+ """
135
+ duration: [b, 1, t_x]
136
+ mask: [b, 1, t_y, t_x]
137
+ """
138
+ device = duration.device
139
+
140
+ b, _, t_y, t_x = mask.shape
141
+ cum_duration = torch.cumsum(duration, -1)
142
+
143
+ cum_duration_flat = cum_duration.view(b * t_x)
144
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
145
+ path = path.view(b, t_x, t_y)
146
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
147
+ path = path.unsqueeze(1).transpose(2, 3) * mask
148
+ return path
149
+
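# A minimal sketch of generate_path with two text steps of duration 2 and 3 frames: the result
# is a hard monotonic alignment of shape [b, 1, t_y, t_x] in which each text step owns its frames.
import torch
duration = torch.tensor([[[2.0, 3.0]]])            # [b=1, 1, t_x=2]
mask = torch.ones(1, 1, 5, 2)                      # [b, 1, t_y=5, t_x=2]
path = generate_path(duration, mask)
# path[0, 0, :, 0] == tensor([1., 1., 0., 0., 0.])   # frames 0-1 -> text step 0
# path[0, 0, :, 1] == tensor([0., 0., 1., 1., 1.])   # frames 2-4 -> text step 1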
150
+
151
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
152
+ if isinstance(parameters, torch.Tensor):
153
+ parameters = [parameters]
154
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
155
+ norm_type = float(norm_type)
156
+ if clip_value is not None:
157
+ clip_value = float(clip_value)
158
+
159
+ total_norm = 0
160
+ for p in parameters:
161
+ param_norm = p.grad.data.norm(norm_type)
162
+ total_norm += param_norm.item() ** norm_type
163
+ if clip_value is not None:
164
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
165
+ total_norm = total_norm ** (1.0 / norm_type)
166
+ return total_norm
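# A minimal usage sketch, assuming `net_g` and `optim` are a training model and its optimizer:
# clip_grad_value_ clamps each gradient element to [-clip_value, clip_value] and returns the
# total gradient norm, so with clip_value=None it can also be used purely for norm logging.
grad_norm = clip_grad_value_(net_g.parameters(), clip_value=1.0)
optim.step()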
src/vc/infer_pack/models.py ADDED
@@ -0,0 +1,1128 @@
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from vc.infer_pack import modules
7
+ from vc.infer_pack import attentions
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vc.infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from vc.infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (
337
+ f0_buf / self.sampling_rate
338
+ ) % 1 # the % 1 means the n_har harmonic products cannot be optimized away afterwards
339
+ rand_ini = torch.rand(
340
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
341
+ )
342
+ rand_ini[:, 0] = 0
343
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
344
+ tmp_over_one = torch.cumsum(
345
+ rad_values, 1
346
+ ) # a % 1 here would keep the following cumsum from being optimized further
347
+ tmp_over_one *= upp
348
+ tmp_over_one = F.interpolate(
349
+ tmp_over_one.transpose(2, 1),
350
+ scale_factor=upp,
351
+ mode="linear",
352
+ align_corners=True,
353
+ ).transpose(2, 1)
354
+ rad_values = F.interpolate(
355
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
356
+ ).transpose(
357
+ 2, 1
358
+ ) #######
359
+ tmp_over_one %= 1
360
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
361
+ cumsum_shift = torch.zeros_like(rad_values)
362
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
363
+ sine_waves = torch.sin(
364
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
365
+ )
366
+ sine_waves = sine_waves * self.sine_amp
367
+ uv = self._f02uv(f0)
368
+ uv = F.interpolate(
369
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
370
+ ).transpose(2, 1)
371
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
372
+ noise = noise_amp * torch.randn_like(sine_waves)
373
+ sine_waves = sine_waves * uv + noise
374
+ return sine_waves, uv, noise
375
+
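# A minimal sketch of the idea in SineGen.forward above: the sine is generated by accumulating
# the instantaneous phase f0 / sampling_rate frame by frame and taking sin of the running sum
# (the harmonics just multiply f0 by 2, 3, ... before the same accumulation).
import math
import torch
sr = 40000
f0 = torch.full((1, 100, 1), 220.0)                # 100 frames of a 220 Hz tone
phase = torch.cumsum(f0 / sr, dim=1)               # corresponds to cumsum(rad_values) above
sine = 0.1 * torch.sin(2 * math.pi * phase)        # sine_amp defaults to 0.1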
376
+
377
+ class SourceModuleHnNSF(torch.nn.Module):
378
+ """SourceModule for hn-nsf
379
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
380
+ add_noise_std=0.003, voiced_threshod=0)
381
+ sampling_rate: sampling_rate in Hz
382
+ harmonic_num: number of harmonic above F0 (default: 0)
383
+ sine_amp: amplitude of sine source signal (default: 0.1)
384
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
385
+ note that amplitude of noise in unvoiced is decided
386
+ by sine_amp
387
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
388
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
389
+ F0_sampled (batchsize, length, 1)
390
+ Sine_source (batchsize, length, 1)
391
+ noise_source (batchsize, length, 1)
392
+ uv (batchsize, length, 1)
393
+ """
394
+
395
+ def __init__(
396
+ self,
397
+ sampling_rate,
398
+ harmonic_num=0,
399
+ sine_amp=0.1,
400
+ add_noise_std=0.003,
401
+ voiced_threshod=0,
402
+ is_half=True,
403
+ ):
404
+ super(SourceModuleHnNSF, self).__init__()
405
+
406
+ self.sine_amp = sine_amp
407
+ self.noise_std = add_noise_std
408
+ self.is_half = is_half
409
+ # to produce sine waveforms
410
+ self.l_sin_gen = SineGen(
411
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
412
+ )
413
+
414
+ # to merge source harmonics into a single excitation
415
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
416
+ self.l_tanh = torch.nn.Tanh()
417
+
418
+ def forward(self, x, upp=None):
419
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
420
+ if self.is_half:
421
+ sine_wavs = sine_wavs.half()
422
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
423
+ return sine_merge, None, None # noise, uv
424
+
425
+
426
+ class GeneratorNSF(torch.nn.Module):
427
+ def __init__(
428
+ self,
429
+ initial_channel,
430
+ resblock,
431
+ resblock_kernel_sizes,
432
+ resblock_dilation_sizes,
433
+ upsample_rates,
434
+ upsample_initial_channel,
435
+ upsample_kernel_sizes,
436
+ gin_channels,
437
+ sr,
438
+ is_half=False,
439
+ ):
440
+ super(GeneratorNSF, self).__init__()
441
+ self.num_kernels = len(resblock_kernel_sizes)
442
+ self.num_upsamples = len(upsample_rates)
443
+
444
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
445
+ self.m_source = SourceModuleHnNSF(
446
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
447
+ )
448
+ self.noise_convs = nn.ModuleList()
449
+ self.conv_pre = Conv1d(
450
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
451
+ )
452
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
453
+
454
+ self.ups = nn.ModuleList()
455
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
456
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
457
+ self.ups.append(
458
+ weight_norm(
459
+ ConvTranspose1d(
460
+ upsample_initial_channel // (2**i),
461
+ upsample_initial_channel // (2 ** (i + 1)),
462
+ k,
463
+ u,
464
+ padding=(k - u) // 2,
465
+ )
466
+ )
467
+ )
468
+ if i + 1 < len(upsample_rates):
469
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
470
+ self.noise_convs.append(
471
+ Conv1d(
472
+ 1,
473
+ c_cur,
474
+ kernel_size=stride_f0 * 2,
475
+ stride=stride_f0,
476
+ padding=stride_f0 // 2,
477
+ )
478
+ )
479
+ else:
480
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
481
+
482
+ self.resblocks = nn.ModuleList()
483
+ for i in range(len(self.ups)):
484
+ ch = upsample_initial_channel // (2 ** (i + 1))
485
+ for j, (k, d) in enumerate(
486
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
487
+ ):
488
+ self.resblocks.append(resblock(ch, k, d))
489
+
490
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
491
+ self.ups.apply(init_weights)
492
+
493
+ if gin_channels != 0:
494
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
495
+
496
+ self.upp = np.prod(upsample_rates)
497
+
498
+ def forward(self, x, f0, g=None):
499
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
500
+ har_source = har_source.transpose(1, 2)
501
+ x = self.conv_pre(x)
502
+ if g is not None:
503
+ x = x + self.cond(g)
504
+
505
+ for i in range(self.num_upsamples):
506
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
507
+ x = self.ups[i](x)
508
+ x_source = self.noise_convs[i](har_source)
509
+ x = x + x_source
510
+ xs = None
511
+ for j in range(self.num_kernels):
512
+ if xs is None:
513
+ xs = self.resblocks[i * self.num_kernels + j](x)
514
+ else:
515
+ xs += self.resblocks[i * self.num_kernels + j](x)
516
+ x = xs / self.num_kernels
517
+ x = F.leaky_relu(x)
518
+ x = self.conv_post(x)
519
+ x = torch.tanh(x)
520
+ return x
521
+
522
+ def remove_weight_norm(self):
523
+ for l in self.ups:
524
+ remove_weight_norm(l)
525
+ for l in self.resblocks:
526
+ l.remove_weight_norm()
527
+
528
+
529
+ sr2sr = {
530
+ "32k": 32000,
531
+ "40k": 40000,
532
+ "48k": 48000,
533
+ }
534
+
535
+
536
+ class SynthesizerTrnMs256NSFsid(nn.Module):
537
+ def __init__(
538
+ self,
539
+ spec_channels,
540
+ segment_size,
541
+ inter_channels,
542
+ hidden_channels,
543
+ filter_channels,
544
+ n_heads,
545
+ n_layers,
546
+ kernel_size,
547
+ p_dropout,
548
+ resblock,
549
+ resblock_kernel_sizes,
550
+ resblock_dilation_sizes,
551
+ upsample_rates,
552
+ upsample_initial_channel,
553
+ upsample_kernel_sizes,
554
+ spk_embed_dim,
555
+ gin_channels,
556
+ sr,
557
+ **kwargs
558
+ ):
559
+ super().__init__()
560
+ if type(sr) == type("strr"):
561
+ sr = sr2sr[sr]
562
+ self.spec_channels = spec_channels
563
+ self.inter_channels = inter_channels
564
+ self.hidden_channels = hidden_channels
565
+ self.filter_channels = filter_channels
566
+ self.n_heads = n_heads
567
+ self.n_layers = n_layers
568
+ self.kernel_size = kernel_size
569
+ self.p_dropout = p_dropout
570
+ self.resblock = resblock
571
+ self.resblock_kernel_sizes = resblock_kernel_sizes
572
+ self.resblock_dilation_sizes = resblock_dilation_sizes
573
+ self.upsample_rates = upsample_rates
574
+ self.upsample_initial_channel = upsample_initial_channel
575
+ self.upsample_kernel_sizes = upsample_kernel_sizes
576
+ self.segment_size = segment_size
577
+ self.gin_channels = gin_channels
578
+ # self.hop_length = hop_length#
579
+ self.spk_embed_dim = spk_embed_dim
580
+ self.enc_p = TextEncoder256(
581
+ inter_channels,
582
+ hidden_channels,
583
+ filter_channels,
584
+ n_heads,
585
+ n_layers,
586
+ kernel_size,
587
+ p_dropout,
588
+ )
589
+ self.dec = GeneratorNSF(
590
+ inter_channels,
591
+ resblock,
592
+ resblock_kernel_sizes,
593
+ resblock_dilation_sizes,
594
+ upsample_rates,
595
+ upsample_initial_channel,
596
+ upsample_kernel_sizes,
597
+ gin_channels=gin_channels,
598
+ sr=sr,
599
+ is_half=kwargs["is_half"],
600
+ )
601
+ self.enc_q = PosteriorEncoder(
602
+ spec_channels,
603
+ inter_channels,
604
+ hidden_channels,
605
+ 5,
606
+ 1,
607
+ 16,
608
+ gin_channels=gin_channels,
609
+ )
610
+ self.flow = ResidualCouplingBlock(
611
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
612
+ )
613
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
614
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
615
+
616
+ def remove_weight_norm(self):
617
+ self.dec.remove_weight_norm()
618
+ self.flow.remove_weight_norm()
619
+ self.enc_q.remove_weight_norm()
620
+
621
+ def forward(
622
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
623
+ ): # ds is the speaker id, shape [bs, 1]
624
+ # print(1,pitch.shape)#[bs,t]
625
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
626
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
627
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
628
+ z_p = self.flow(z, y_mask, g=g)
629
+ z_slice, ids_slice = commons.rand_slice_segments(
630
+ z, y_lengths, self.segment_size
631
+ )
632
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
633
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
634
+ # print(-2,pitchf.shape,z_slice.shape)
635
+ o = self.dec(z_slice, pitchf, g=g)
636
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
637
+
638
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
639
+ g = self.emb_g(sid).unsqueeze(-1)
640
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
641
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
642
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
643
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
644
+ return o, x_mask, (z, z_p, m_p, logs_p)
645
+
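# A minimal usage sketch with hypothetical tensor names and shapes: at inference the prior is
# sampled at a 0.66666 temperature, run backwards through the flow and decoded by the NSF vocoder.
# phone: [1, T, 256] content features, pitch: [1, T] coarse pitch ids, nsff0: [1, T] f0 in Hz,
# sid: [1] speaker id, phone_lengths: [1].
audio, _, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)
# `net_g` is an instance of SynthesizerTrnMs256NSFsid; audio has shape
# [1, 1, T * prod(upsample_rates)].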
646
+
647
+ class SynthesizerTrnMs768NSFsid(nn.Module):
648
+ def __init__(
649
+ self,
650
+ spec_channels,
651
+ segment_size,
652
+ inter_channels,
653
+ hidden_channels,
654
+ filter_channels,
655
+ n_heads,
656
+ n_layers,
657
+ kernel_size,
658
+ p_dropout,
659
+ resblock,
660
+ resblock_kernel_sizes,
661
+ resblock_dilation_sizes,
662
+ upsample_rates,
663
+ upsample_initial_channel,
664
+ upsample_kernel_sizes,
665
+ spk_embed_dim,
666
+ gin_channels,
667
+ sr,
668
+ **kwargs
669
+ ):
670
+ super().__init__()
671
+ if type(sr) == type("strr"):
672
+ sr = sr2sr[sr]
673
+ self.spec_channels = spec_channels
674
+ self.inter_channels = inter_channels
675
+ self.hidden_channels = hidden_channels
676
+ self.filter_channels = filter_channels
677
+ self.n_heads = n_heads
678
+ self.n_layers = n_layers
679
+ self.kernel_size = kernel_size
680
+ self.p_dropout = p_dropout
681
+ self.resblock = resblock
682
+ self.resblock_kernel_sizes = resblock_kernel_sizes
683
+ self.resblock_dilation_sizes = resblock_dilation_sizes
684
+ self.upsample_rates = upsample_rates
685
+ self.upsample_initial_channel = upsample_initial_channel
686
+ self.upsample_kernel_sizes = upsample_kernel_sizes
687
+ self.segment_size = segment_size
688
+ self.gin_channels = gin_channels
689
+ # self.hop_length = hop_length#
690
+ self.spk_embed_dim = spk_embed_dim
691
+ self.enc_p = TextEncoder768(
692
+ inter_channels,
693
+ hidden_channels,
694
+ filter_channels,
695
+ n_heads,
696
+ n_layers,
697
+ kernel_size,
698
+ p_dropout,
699
+ )
700
+ self.dec = GeneratorNSF(
701
+ inter_channels,
702
+ resblock,
703
+ resblock_kernel_sizes,
704
+ resblock_dilation_sizes,
705
+ upsample_rates,
706
+ upsample_initial_channel,
707
+ upsample_kernel_sizes,
708
+ gin_channels=gin_channels,
709
+ sr=sr,
710
+ is_half=kwargs["is_half"],
711
+ )
712
+ self.enc_q = PosteriorEncoder(
713
+ spec_channels,
714
+ inter_channels,
715
+ hidden_channels,
716
+ 5,
717
+ 1,
718
+ 16,
719
+ gin_channels=gin_channels,
720
+ )
721
+ self.flow = ResidualCouplingBlock(
722
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
723
+ )
724
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
725
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
726
+
727
+ def remove_weight_norm(self):
728
+ self.dec.remove_weight_norm()
729
+ self.flow.remove_weight_norm()
730
+ self.enc_q.remove_weight_norm()
731
+
732
+ def forward(
733
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
734
+ ): # ds is the speaker id, shape [bs, 1]
735
+ # print(1,pitch.shape)#[bs,t]
736
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
737
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
738
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
739
+ z_p = self.flow(z, y_mask, g=g)
740
+ z_slice, ids_slice = commons.rand_slice_segments(
741
+ z, y_lengths, self.segment_size
742
+ )
743
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
744
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
745
+ # print(-2,pitchf.shape,z_slice.shape)
746
+ o = self.dec(z_slice, pitchf, g=g)
747
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
748
+
749
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
750
+ g = self.emb_g(sid).unsqueeze(-1)
751
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
752
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
753
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
754
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
755
+ return o, x_mask, (z, z_p, m_p, logs_p)
756
+
757
+
758
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
759
+ def __init__(
760
+ self,
761
+ spec_channels,
762
+ segment_size,
763
+ inter_channels,
764
+ hidden_channels,
765
+ filter_channels,
766
+ n_heads,
767
+ n_layers,
768
+ kernel_size,
769
+ p_dropout,
770
+ resblock,
771
+ resblock_kernel_sizes,
772
+ resblock_dilation_sizes,
773
+ upsample_rates,
774
+ upsample_initial_channel,
775
+ upsample_kernel_sizes,
776
+ spk_embed_dim,
777
+ gin_channels,
778
+ sr=None,
779
+ **kwargs
780
+ ):
781
+ super().__init__()
782
+ self.spec_channels = spec_channels
783
+ self.inter_channels = inter_channels
784
+ self.hidden_channels = hidden_channels
785
+ self.filter_channels = filter_channels
786
+ self.n_heads = n_heads
787
+ self.n_layers = n_layers
788
+ self.kernel_size = kernel_size
789
+ self.p_dropout = p_dropout
790
+ self.resblock = resblock
791
+ self.resblock_kernel_sizes = resblock_kernel_sizes
792
+ self.resblock_dilation_sizes = resblock_dilation_sizes
793
+ self.upsample_rates = upsample_rates
794
+ self.upsample_initial_channel = upsample_initial_channel
795
+ self.upsample_kernel_sizes = upsample_kernel_sizes
796
+ self.segment_size = segment_size
797
+ self.gin_channels = gin_channels
798
+ # self.hop_length = hop_length#
799
+ self.spk_embed_dim = spk_embed_dim
800
+ self.enc_p = TextEncoder256(
801
+ inter_channels,
802
+ hidden_channels,
803
+ filter_channels,
804
+ n_heads,
805
+ n_layers,
806
+ kernel_size,
807
+ p_dropout,
808
+ f0=False,
809
+ )
810
+ self.dec = Generator(
811
+ inter_channels,
812
+ resblock,
813
+ resblock_kernel_sizes,
814
+ resblock_dilation_sizes,
815
+ upsample_rates,
816
+ upsample_initial_channel,
817
+ upsample_kernel_sizes,
818
+ gin_channels=gin_channels,
819
+ )
820
+ self.enc_q = PosteriorEncoder(
821
+ spec_channels,
822
+ inter_channels,
823
+ hidden_channels,
824
+ 5,
825
+ 1,
826
+ 16,
827
+ gin_channels=gin_channels,
828
+ )
829
+ self.flow = ResidualCouplingBlock(
830
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
831
+ )
832
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
833
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
834
+
835
+ def remove_weight_norm(self):
836
+ self.dec.remove_weight_norm()
837
+ self.flow.remove_weight_norm()
838
+ self.enc_q.remove_weight_norm()
839
+
840
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # ds is the speaker id, shape [bs, 1]
841
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
842
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
843
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
844
+ z_p = self.flow(z, y_mask, g=g)
845
+ z_slice, ids_slice = commons.rand_slice_segments(
846
+ z, y_lengths, self.segment_size
847
+ )
848
+ o = self.dec(z_slice, g=g)
849
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
850
+
851
+ def infer(self, phone, phone_lengths, sid, max_len=None):
852
+ g = self.emb_g(sid).unsqueeze(-1)
853
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
854
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
855
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
856
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
857
+ return o, x_mask, (z, z_p, m_p, logs_p)
858
+
859
+
860
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
861
+ def __init__(
862
+ self,
863
+ spec_channels,
864
+ segment_size,
865
+ inter_channels,
866
+ hidden_channels,
867
+ filter_channels,
868
+ n_heads,
869
+ n_layers,
870
+ kernel_size,
871
+ p_dropout,
872
+ resblock,
873
+ resblock_kernel_sizes,
874
+ resblock_dilation_sizes,
875
+ upsample_rates,
876
+ upsample_initial_channel,
877
+ upsample_kernel_sizes,
878
+ spk_embed_dim,
879
+ gin_channels,
880
+ sr=None,
881
+ **kwargs
882
+ ):
883
+ super().__init__()
884
+ self.spec_channels = spec_channels
885
+ self.inter_channels = inter_channels
886
+ self.hidden_channels = hidden_channels
887
+ self.filter_channels = filter_channels
888
+ self.n_heads = n_heads
889
+ self.n_layers = n_layers
890
+ self.kernel_size = kernel_size
891
+ self.p_dropout = p_dropout
892
+ self.resblock = resblock
893
+ self.resblock_kernel_sizes = resblock_kernel_sizes
894
+ self.resblock_dilation_sizes = resblock_dilation_sizes
895
+ self.upsample_rates = upsample_rates
896
+ self.upsample_initial_channel = upsample_initial_channel
897
+ self.upsample_kernel_sizes = upsample_kernel_sizes
898
+ self.segment_size = segment_size
899
+ self.gin_channels = gin_channels
900
+ # self.hop_length = hop_length#
901
+ self.spk_embed_dim = spk_embed_dim
902
+ self.enc_p = TextEncoder768(
903
+ inter_channels,
904
+ hidden_channels,
905
+ filter_channels,
906
+ n_heads,
907
+ n_layers,
908
+ kernel_size,
909
+ p_dropout,
910
+ f0=False,
911
+ )
912
+ self.dec = Generator(
913
+ inter_channels,
914
+ resblock,
915
+ resblock_kernel_sizes,
916
+ resblock_dilation_sizes,
917
+ upsample_rates,
918
+ upsample_initial_channel,
919
+ upsample_kernel_sizes,
920
+ gin_channels=gin_channels,
921
+ )
922
+ self.enc_q = PosteriorEncoder(
923
+ spec_channels,
924
+ inter_channels,
925
+ hidden_channels,
926
+ 5,
927
+ 1,
928
+ 16,
929
+ gin_channels=gin_channels,
930
+ )
931
+ self.flow = ResidualCouplingBlock(
932
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
933
+ )
934
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
935
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
936
+
937
+ def remove_weight_norm(self):
938
+ self.dec.remove_weight_norm()
939
+ self.flow.remove_weight_norm()
940
+ self.enc_q.remove_weight_norm()
941
+
942
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # ds is the speaker id, shape [bs, 1]
943
+ g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]; the trailing 1 is t, broadcast later
944
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
945
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
946
+ z_p = self.flow(z, y_mask, g=g)
947
+ z_slice, ids_slice = commons.rand_slice_segments(
948
+ z, y_lengths, self.segment_size
949
+ )
950
+ o = self.dec(z_slice, g=g)
951
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
952
+
953
+ def infer(self, phone, phone_lengths, sid, max_len=None):
954
+ g = self.emb_g(sid).unsqueeze(-1)
955
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
956
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
957
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
958
+ o = self.dec((z * x_mask)[:, :, :max_len], g=g)
959
+ return o, x_mask, (z, z_p, m_p, logs_p)
960
+
961
+
962
+ class MultiPeriodDiscriminator(torch.nn.Module):
963
+ def __init__(self, use_spectral_norm=False):
964
+ super(MultiPeriodDiscriminator, self).__init__()
965
+ periods = [2, 3, 5, 7, 11, 17]
966
+ # periods = [3, 5, 7, 11, 17, 23, 37]
967
+
968
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
969
+ discs = discs + [
970
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
971
+ ]
972
+ self.discriminators = nn.ModuleList(discs)
973
+
974
+ def forward(self, y, y_hat):
975
+ y_d_rs = [] #
976
+ y_d_gs = []
977
+ fmap_rs = []
978
+ fmap_gs = []
979
+ for i, d in enumerate(self.discriminators):
980
+ y_d_r, fmap_r = d(y)
981
+ y_d_g, fmap_g = d(y_hat)
982
+ # for j in range(len(fmap_r)):
983
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
984
+ y_d_rs.append(y_d_r)
985
+ y_d_gs.append(y_d_g)
986
+ fmap_rs.append(fmap_r)
987
+ fmap_gs.append(fmap_g)
988
+
989
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
990
+
991
+
992
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
993
+ def __init__(self, use_spectral_norm=False):
994
+ super(MultiPeriodDiscriminatorV2, self).__init__()
995
+ # periods = [2, 3, 5, 7, 11, 17]
996
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
997
+
998
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
999
+ discs = discs + [
1000
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1001
+ ]
1002
+ self.discriminators = nn.ModuleList(discs)
1003
+
1004
+ def forward(self, y, y_hat):
1005
+ y_d_rs = [] #
1006
+ y_d_gs = []
1007
+ fmap_rs = []
1008
+ fmap_gs = []
1009
+ for i, d in enumerate(self.discriminators):
1010
+ y_d_r, fmap_r = d(y)
1011
+ y_d_g, fmap_g = d(y_hat)
1012
+ # for j in range(len(fmap_r)):
1013
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1014
+ y_d_rs.append(y_d_r)
1015
+ y_d_gs.append(y_d_g)
1016
+ fmap_rs.append(fmap_r)
1017
+ fmap_gs.append(fmap_g)
1018
+
1019
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1020
+
1021
+
1022
+ class DiscriminatorS(torch.nn.Module):
1023
+ def __init__(self, use_spectral_norm=False):
1024
+ super(DiscriminatorS, self).__init__()
1025
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1026
+ self.convs = nn.ModuleList(
1027
+ [
1028
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1029
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1030
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1031
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1032
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1033
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1034
+ ]
1035
+ )
1036
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1037
+
1038
+ def forward(self, x):
1039
+ fmap = []
1040
+
1041
+ for l in self.convs:
1042
+ x = l(x)
1043
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1044
+ fmap.append(x)
1045
+ x = self.conv_post(x)
1046
+ fmap.append(x)
1047
+ x = torch.flatten(x, 1, -1)
1048
+
1049
+ return x, fmap
1050
+
1051
+
1052
+ class DiscriminatorP(torch.nn.Module):
1053
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1054
+ super(DiscriminatorP, self).__init__()
1055
+ self.period = period
1056
+ self.use_spectral_norm = use_spectral_norm
1057
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1058
+ self.convs = nn.ModuleList(
1059
+ [
1060
+ norm_f(
1061
+ Conv2d(
1062
+ 1,
1063
+ 32,
1064
+ (kernel_size, 1),
1065
+ (stride, 1),
1066
+ padding=(get_padding(kernel_size, 1), 0),
1067
+ )
1068
+ ),
1069
+ norm_f(
1070
+ Conv2d(
1071
+ 32,
1072
+ 128,
1073
+ (kernel_size, 1),
1074
+ (stride, 1),
1075
+ padding=(get_padding(kernel_size, 1), 0),
1076
+ )
1077
+ ),
1078
+ norm_f(
1079
+ Conv2d(
1080
+ 128,
1081
+ 512,
1082
+ (kernel_size, 1),
1083
+ (stride, 1),
1084
+ padding=(get_padding(kernel_size, 1), 0),
1085
+ )
1086
+ ),
1087
+ norm_f(
1088
+ Conv2d(
1089
+ 512,
1090
+ 1024,
1091
+ (kernel_size, 1),
1092
+ (stride, 1),
1093
+ padding=(get_padding(kernel_size, 1), 0),
1094
+ )
1095
+ ),
1096
+ norm_f(
1097
+ Conv2d(
1098
+ 1024,
1099
+ 1024,
1100
+ (kernel_size, 1),
1101
+ 1,
1102
+ padding=(get_padding(kernel_size, 1), 0),
1103
+ )
1104
+ ),
1105
+ ]
1106
+ )
1107
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1108
+
1109
+ def forward(self, x):
1110
+ fmap = []
1111
+
1112
+ # 1d to 2d
1113
+ b, c, t = x.shape
1114
+ if t % self.period != 0: # pad first
1115
+ n_pad = self.period - (t % self.period)
1116
+ x = F.pad(x, (0, n_pad), "reflect")
1117
+ t = t + n_pad
1118
+ x = x.view(b, c, t // self.period, self.period)
1119
+
1120
+ for l in self.convs:
1121
+ x = l(x)
1122
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1123
+ fmap.append(x)
1124
+ x = self.conv_post(x)
1125
+ fmap.append(x)
1126
+ x = torch.flatten(x, 1, -1)
1127
+
1128
+ return x, fmap
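# A minimal sketch of the 1d-to-2d step in DiscriminatorP.forward above: the waveform is
# reflect-padded to a multiple of `period` and folded so samples that are `period` apart end up
# in the same column, which the (kernel_size, 1) Conv2d kernels then convolve down.
import torch
import torch.nn.functional as F
period, (b, c, t) = 3, (1, 1, 10)
x = torch.randn(b, c, t)
n_pad = period - (t % period)                      # 2 samples of reflect padding
x = F.pad(x, (0, n_pad), "reflect")
x = x.view(b, c, (t + n_pad) // period, period)    # shape (1, 1, 4, 3)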
src/vc/infer_pack/models_onnx.py ADDED
@@ -0,0 +1,822 @@
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from vc.infer_pack import modules
7
+ from vc.infer_pack import attentions
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vc.infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from vc.infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (
337
+ f0_buf / self.sampling_rate
338
+ ) % 1 # the % 1 means the n_har harmonic products cannot be optimized away afterwards
339
+ rand_ini = torch.rand(
340
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
341
+ )
342
+ rand_ini[:, 0] = 0
343
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
344
+ tmp_over_one = torch.cumsum(
345
+ rad_values, 1
346
+ ) # a % 1 here would keep the following cumsum from being optimized further
347
+ tmp_over_one *= upp
348
+ tmp_over_one = F.interpolate(
349
+ tmp_over_one.transpose(2, 1),
350
+ scale_factor=upp,
351
+ mode="linear",
352
+ align_corners=True,
353
+ ).transpose(2, 1)
354
+ rad_values = F.interpolate(
355
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
356
+ ).transpose(
357
+ 2, 1
358
+ ) #######
359
+ tmp_over_one %= 1
360
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
361
+ cumsum_shift = torch.zeros_like(rad_values)
362
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
363
+ sine_waves = torch.sin(
364
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
365
+ )
366
+ sine_waves = sine_waves * self.sine_amp
367
+ uv = self._f02uv(f0)
368
+ uv = F.interpolate(
369
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
370
+ ).transpose(2, 1)
371
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
372
+ noise = noise_amp * torch.randn_like(sine_waves)
373
+ sine_waves = sine_waves * uv + noise
374
+ return sine_waves, uv, noise
375
+
376
+
377
+ class SourceModuleHnNSF(torch.nn.Module):
378
+ """SourceModule for hn-nsf
379
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
380
+ add_noise_std=0.003, voiced_threshod=0)
381
+ sampling_rate: sampling_rate in Hz
382
+ harmonic_num: number of harmonic above F0 (default: 0)
383
+ sine_amp: amplitude of sine source signal (default: 0.1)
384
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
385
+ note that amplitude of noise in unvoiced is decided
386
+ by sine_amp
387
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
388
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
389
+ F0_sampled (batchsize, length, 1)
390
+ Sine_source (batchsize, length, 1)
391
+ noise_source (batchsize, length, 1)
392
+ uv (batchsize, length, 1)
393
+ """
394
+
395
+ def __init__(
396
+ self,
397
+ sampling_rate,
398
+ harmonic_num=0,
399
+ sine_amp=0.1,
400
+ add_noise_std=0.003,
401
+ voiced_threshod=0,
402
+ is_half=True,
403
+ ):
404
+ super(SourceModuleHnNSF, self).__init__()
405
+
406
+ self.sine_amp = sine_amp
407
+ self.noise_std = add_noise_std
408
+ self.is_half = is_half
409
+ # to produce sine waveforms
410
+ self.l_sin_gen = SineGen(
411
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
412
+ )
413
+
414
+ # to merge source harmonics into a single excitation
415
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
416
+ self.l_tanh = torch.nn.Tanh()
417
+
418
+ def forward(self, x, upp=None):
419
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
420
+ if self.is_half:
421
+ sine_wavs = sine_wavs.half()
422
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
423
+ return sine_merge, None, None # noise, uv
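A minimal usage sketch of the source module above (illustrative only, not part of the uploaded file). It assumes the repository's `src` directory is on the import path; the pitch contour and upsampling factor are made-up example values.

```python
import torch

from vc.infer_pack.models_onnx import SourceModuleHnNSF  # module added by this commit

# 100 F0 frames at a constant 220 Hz for a batch of one utterance.
f0 = torch.full((1, 100), 220.0)

# upp is the frame-to-sample upsampling factor (hop length); 400 is an example value.
m_source = SourceModuleHnNSF(sampling_rate=40000, harmonic_num=0, is_half=False)
sine_merge, _, _ = m_source(f0, upp=400)

print(sine_merge.shape)  # torch.Size([1, 40000, 1]): merged harmonic excitation signal
```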
424
+
425
+
426
+ class GeneratorNSF(torch.nn.Module):
427
+ def __init__(
428
+ self,
429
+ initial_channel,
430
+ resblock,
431
+ resblock_kernel_sizes,
432
+ resblock_dilation_sizes,
433
+ upsample_rates,
434
+ upsample_initial_channel,
435
+ upsample_kernel_sizes,
436
+ gin_channels,
437
+ sr,
438
+ is_half=False,
439
+ ):
440
+ super(GeneratorNSF, self).__init__()
441
+ self.num_kernels = len(resblock_kernel_sizes)
442
+ self.num_upsamples = len(upsample_rates)
443
+
444
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
445
+ self.m_source = SourceModuleHnNSF(
446
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
447
+ )
448
+ self.noise_convs = nn.ModuleList()
449
+ self.conv_pre = Conv1d(
450
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
451
+ )
452
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
453
+
454
+ self.ups = nn.ModuleList()
455
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
456
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
457
+ self.ups.append(
458
+ weight_norm(
459
+ ConvTranspose1d(
460
+ upsample_initial_channel // (2**i),
461
+ upsample_initial_channel // (2 ** (i + 1)),
462
+ k,
463
+ u,
464
+ padding=(k - u) // 2,
465
+ )
466
+ )
467
+ )
468
+ if i + 1 < len(upsample_rates):
469
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
470
+ self.noise_convs.append(
471
+ Conv1d(
472
+ 1,
473
+ c_cur,
474
+ kernel_size=stride_f0 * 2,
475
+ stride=stride_f0,
476
+ padding=stride_f0 // 2,
477
+ )
478
+ )
479
+ else:
480
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
481
+
482
+ self.resblocks = nn.ModuleList()
483
+ for i in range(len(self.ups)):
484
+ ch = upsample_initial_channel // (2 ** (i + 1))
485
+ for j, (k, d) in enumerate(
486
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
487
+ ):
488
+ self.resblocks.append(resblock(ch, k, d))
489
+
490
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
491
+ self.ups.apply(init_weights)
492
+
493
+ if gin_channels != 0:
494
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
495
+
496
+ self.upp = np.prod(upsample_rates)
497
+
498
+ def forward(self, x, f0, g=None):
499
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
500
+ har_source = har_source.transpose(1, 2)
501
+ x = self.conv_pre(x)
502
+ if g is not None:
503
+ x = x + self.cond(g)
504
+
505
+ for i in range(self.num_upsamples):
506
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
507
+ x = self.ups[i](x)
508
+ x_source = self.noise_convs[i](har_source)
509
+ x = x + x_source
510
+ xs = None
511
+ for j in range(self.num_kernels):
512
+ if xs is None:
513
+ xs = self.resblocks[i * self.num_kernels + j](x)
514
+ else:
515
+ xs += self.resblocks[i * self.num_kernels + j](x)
516
+ x = xs / self.num_kernels
517
+ x = F.leaky_relu(x)
518
+ x = self.conv_post(x)
519
+ x = torch.tanh(x)
520
+ return x
521
+
522
+ def remove_weight_norm(self):
523
+ for l in self.ups:
524
+ remove_weight_norm(l)
525
+ for l in self.resblocks:
526
+ l.remove_weight_norm()
527
+
528
+
529
+ sr2sr = {
530
+ "32k": 32000,
531
+ "40k": 40000,
532
+ "48k": 48000,
533
+ }
534
+
535
+
536
+ class SynthesizerTrnMsNSFsidM(nn.Module):
537
+ def __init__(
538
+ self,
539
+ spec_channels,
540
+ segment_size,
541
+ inter_channels,
542
+ hidden_channels,
543
+ filter_channels,
544
+ n_heads,
545
+ n_layers,
546
+ kernel_size,
547
+ p_dropout,
548
+ resblock,
549
+ resblock_kernel_sizes,
550
+ resblock_dilation_sizes,
551
+ upsample_rates,
552
+ upsample_initial_channel,
553
+ upsample_kernel_sizes,
554
+ spk_embed_dim,
555
+ gin_channels,
556
+ sr,
557
+ **kwargs
558
+ ):
559
+ super().__init__()
560
+ if type(sr) == type("strr"):
561
+ sr = sr2sr[sr]
562
+ self.spec_channels = spec_channels
563
+ self.inter_channels = inter_channels
564
+ self.hidden_channels = hidden_channels
565
+ self.filter_channels = filter_channels
566
+ self.n_heads = n_heads
567
+ self.n_layers = n_layers
568
+ self.kernel_size = kernel_size
569
+ self.p_dropout = p_dropout
570
+ self.resblock = resblock
571
+ self.resblock_kernel_sizes = resblock_kernel_sizes
572
+ self.resblock_dilation_sizes = resblock_dilation_sizes
573
+ self.upsample_rates = upsample_rates
574
+ self.upsample_initial_channel = upsample_initial_channel
575
+ self.upsample_kernel_sizes = upsample_kernel_sizes
576
+ self.segment_size = segment_size
577
+ self.gin_channels = gin_channels
578
+ # self.hop_length = hop_length#
579
+ self.spk_embed_dim = spk_embed_dim
580
+ if self.gin_channels == 256:
581
+ self.enc_p = TextEncoder256(
582
+ inter_channels,
583
+ hidden_channels,
584
+ filter_channels,
585
+ n_heads,
586
+ n_layers,
587
+ kernel_size,
588
+ p_dropout,
589
+ )
590
+ else:
591
+ self.enc_p = TextEncoder768(
592
+ inter_channels,
593
+ hidden_channels,
594
+ filter_channels,
595
+ n_heads,
596
+ n_layers,
597
+ kernel_size,
598
+ p_dropout,
599
+ )
600
+ self.dec = GeneratorNSF(
601
+ inter_channels,
602
+ resblock,
603
+ resblock_kernel_sizes,
604
+ resblock_dilation_sizes,
605
+ upsample_rates,
606
+ upsample_initial_channel,
607
+ upsample_kernel_sizes,
608
+ gin_channels=gin_channels,
609
+ sr=sr,
610
+ is_half=kwargs["is_half"],
611
+ )
612
+ self.enc_q = PosteriorEncoder(
613
+ spec_channels,
614
+ inter_channels,
615
+ hidden_channels,
616
+ 5,
617
+ 1,
618
+ 16,
619
+ gin_channels=gin_channels,
620
+ )
621
+ self.flow = ResidualCouplingBlock(
622
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
623
+ )
624
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
625
+ self.speaker_map = None
626
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
627
+
628
+ def remove_weight_norm(self):
629
+ self.dec.remove_weight_norm()
630
+ self.flow.remove_weight_norm()
631
+ self.enc_q.remove_weight_norm()
632
+
633
+ def construct_spkmixmap(self, n_speaker):
634
+ self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
635
+ for i in range(n_speaker):
636
+ self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
637
+ self.speaker_map = self.speaker_map.unsqueeze(0)
638
+
639
+ def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
640
+ if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
641
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
642
+ g = g * self.speaker_map # [N, S, B, 1, H]
643
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
644
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
645
+ else:
646
+ g = g.unsqueeze(0)
647
+ g = self.emb_g(g).transpose(1, 2)
648
+
649
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
650
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
651
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
652
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
653
+ return o
654
+
655
+
656
+ class MultiPeriodDiscriminator(torch.nn.Module):
657
+ def __init__(self, use_spectral_norm=False):
658
+ super(MultiPeriodDiscriminator, self).__init__()
659
+ periods = [2, 3, 5, 7, 11, 17]
660
+ # periods = [3, 5, 7, 11, 17, 23, 37]
661
+
662
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
663
+ discs = discs + [
664
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
665
+ ]
666
+ self.discriminators = nn.ModuleList(discs)
667
+
668
+ def forward(self, y, y_hat):
669
+ y_d_rs = [] #
670
+ y_d_gs = []
671
+ fmap_rs = []
672
+ fmap_gs = []
673
+ for i, d in enumerate(self.discriminators):
674
+ y_d_r, fmap_r = d(y)
675
+ y_d_g, fmap_g = d(y_hat)
676
+ # for j in range(len(fmap_r)):
677
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
678
+ y_d_rs.append(y_d_r)
679
+ y_d_gs.append(y_d_g)
680
+ fmap_rs.append(fmap_r)
681
+ fmap_gs.append(fmap_g)
682
+
683
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
684
+
685
+
686
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
687
+ def __init__(self, use_spectral_norm=False):
688
+ super(MultiPeriodDiscriminatorV2, self).__init__()
689
+ # periods = [2, 3, 5, 7, 11, 17]
690
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
691
+
692
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
693
+ discs = discs + [
694
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
695
+ ]
696
+ self.discriminators = nn.ModuleList(discs)
697
+
698
+ def forward(self, y, y_hat):
699
+ y_d_rs = [] #
700
+ y_d_gs = []
701
+ fmap_rs = []
702
+ fmap_gs = []
703
+ for i, d in enumerate(self.discriminators):
704
+ y_d_r, fmap_r = d(y)
705
+ y_d_g, fmap_g = d(y_hat)
706
+ # for j in range(len(fmap_r)):
707
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
708
+ y_d_rs.append(y_d_r)
709
+ y_d_gs.append(y_d_g)
710
+ fmap_rs.append(fmap_r)
711
+ fmap_gs.append(fmap_g)
712
+
713
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
714
+
715
+
716
+ class DiscriminatorS(torch.nn.Module):
717
+ def __init__(self, use_spectral_norm=False):
718
+ super(DiscriminatorS, self).__init__()
719
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
720
+ self.convs = nn.ModuleList(
721
+ [
722
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
723
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
724
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
725
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
726
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
727
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
728
+ ]
729
+ )
730
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
731
+
732
+ def forward(self, x):
733
+ fmap = []
734
+
735
+ for l in self.convs:
736
+ x = l(x)
737
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
738
+ fmap.append(x)
739
+ x = self.conv_post(x)
740
+ fmap.append(x)
741
+ x = torch.flatten(x, 1, -1)
742
+
743
+ return x, fmap
744
+
745
+
746
+ class DiscriminatorP(torch.nn.Module):
747
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
748
+ super(DiscriminatorP, self).__init__()
749
+ self.period = period
750
+ self.use_spectral_norm = use_spectral_norm
751
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
752
+ self.convs = nn.ModuleList(
753
+ [
754
+ norm_f(
755
+ Conv2d(
756
+ 1,
757
+ 32,
758
+ (kernel_size, 1),
759
+ (stride, 1),
760
+ padding=(get_padding(kernel_size, 1), 0),
761
+ )
762
+ ),
763
+ norm_f(
764
+ Conv2d(
765
+ 32,
766
+ 128,
767
+ (kernel_size, 1),
768
+ (stride, 1),
769
+ padding=(get_padding(kernel_size, 1), 0),
770
+ )
771
+ ),
772
+ norm_f(
773
+ Conv2d(
774
+ 128,
775
+ 512,
776
+ (kernel_size, 1),
777
+ (stride, 1),
778
+ padding=(get_padding(kernel_size, 1), 0),
779
+ )
780
+ ),
781
+ norm_f(
782
+ Conv2d(
783
+ 512,
784
+ 1024,
785
+ (kernel_size, 1),
786
+ (stride, 1),
787
+ padding=(get_padding(kernel_size, 1), 0),
788
+ )
789
+ ),
790
+ norm_f(
791
+ Conv2d(
792
+ 1024,
793
+ 1024,
794
+ (kernel_size, 1),
795
+ 1,
796
+ padding=(get_padding(kernel_size, 1), 0),
797
+ )
798
+ ),
799
+ ]
800
+ )
801
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
802
+
803
+ def forward(self, x):
804
+ fmap = []
805
+
806
+ # 1d to 2d
807
+ b, c, t = x.shape
808
+ if t % self.period != 0: # pad first
809
+ n_pad = self.period - (t % self.period)
810
+ x = F.pad(x, (0, n_pad), "reflect")
811
+ t = t + n_pad
812
+ x = x.view(b, c, t // self.period, self.period)
813
+
814
+ for l in self.convs:
815
+ x = l(x)
816
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
817
+ fmap.append(x)
818
+ x = self.conv_post(x)
819
+ fmap.append(x)
820
+ x = torch.flatten(x, 1, -1)
821
+
822
+ return x, fmap
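Illustrative sketch (not part of the uploaded file) of the 1-D-to-2-D reshape that `DiscriminatorP.forward` above performs, so that its `Conv2d` layers compare samples that are exactly `period` steps apart.

```python
import torch
import torch.nn.functional as F

period = 3
x = torch.arange(10.0).view(1, 1, 10)      # (batch, channels, time)

# Right-pad with reflection so the length becomes a multiple of the period...
n_pad = period - (x.shape[-1] % period)    # 10 % 3 = 1, so pad 2 samples
x = F.pad(x, (0, n_pad), "reflect")

# ...then fold time into a (time // period, period) grid.
x = x.view(1, 1, -1, period)
print(x.shape)                             # torch.Size([1, 1, 4, 3])
```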
src/vc/infer_pack/models_onnx_moess.py ADDED
@@ -0,0 +1,853 @@
1
+ import math, pdb, os
2
+ from time import time as ttime
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from vc.infer_pack import modules
7
+ from vc.infer_pack import attentions
8
+ from vc.infer_pack import commons
9
+ from vc.infer_pack.commons import init_weights, get_padding
10
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+ from vc.infer_pack.commons import init_weights
13
+ import numpy as np
14
+ from vc.infer_pack import commons
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch == None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder256Sim(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(256, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch == None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ x = self.proj(x) * x_mask
106
+ return x, x_mask
107
+
108
+
109
+ class ResidualCouplingBlock(nn.Module):
110
+ def __init__(
111
+ self,
112
+ channels,
113
+ hidden_channels,
114
+ kernel_size,
115
+ dilation_rate,
116
+ n_layers,
117
+ n_flows=4,
118
+ gin_channels=0,
119
+ ):
120
+ super().__init__()
121
+ self.channels = channels
122
+ self.hidden_channels = hidden_channels
123
+ self.kernel_size = kernel_size
124
+ self.dilation_rate = dilation_rate
125
+ self.n_layers = n_layers
126
+ self.n_flows = n_flows
127
+ self.gin_channels = gin_channels
128
+
129
+ self.flows = nn.ModuleList()
130
+ for i in range(n_flows):
131
+ self.flows.append(
132
+ modules.ResidualCouplingLayer(
133
+ channels,
134
+ hidden_channels,
135
+ kernel_size,
136
+ dilation_rate,
137
+ n_layers,
138
+ gin_channels=gin_channels,
139
+ mean_only=True,
140
+ )
141
+ )
142
+ self.flows.append(modules.Flip())
143
+
144
+ def forward(self, x, x_mask, g=None, reverse=False):
145
+ if not reverse:
146
+ for flow in self.flows:
147
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
148
+ else:
149
+ for flow in reversed(self.flows):
150
+ x = flow(x, x_mask, g=g, reverse=reverse)
151
+ return x
152
+
153
+ def remove_weight_norm(self):
154
+ for i in range(self.n_flows):
155
+ self.flows[i * 2].remove_weight_norm()
156
+
157
+
158
+ class PosteriorEncoder(nn.Module):
159
+ def __init__(
160
+ self,
161
+ in_channels,
162
+ out_channels,
163
+ hidden_channels,
164
+ kernel_size,
165
+ dilation_rate,
166
+ n_layers,
167
+ gin_channels=0,
168
+ ):
169
+ super().__init__()
170
+ self.in_channels = in_channels
171
+ self.out_channels = out_channels
172
+ self.hidden_channels = hidden_channels
173
+ self.kernel_size = kernel_size
174
+ self.dilation_rate = dilation_rate
175
+ self.n_layers = n_layers
176
+ self.gin_channels = gin_channels
177
+
178
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
179
+ self.enc = modules.WN(
180
+ hidden_channels,
181
+ kernel_size,
182
+ dilation_rate,
183
+ n_layers,
184
+ gin_channels=gin_channels,
185
+ )
186
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
187
+
188
+ def forward(self, x, x_lengths, g=None):
189
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
190
+ x.dtype
191
+ )
192
+ x = self.pre(x) * x_mask
193
+ x = self.enc(x, x_mask, g=g)
194
+ stats = self.proj(x) * x_mask
195
+ m, logs = torch.split(stats, self.out_channels, dim=1)
196
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
197
+ return z, m, logs, x_mask
198
+
199
+ def remove_weight_norm(self):
200
+ self.enc.remove_weight_norm()
201
+
202
+
203
+ class Generator(torch.nn.Module):
204
+ def __init__(
205
+ self,
206
+ initial_channel,
207
+ resblock,
208
+ resblock_kernel_sizes,
209
+ resblock_dilation_sizes,
210
+ upsample_rates,
211
+ upsample_initial_channel,
212
+ upsample_kernel_sizes,
213
+ gin_channels=0,
214
+ ):
215
+ super(Generator, self).__init__()
216
+ self.num_kernels = len(resblock_kernel_sizes)
217
+ self.num_upsamples = len(upsample_rates)
218
+ self.conv_pre = Conv1d(
219
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
220
+ )
221
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
222
+
223
+ self.ups = nn.ModuleList()
224
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
225
+ self.ups.append(
226
+ weight_norm(
227
+ ConvTranspose1d(
228
+ upsample_initial_channel // (2**i),
229
+ upsample_initial_channel // (2 ** (i + 1)),
230
+ k,
231
+ u,
232
+ padding=(k - u) // 2,
233
+ )
234
+ )
235
+ )
236
+
237
+ self.resblocks = nn.ModuleList()
238
+ for i in range(len(self.ups)):
239
+ ch = upsample_initial_channel // (2 ** (i + 1))
240
+ for j, (k, d) in enumerate(
241
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
242
+ ):
243
+ self.resblocks.append(resblock(ch, k, d))
244
+
245
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
246
+ self.ups.apply(init_weights)
247
+
248
+ if gin_channels != 0:
249
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
250
+
251
+ def forward(self, x, g=None):
252
+ x = self.conv_pre(x)
253
+ if g is not None:
254
+ x = x + self.cond(g)
255
+
256
+ for i in range(self.num_upsamples):
257
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
258
+ x = self.ups[i](x)
259
+ xs = None
260
+ for j in range(self.num_kernels):
261
+ if xs is None:
262
+ xs = self.resblocks[i * self.num_kernels + j](x)
263
+ else:
264
+ xs += self.resblocks[i * self.num_kernels + j](x)
265
+ x = xs / self.num_kernels
266
+ x = F.leaky_relu(x)
267
+ x = self.conv_post(x)
268
+ x = torch.tanh(x)
269
+
270
+ return x
271
+
272
+ def remove_weight_norm(self):
273
+ for l in self.ups:
274
+ remove_weight_norm(l)
275
+ for l in self.resblocks:
276
+ l.remove_weight_norm()
277
+
278
+
279
+ class SineGen(torch.nn.Module):
280
+ """Definition of sine generator
281
+ SineGen(samp_rate, harmonic_num = 0,
282
+ sine_amp = 0.1, noise_std = 0.003,
283
+ voiced_threshold = 0,
284
+ flag_for_pulse=False)
285
+ samp_rate: sampling rate in Hz
286
+ harmonic_num: number of harmonic overtones (default 0)
287
+ sine_amp: amplitude of sine-waveform (default 0.1)
288
+ noise_std: std of Gaussian noise (default 0.003)
289
+ voiced_threshold: F0 threshold for U/V classification (default 0)
290
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
291
+ Note: when flag_for_pulse is True, the first time step of a voiced
292
+ segment is always sin(np.pi) or cos(0)
293
+ """
294
+
295
+ def __init__(
296
+ self,
297
+ samp_rate,
298
+ harmonic_num=0,
299
+ sine_amp=0.1,
300
+ noise_std=0.003,
301
+ voiced_threshold=0,
302
+ flag_for_pulse=False,
303
+ ):
304
+ super(SineGen, self).__init__()
305
+ self.sine_amp = sine_amp
306
+ self.noise_std = noise_std
307
+ self.harmonic_num = harmonic_num
308
+ self.dim = self.harmonic_num + 1
309
+ self.sampling_rate = samp_rate
310
+ self.voiced_threshold = voiced_threshold
311
+
312
+ def _f02uv(self, f0):
313
+ # generate uv signal
314
+ uv = torch.ones_like(f0)
315
+ uv = uv * (f0 > self.voiced_threshold)
316
+ return uv
317
+
318
+ def forward(self, f0, upp):
319
+ """sine_tensor, uv = forward(f0)
320
+ input F0: tensor(batchsize=1, length, dim=1)
321
+ f0 for unvoiced steps should be 0
322
+ output sine_tensor: tensor(batchsize=1, length, dim)
323
+ output uv: tensor(batchsize=1, length, 1)
324
+ """
325
+ with torch.no_grad():
326
+ f0 = f0[:, None].transpose(1, 2)
327
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
328
+ # fundamental component
329
+ f0_buf[:, :, 0] = f0[:, :, 0]
330
+ for idx in np.arange(self.harmonic_num):
331
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
332
+ idx + 2
333
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
334
+ rad_values = (
335
+ f0_buf / self.sampling_rate
336
+ ) % 1 # the %1 means the n_har (harmonic) products cannot be optimized away in a post-processing step
337
+ rand_ini = torch.rand(
338
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
+ )
340
+ rand_ini[:, 0] = 0
341
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
+ tmp_over_one = torch.cumsum(
343
+ rad_values, 1
344
+ ) # % 1 # taking %1 here would mean the cumsum below could no longer be optimized
345
+ tmp_over_one *= upp
346
+ tmp_over_one = F.interpolate(
347
+ tmp_over_one.transpose(2, 1),
348
+ scale_factor=upp,
349
+ mode="linear",
350
+ align_corners=True,
351
+ ).transpose(2, 1)
352
+ rad_values = F.interpolate(
353
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
354
+ ).transpose(
355
+ 2, 1
356
+ ) #######
357
+ tmp_over_one %= 1
358
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
359
+ cumsum_shift = torch.zeros_like(rad_values)
360
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
361
+ sine_waves = torch.sin(
362
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
363
+ )
364
+ sine_waves = sine_waves * self.sine_amp
365
+ uv = self._f02uv(f0)
366
+ uv = F.interpolate(
367
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
368
+ ).transpose(2, 1)
369
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
370
+ noise = noise_amp * torch.randn_like(sine_waves)
371
+ sine_waves = sine_waves * uv + noise
372
+ return sine_waves, uv, noise
373
+
374
+
375
+ class SourceModuleHnNSF(torch.nn.Module):
376
+ """SourceModule for hn-nsf
377
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
378
+ add_noise_std=0.003, voiced_threshod=0)
379
+ sampling_rate: sampling_rate in Hz
380
+ harmonic_num: number of harmonic above F0 (default: 0)
381
+ sine_amp: amplitude of sine source signal (default: 0.1)
382
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
383
+ note that amplitude of noise in unvoiced is decided
384
+ by sine_amp
385
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
386
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
387
+ F0_sampled (batchsize, length, 1)
388
+ Sine_source (batchsize, length, 1)
389
+ noise_source (batchsize, length, 1)
390
+ uv (batchsize, length, 1)
391
+ """
392
+
393
+ def __init__(
394
+ self,
395
+ sampling_rate,
396
+ harmonic_num=0,
397
+ sine_amp=0.1,
398
+ add_noise_std=0.003,
399
+ voiced_threshod=0,
400
+ is_half=True,
401
+ ):
402
+ super(SourceModuleHnNSF, self).__init__()
403
+
404
+ self.sine_amp = sine_amp
405
+ self.noise_std = add_noise_std
406
+ self.is_half = is_half
407
+ # to produce sine waveforms
408
+ self.l_sin_gen = SineGen(
409
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
410
+ )
411
+
412
+ # to merge source harmonics into a single excitation
413
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
414
+ self.l_tanh = torch.nn.Tanh()
415
+
416
+ def forward(self, x, upp=None):
417
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
418
+ if self.is_half:
419
+ sine_wavs = sine_wavs.half()
420
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
421
+ return sine_merge, None, None # noise, uv
422
+
423
+
424
+ class GeneratorNSF(torch.nn.Module):
425
+ def __init__(
426
+ self,
427
+ initial_channel,
428
+ resblock,
429
+ resblock_kernel_sizes,
430
+ resblock_dilation_sizes,
431
+ upsample_rates,
432
+ upsample_initial_channel,
433
+ upsample_kernel_sizes,
434
+ gin_channels,
435
+ sr,
436
+ is_half=False,
437
+ ):
438
+ super(GeneratorNSF, self).__init__()
439
+ self.num_kernels = len(resblock_kernel_sizes)
440
+ self.num_upsamples = len(upsample_rates)
441
+
442
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
443
+ self.m_source = SourceModuleHnNSF(
444
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
445
+ )
446
+ self.noise_convs = nn.ModuleList()
447
+ self.conv_pre = Conv1d(
448
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
449
+ )
450
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
451
+
452
+ self.ups = nn.ModuleList()
453
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
454
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
455
+ self.ups.append(
456
+ weight_norm(
457
+ ConvTranspose1d(
458
+ upsample_initial_channel // (2**i),
459
+ upsample_initial_channel // (2 ** (i + 1)),
460
+ k,
461
+ u,
462
+ padding=(k - u) // 2,
463
+ )
464
+ )
465
+ )
466
+ if i + 1 < len(upsample_rates):
467
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
468
+ self.noise_convs.append(
469
+ Conv1d(
470
+ 1,
471
+ c_cur,
472
+ kernel_size=stride_f0 * 2,
473
+ stride=stride_f0,
474
+ padding=stride_f0 // 2,
475
+ )
476
+ )
477
+ else:
478
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
479
+
480
+ self.resblocks = nn.ModuleList()
481
+ for i in range(len(self.ups)):
482
+ ch = upsample_initial_channel // (2 ** (i + 1))
483
+ for j, (k, d) in enumerate(
484
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
485
+ ):
486
+ self.resblocks.append(resblock(ch, k, d))
487
+
488
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
489
+ self.ups.apply(init_weights)
490
+
491
+ if gin_channels != 0:
492
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
493
+
494
+ self.upp = np.prod(upsample_rates)
495
+
496
+ def forward(self, x, f0, g=None):
497
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
498
+ har_source = har_source.transpose(1, 2)
499
+ x = self.conv_pre(x)
500
+ if g is not None:
501
+ x = x + self.cond(g)
502
+
503
+ for i in range(self.num_upsamples):
504
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
505
+ x = self.ups[i](x)
506
+ x_source = self.noise_convs[i](har_source)
507
+ x = x + x_source
508
+ xs = None
509
+ for j in range(self.num_kernels):
510
+ if xs is None:
511
+ xs = self.resblocks[i * self.num_kernels + j](x)
512
+ else:
513
+ xs += self.resblocks[i * self.num_kernels + j](x)
514
+ x = xs / self.num_kernels
515
+ x = F.leaky_relu(x)
516
+ x = self.conv_post(x)
517
+ x = torch.tanh(x)
518
+ return x
519
+
520
+ def remove_weight_norm(self):
521
+ for l in self.ups:
522
+ remove_weight_norm(l)
523
+ for l in self.resblocks:
524
+ l.remove_weight_norm()
525
+
526
+
527
+ sr2sr = {
528
+ "32k": 32000,
529
+ "40k": 40000,
530
+ "48k": 48000,
531
+ }
532
+
533
+
534
+ class SynthesizerTrnMs256NSFsidM(nn.Module):
535
+ def __init__(
536
+ self,
537
+ spec_channels,
538
+ segment_size,
539
+ inter_channels,
540
+ hidden_channels,
541
+ filter_channels,
542
+ n_heads,
543
+ n_layers,
544
+ kernel_size,
545
+ p_dropout,
546
+ resblock,
547
+ resblock_kernel_sizes,
548
+ resblock_dilation_sizes,
549
+ upsample_rates,
550
+ upsample_initial_channel,
551
+ upsample_kernel_sizes,
552
+ spk_embed_dim,
553
+ gin_channels,
554
+ sr,
555
+ **kwargs
556
+ ):
557
+ super().__init__()
558
+ if type(sr) == type("strr"):
559
+ sr = sr2sr[sr]
560
+ self.spec_channels = spec_channels
561
+ self.inter_channels = inter_channels
562
+ self.hidden_channels = hidden_channels
563
+ self.filter_channels = filter_channels
564
+ self.n_heads = n_heads
565
+ self.n_layers = n_layers
566
+ self.kernel_size = kernel_size
567
+ self.p_dropout = p_dropout
568
+ self.resblock = resblock
569
+ self.resblock_kernel_sizes = resblock_kernel_sizes
570
+ self.resblock_dilation_sizes = resblock_dilation_sizes
571
+ self.upsample_rates = upsample_rates
572
+ self.upsample_initial_channel = upsample_initial_channel
573
+ self.upsample_kernel_sizes = upsample_kernel_sizes
574
+ self.segment_size = segment_size
575
+ self.gin_channels = gin_channels
576
+ # self.hop_length = hop_length#
577
+ self.spk_embed_dim = spk_embed_dim
578
+ self.enc_p = TextEncoder256(
579
+ inter_channels,
580
+ hidden_channels,
581
+ filter_channels,
582
+ n_heads,
583
+ n_layers,
584
+ kernel_size,
585
+ p_dropout,
586
+ )
587
+ self.dec = GeneratorNSF(
588
+ inter_channels,
589
+ resblock,
590
+ resblock_kernel_sizes,
591
+ resblock_dilation_sizes,
592
+ upsample_rates,
593
+ upsample_initial_channel,
594
+ upsample_kernel_sizes,
595
+ gin_channels=gin_channels,
596
+ sr=sr,
597
+ is_half=kwargs["is_half"],
598
+ )
599
+ self.enc_q = PosteriorEncoder(
600
+ spec_channels,
601
+ inter_channels,
602
+ hidden_channels,
603
+ 5,
604
+ 1,
605
+ 16,
606
+ gin_channels=gin_channels,
607
+ )
608
+ self.flow = ResidualCouplingBlock(
609
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
610
+ )
611
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
612
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
613
+
614
+ def remove_weight_norm(self):
615
+ self.dec.remove_weight_norm()
616
+ self.flow.remove_weight_norm()
617
+ self.enc_q.remove_weight_norm()
618
+
619
+ def forward(self, phone, phone_lengths, pitch, nsff0, sid, rnd, max_len=None):
620
+ g = self.emb_g(sid).unsqueeze(-1)
621
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
622
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
623
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
624
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
625
+ return o
626
+
627
+
628
+ class SynthesizerTrnMs256NSFsid_sim(nn.Module):
629
+ """
630
+ Synthesizer for Training
631
+ """
632
+
633
+ def __init__(
634
+ self,
635
+ spec_channels,
636
+ segment_size,
637
+ inter_channels,
638
+ hidden_channels,
639
+ filter_channels,
640
+ n_heads,
641
+ n_layers,
642
+ kernel_size,
643
+ p_dropout,
644
+ resblock,
645
+ resblock_kernel_sizes,
646
+ resblock_dilation_sizes,
647
+ upsample_rates,
648
+ upsample_initial_channel,
649
+ upsample_kernel_sizes,
650
+ spk_embed_dim,
651
+ # hop_length,
652
+ gin_channels=0,
653
+ use_sdp=True,
654
+ **kwargs
655
+ ):
656
+ super().__init__()
657
+ self.spec_channels = spec_channels
658
+ self.inter_channels = inter_channels
659
+ self.hidden_channels = hidden_channels
660
+ self.filter_channels = filter_channels
661
+ self.n_heads = n_heads
662
+ self.n_layers = n_layers
663
+ self.kernel_size = kernel_size
664
+ self.p_dropout = p_dropout
665
+ self.resblock = resblock
666
+ self.resblock_kernel_sizes = resblock_kernel_sizes
667
+ self.resblock_dilation_sizes = resblock_dilation_sizes
668
+ self.upsample_rates = upsample_rates
669
+ self.upsample_initial_channel = upsample_initial_channel
670
+ self.upsample_kernel_sizes = upsample_kernel_sizes
671
+ self.segment_size = segment_size
672
+ self.gin_channels = gin_channels
673
+ # self.hop_length = hop_length#
674
+ self.spk_embed_dim = spk_embed_dim
675
+ self.enc_p = TextEncoder256Sim(
676
+ inter_channels,
677
+ hidden_channels,
678
+ filter_channels,
679
+ n_heads,
680
+ n_layers,
681
+ kernel_size,
682
+ p_dropout,
683
+ )
684
+ self.dec = GeneratorNSF(
685
+ inter_channels,
686
+ resblock,
687
+ resblock_kernel_sizes,
688
+ resblock_dilation_sizes,
689
+ upsample_rates,
690
+ upsample_initial_channel,
691
+ upsample_kernel_sizes,
692
+ gin_channels=gin_channels,
693
+ is_half=kwargs["is_half"],
694
+ )
695
+
696
+ self.flow = ResidualCouplingBlock(
697
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
698
+ )
699
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
700
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
701
+
702
+ def remove_weight_norm(self):
703
+ self.dec.remove_weight_norm()
704
+ self.flow.remove_weight_norm()
705
+ self.enc_q.remove_weight_norm()
706
+
707
+ def forward(
708
+ self, phone, phone_lengths, pitch, pitchf, ds, max_len=None
709
+ ): # y (the spectrogram) is no longer needed here
710
+ g = self.emb_g(ds.unsqueeze(0)).unsqueeze(-1) # [b, 256, 1] # the trailing 1 is t (time), broadcast
711
+ x, x_mask = self.enc_p(phone, pitch, phone_lengths)
712
+ x = self.flow(x, x_mask, g=g, reverse=True)
713
+ o = self.dec((x * x_mask)[:, :, :max_len], pitchf, g=g)
714
+ return o
715
+
716
+
717
+ class MultiPeriodDiscriminator(torch.nn.Module):
718
+ def __init__(self, use_spectral_norm=False):
719
+ super(MultiPeriodDiscriminator, self).__init__()
720
+ periods = [2, 3, 5, 7, 11, 17]
721
+ # periods = [3, 5, 7, 11, 17, 23, 37]
722
+
723
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
724
+ discs = discs + [
725
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
726
+ ]
727
+ self.discriminators = nn.ModuleList(discs)
728
+
729
+ def forward(self, y, y_hat):
730
+ y_d_rs = [] #
731
+ y_d_gs = []
732
+ fmap_rs = []
733
+ fmap_gs = []
734
+ for i, d in enumerate(self.discriminators):
735
+ y_d_r, fmap_r = d(y)
736
+ y_d_g, fmap_g = d(y_hat)
737
+ # for j in range(len(fmap_r)):
738
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
739
+ y_d_rs.append(y_d_r)
740
+ y_d_gs.append(y_d_g)
741
+ fmap_rs.append(fmap_r)
742
+ fmap_gs.append(fmap_g)
743
+
744
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
745
+
746
+
747
+ class DiscriminatorS(torch.nn.Module):
748
+ def __init__(self, use_spectral_norm=False):
749
+ super(DiscriminatorS, self).__init__()
750
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
751
+ self.convs = nn.ModuleList(
752
+ [
753
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
754
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
755
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
756
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
757
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
758
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
759
+ ]
760
+ )
761
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
762
+
763
+ def forward(self, x):
764
+ fmap = []
765
+
766
+ for l in self.convs:
767
+ x = l(x)
768
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
769
+ fmap.append(x)
770
+ x = self.conv_post(x)
771
+ fmap.append(x)
772
+ x = torch.flatten(x, 1, -1)
773
+
774
+ return x, fmap
775
+
776
+
777
+ class DiscriminatorP(torch.nn.Module):
778
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
779
+ super(DiscriminatorP, self).__init__()
780
+ self.period = period
781
+ self.use_spectral_norm = use_spectral_norm
782
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
783
+ self.convs = nn.ModuleList(
784
+ [
785
+ norm_f(
786
+ Conv2d(
787
+ 1,
788
+ 32,
789
+ (kernel_size, 1),
790
+ (stride, 1),
791
+ padding=(get_padding(kernel_size, 1), 0),
792
+ )
793
+ ),
794
+ norm_f(
795
+ Conv2d(
796
+ 32,
797
+ 128,
798
+ (kernel_size, 1),
799
+ (stride, 1),
800
+ padding=(get_padding(kernel_size, 1), 0),
801
+ )
802
+ ),
803
+ norm_f(
804
+ Conv2d(
805
+ 128,
806
+ 512,
807
+ (kernel_size, 1),
808
+ (stride, 1),
809
+ padding=(get_padding(kernel_size, 1), 0),
810
+ )
811
+ ),
812
+ norm_f(
813
+ Conv2d(
814
+ 512,
815
+ 1024,
816
+ (kernel_size, 1),
817
+ (stride, 1),
818
+ padding=(get_padding(kernel_size, 1), 0),
819
+ )
820
+ ),
821
+ norm_f(
822
+ Conv2d(
823
+ 1024,
824
+ 1024,
825
+ (kernel_size, 1),
826
+ 1,
827
+ padding=(get_padding(kernel_size, 1), 0),
828
+ )
829
+ ),
830
+ ]
831
+ )
832
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
833
+
834
+ def forward(self, x):
835
+ fmap = []
836
+
837
+ # 1d to 2d
838
+ b, c, t = x.shape
839
+ if t % self.period != 0: # pad first
840
+ n_pad = self.period - (t % self.period)
841
+ x = F.pad(x, (0, n_pad), "reflect")
842
+ t = t + n_pad
843
+ x = x.view(b, c, t // self.period, self.period)
844
+
845
+ for l in self.convs:
846
+ x = l(x)
847
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
848
+ fmap.append(x)
849
+ x = self.conv_post(x)
850
+ fmap.append(x)
851
+ x = torch.flatten(x, 1, -1)
852
+
853
+ return x, fmap
src/vc/infer_pack/modules.py ADDED
@@ -0,0 +1,522 @@
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ from vc.infer_pack import commons
13
+ from vc.infer_pack.commons import init_weights, get_padding
14
+ from vc.infer_pack.transforms import piecewise_rational_quadratic_transform
15
+
16
+
17
+ LRELU_SLOPE = 0.1
18
+
19
+
20
+ class LayerNorm(nn.Module):
21
+ def __init__(self, channels, eps=1e-5):
22
+ super().__init__()
23
+ self.channels = channels
24
+ self.eps = eps
25
+
26
+ self.gamma = nn.Parameter(torch.ones(channels))
27
+ self.beta = nn.Parameter(torch.zeros(channels))
28
+
29
+ def forward(self, x):
30
+ x = x.transpose(1, -1)
31
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
+ return x.transpose(1, -1)
33
+
34
+
35
+ class ConvReluNorm(nn.Module):
36
+ def __init__(
37
+ self,
38
+ in_channels,
39
+ hidden_channels,
40
+ out_channels,
41
+ kernel_size,
42
+ n_layers,
43
+ p_dropout,
44
+ ):
45
+ super().__init__()
46
+ self.in_channels = in_channels
47
+ self.hidden_channels = hidden_channels
48
+ self.out_channels = out_channels
49
+ self.kernel_size = kernel_size
50
+ self.n_layers = n_layers
51
+ self.p_dropout = p_dropout
52
+ assert n_layers > 1, "Number of layers should be larger than 0."
53
+
54
+ self.conv_layers = nn.ModuleList()
55
+ self.norm_layers = nn.ModuleList()
56
+ self.conv_layers.append(
57
+ nn.Conv1d(
58
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59
+ )
60
+ )
61
+ self.norm_layers.append(LayerNorm(hidden_channels))
62
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
63
+ for _ in range(n_layers - 1):
64
+ self.conv_layers.append(
65
+ nn.Conv1d(
66
+ hidden_channels,
67
+ hidden_channels,
68
+ kernel_size,
69
+ padding=kernel_size // 2,
70
+ )
71
+ )
72
+ self.norm_layers.append(LayerNorm(hidden_channels))
73
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74
+ self.proj.weight.data.zero_()
75
+ self.proj.bias.data.zero_()
76
+
77
+ def forward(self, x, x_mask):
78
+ x_org = x
79
+ for i in range(self.n_layers):
80
+ x = self.conv_layers[i](x * x_mask)
81
+ x = self.norm_layers[i](x)
82
+ x = self.relu_drop(x)
83
+ x = x_org + self.proj(x)
84
+ return x * x_mask
85
+
86
+
87
+ class DDSConv(nn.Module):
88
+ """
89
+ Dialted and Depth-Separable Convolution
90
+ """
91
+
92
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93
+ super().__init__()
94
+ self.channels = channels
95
+ self.kernel_size = kernel_size
96
+ self.n_layers = n_layers
97
+ self.p_dropout = p_dropout
98
+
99
+ self.drop = nn.Dropout(p_dropout)
100
+ self.convs_sep = nn.ModuleList()
101
+ self.convs_1x1 = nn.ModuleList()
102
+ self.norms_1 = nn.ModuleList()
103
+ self.norms_2 = nn.ModuleList()
104
+ for i in range(n_layers):
105
+ dilation = kernel_size**i
106
+ padding = (kernel_size * dilation - dilation) // 2
107
+ self.convs_sep.append(
108
+ nn.Conv1d(
109
+ channels,
110
+ channels,
111
+ kernel_size,
112
+ groups=channels,
113
+ dilation=dilation,
114
+ padding=padding,
115
+ )
116
+ )
117
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118
+ self.norms_1.append(LayerNorm(channels))
119
+ self.norms_2.append(LayerNorm(channels))
120
+
121
+ def forward(self, x, x_mask, g=None):
122
+ if g is not None:
123
+ x = x + g
124
+ for i in range(self.n_layers):
125
+ y = self.convs_sep[i](x * x_mask)
126
+ y = self.norms_1[i](y)
127
+ y = F.gelu(y)
128
+ y = self.convs_1x1[i](y)
129
+ y = self.norms_2[i](y)
130
+ y = F.gelu(y)
131
+ y = self.drop(y)
132
+ x = x + y
133
+ return x * x_mask
134
+
135
+
136
+ class WN(torch.nn.Module):
137
+ def __init__(
138
+ self,
139
+ hidden_channels,
140
+ kernel_size,
141
+ dilation_rate,
142
+ n_layers,
143
+ gin_channels=0,
144
+ p_dropout=0,
145
+ ):
146
+ super(WN, self).__init__()
147
+ assert kernel_size % 2 == 1
148
+ self.hidden_channels = hidden_channels
149
+ self.kernel_size = (kernel_size,)
150
+ self.dilation_rate = dilation_rate
151
+ self.n_layers = n_layers
152
+ self.gin_channels = gin_channels
153
+ self.p_dropout = p_dropout
154
+
155
+ self.in_layers = torch.nn.ModuleList()
156
+ self.res_skip_layers = torch.nn.ModuleList()
157
+ self.drop = nn.Dropout(p_dropout)
158
+
159
+ if gin_channels != 0:
160
+ cond_layer = torch.nn.Conv1d(
161
+ gin_channels, 2 * hidden_channels * n_layers, 1
162
+ )
163
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164
+
165
+ for i in range(n_layers):
166
+ dilation = dilation_rate**i
167
+ padding = int((kernel_size * dilation - dilation) / 2)
168
+ in_layer = torch.nn.Conv1d(
169
+ hidden_channels,
170
+ 2 * hidden_channels,
171
+ kernel_size,
172
+ dilation=dilation,
173
+ padding=padding,
174
+ )
175
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176
+ self.in_layers.append(in_layer)
177
+
178
+ # last one is not necessary
179
+ if i < n_layers - 1:
180
+ res_skip_channels = 2 * hidden_channels
181
+ else:
182
+ res_skip_channels = hidden_channels
183
+
184
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186
+ self.res_skip_layers.append(res_skip_layer)
187
+
188
+ def forward(self, x, x_mask, g=None, **kwargs):
189
+ output = torch.zeros_like(x)
190
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
191
+
192
+ if g is not None:
193
+ g = self.cond_layer(g)
194
+
195
+ for i in range(self.n_layers):
196
+ x_in = self.in_layers[i](x)
197
+ if g is not None:
198
+ cond_offset = i * 2 * self.hidden_channels
199
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
200
+ else:
201
+ g_l = torch.zeros_like(x_in)
202
+
203
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
204
+ acts = self.drop(acts)
205
+
206
+ res_skip_acts = self.res_skip_layers[i](acts)
207
+ if i < self.n_layers - 1:
208
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
209
+ x = (x + res_acts) * x_mask
210
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
211
+ else:
212
+ output = output + res_skip_acts
213
+ return output * x_mask
214
+
215
+ def remove_weight_norm(self):
216
+ if self.gin_channels != 0:
217
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
218
+ for l in self.in_layers:
219
+ torch.nn.utils.remove_weight_norm(l)
220
+ for l in self.res_skip_layers:
221
+ torch.nn.utils.remove_weight_norm(l)
222
+
223
+
224
+ class ResBlock1(torch.nn.Module):
225
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
226
+ super(ResBlock1, self).__init__()
227
+ self.convs1 = nn.ModuleList(
228
+ [
229
+ weight_norm(
230
+ Conv1d(
231
+ channels,
232
+ channels,
233
+ kernel_size,
234
+ 1,
235
+ dilation=dilation[0],
236
+ padding=get_padding(kernel_size, dilation[0]),
237
+ )
238
+ ),
239
+ weight_norm(
240
+ Conv1d(
241
+ channels,
242
+ channels,
243
+ kernel_size,
244
+ 1,
245
+ dilation=dilation[1],
246
+ padding=get_padding(kernel_size, dilation[1]),
247
+ )
248
+ ),
249
+ weight_norm(
250
+ Conv1d(
251
+ channels,
252
+ channels,
253
+ kernel_size,
254
+ 1,
255
+ dilation=dilation[2],
256
+ padding=get_padding(kernel_size, dilation[2]),
257
+ )
258
+ ),
259
+ ]
260
+ )
261
+ self.convs1.apply(init_weights)
262
+
263
+ self.convs2 = nn.ModuleList(
264
+ [
265
+ weight_norm(
266
+ Conv1d(
267
+ channels,
268
+ channels,
269
+ kernel_size,
270
+ 1,
271
+ dilation=1,
272
+ padding=get_padding(kernel_size, 1),
273
+ )
274
+ ),
275
+ weight_norm(
276
+ Conv1d(
277
+ channels,
278
+ channels,
279
+ kernel_size,
280
+ 1,
281
+ dilation=1,
282
+ padding=get_padding(kernel_size, 1),
283
+ )
284
+ ),
285
+ weight_norm(
286
+ Conv1d(
287
+ channels,
288
+ channels,
289
+ kernel_size,
290
+ 1,
291
+ dilation=1,
292
+ padding=get_padding(kernel_size, 1),
293
+ )
294
+ ),
295
+ ]
296
+ )
297
+ self.convs2.apply(init_weights)
298
+
299
+ def forward(self, x, x_mask=None):
300
+ for c1, c2 in zip(self.convs1, self.convs2):
301
+ xt = F.leaky_relu(x, LRELU_SLOPE)
302
+ if x_mask is not None:
303
+ xt = xt * x_mask
304
+ xt = c1(xt)
305
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
306
+ if x_mask is not None:
307
+ xt = xt * x_mask
308
+ xt = c2(xt)
309
+ x = xt + x
310
+ if x_mask is not None:
311
+ x = x * x_mask
312
+ return x
313
+
314
+ def remove_weight_norm(self):
315
+ for l in self.convs1:
316
+ remove_weight_norm(l)
317
+ for l in self.convs2:
318
+ remove_weight_norm(l)
319
+
320
+
321
+ class ResBlock2(torch.nn.Module):
322
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
323
+ super(ResBlock2, self).__init__()
324
+ self.convs = nn.ModuleList(
325
+ [
326
+ weight_norm(
327
+ Conv1d(
328
+ channels,
329
+ channels,
330
+ kernel_size,
331
+ 1,
332
+ dilation=dilation[0],
333
+ padding=get_padding(kernel_size, dilation[0]),
334
+ )
335
+ ),
336
+ weight_norm(
337
+ Conv1d(
338
+ channels,
339
+ channels,
340
+ kernel_size,
341
+ 1,
342
+ dilation=dilation[1],
343
+ padding=get_padding(kernel_size, dilation[1]),
344
+ )
345
+ ),
346
+ ]
347
+ )
348
+ self.convs.apply(init_weights)
349
+
350
+ def forward(self, x, x_mask=None):
351
+ for c in self.convs:
352
+ xt = F.leaky_relu(x, LRELU_SLOPE)
353
+ if x_mask is not None:
354
+ xt = xt * x_mask
355
+ xt = c(xt)
356
+ x = xt + x
357
+ if x_mask is not None:
358
+ x = x * x_mask
359
+ return x
360
+
361
+ def remove_weight_norm(self):
362
+ for l in self.convs:
363
+ remove_weight_norm(l)
364
+
365
+
366
+ class Log(nn.Module):
367
+ def forward(self, x, x_mask, reverse=False, **kwargs):
368
+ if not reverse:
369
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
370
+ logdet = torch.sum(-y, [1, 2])
371
+ return y, logdet
372
+ else:
373
+ x = torch.exp(x) * x_mask
374
+ return x
375
+
376
+
377
+ class Flip(nn.Module):
378
+ def forward(self, x, *args, reverse=False, **kwargs):
379
+ x = torch.flip(x, [1])
380
+ if not reverse:
381
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
382
+ return x, logdet
383
+ else:
384
+ return x
385
+
386
+
387
+ class ElementwiseAffine(nn.Module):
388
+ def __init__(self, channels):
389
+ super().__init__()
390
+ self.channels = channels
391
+ self.m = nn.Parameter(torch.zeros(channels, 1))
392
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
393
+
394
+ def forward(self, x, x_mask, reverse=False, **kwargs):
395
+ if not reverse:
396
+ y = self.m + torch.exp(self.logs) * x
397
+ y = y * x_mask
398
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
399
+ return y, logdet
400
+ else:
401
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
402
+ return x
403
+
404
+
405
+ class ResidualCouplingLayer(nn.Module):
406
+ def __init__(
407
+ self,
408
+ channels,
409
+ hidden_channels,
410
+ kernel_size,
411
+ dilation_rate,
412
+ n_layers,
413
+ p_dropout=0,
414
+ gin_channels=0,
415
+ mean_only=False,
416
+ ):
417
+ assert channels % 2 == 0, "channels should be divisible by 2"
418
+ super().__init__()
419
+ self.channels = channels
420
+ self.hidden_channels = hidden_channels
421
+ self.kernel_size = kernel_size
422
+ self.dilation_rate = dilation_rate
423
+ self.n_layers = n_layers
424
+ self.half_channels = channels // 2
425
+ self.mean_only = mean_only
426
+
427
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
428
+ self.enc = WN(
429
+ hidden_channels,
430
+ kernel_size,
431
+ dilation_rate,
432
+ n_layers,
433
+ p_dropout=p_dropout,
434
+ gin_channels=gin_channels,
435
+ )
436
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
437
+ self.post.weight.data.zero_()
438
+ self.post.bias.data.zero_()
439
+
440
+ def forward(self, x, x_mask, g=None, reverse=False):
441
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
442
+ h = self.pre(x0) * x_mask
443
+ h = self.enc(h, x_mask, g=g)
444
+ stats = self.post(h) * x_mask
445
+ if not self.mean_only:
446
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
447
+ else:
448
+ m = stats
449
+ logs = torch.zeros_like(m)
450
+
451
+ if not reverse:
452
+ x1 = m + x1 * torch.exp(logs) * x_mask
453
+ x = torch.cat([x0, x1], 1)
454
+ logdet = torch.sum(logs, [1, 2])
455
+ return x, logdet
456
+ else:
457
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
458
+ x = torch.cat([x0, x1], 1)
459
+ return x
460
+
461
+ def remove_weight_norm(self):
462
+ self.enc.remove_weight_norm()
463
+
464
+
465
+ class ConvFlow(nn.Module):
466
+ def __init__(
467
+ self,
468
+ in_channels,
469
+ filter_channels,
470
+ kernel_size,
471
+ n_layers,
472
+ num_bins=10,
473
+ tail_bound=5.0,
474
+ ):
475
+ super().__init__()
476
+ self.in_channels = in_channels
477
+ self.filter_channels = filter_channels
478
+ self.kernel_size = kernel_size
479
+ self.n_layers = n_layers
480
+ self.num_bins = num_bins
481
+ self.tail_bound = tail_bound
482
+ self.half_channels = in_channels // 2
483
+
484
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
485
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
486
+ self.proj = nn.Conv1d(
487
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
488
+ )
489
+ self.proj.weight.data.zero_()
490
+ self.proj.bias.data.zero_()
491
+
492
+ def forward(self, x, x_mask, g=None, reverse=False):
493
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
494
+ h = self.pre(x0)
495
+ h = self.convs(h, x_mask, g=g)
496
+ h = self.proj(h) * x_mask
497
+
498
+ b, c, t = x0.shape
499
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
500
+
501
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
502
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
503
+ self.filter_channels
504
+ )
505
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
506
+
507
+ x1, logabsdet = piecewise_rational_quadratic_transform(
508
+ x1,
509
+ unnormalized_widths,
510
+ unnormalized_heights,
511
+ unnormalized_derivatives,
512
+ inverse=reverse,
513
+ tails="linear",
514
+ tail_bound=self.tail_bound,
515
+ )
516
+
517
+ x = torch.cat([x0, x1], 1) * x_mask
518
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
519
+ if not reverse:
520
+ return x, logdet
521
+ else:
522
+ return x
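A small sketch (illustrative only, not part of the uploaded file) showing the invertibility of the affine coupling layer defined above. It assumes the repository's `src` directory is on the import path; the layer sizes and tensor shapes are made-up example values.

```python
import torch

from vc.infer_pack.modules import ResidualCouplingLayer  # module added by this commit

# The second half of the channels is affinely transformed, conditioned on the
# first half, so reverse=True undoes the forward pass wherever x_mask is 1.
layer = ResidualCouplingLayer(
    channels=4, hidden_channels=8, kernel_size=3, dilation_rate=1, n_layers=2
)
x = torch.randn(1, 4, 50)
x_mask = torch.ones(1, 1, 50)

y, _ = layer(x, x_mask)                     # forward: returns (output, log-determinant)
x_rec = layer(y, x_mask, reverse=True)      # inverse: recovers the input
print(torch.allclose(x, x_rec, atol=1e-5))  # True
```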
src/vc/infer_pack/transforms.py ADDED
@@ -0,0 +1,209 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+
7
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
8
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
+ DEFAULT_MIN_DERIVATIVE = 1e-3
10
+
11
+
12
+ def piecewise_rational_quadratic_transform(
13
+ inputs,
14
+ unnormalized_widths,
15
+ unnormalized_heights,
16
+ unnormalized_derivatives,
17
+ inverse=False,
18
+ tails=None,
19
+ tail_bound=1.0,
20
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
23
+ ):
24
+ if tails is None:
25
+ spline_fn = rational_quadratic_spline
26
+ spline_kwargs = {}
27
+ else:
28
+ spline_fn = unconstrained_rational_quadratic_spline
29
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30
+
31
+ outputs, logabsdet = spline_fn(
32
+ inputs=inputs,
33
+ unnormalized_widths=unnormalized_widths,
34
+ unnormalized_heights=unnormalized_heights,
35
+ unnormalized_derivatives=unnormalized_derivatives,
36
+ inverse=inverse,
37
+ min_bin_width=min_bin_width,
38
+ min_bin_height=min_bin_height,
39
+ min_derivative=min_derivative,
40
+ **spline_kwargs
41
+ )
42
+ return outputs, logabsdet
43
+
44
+
45
+ def searchsorted(bin_locations, inputs, eps=1e-6):
46
+ bin_locations[..., -1] += eps
47
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
+
49
+
50
+ def unconstrained_rational_quadratic_spline(
51
+ inputs,
52
+ unnormalized_widths,
53
+ unnormalized_heights,
54
+ unnormalized_derivatives,
55
+ inverse=False,
56
+ tails="linear",
57
+ tail_bound=1.0,
58
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
61
+ ):
62
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63
+ outside_interval_mask = ~inside_interval_mask
64
+
65
+ outputs = torch.zeros_like(inputs)
66
+ logabsdet = torch.zeros_like(inputs)
67
+
68
+ if tails == "linear":
69
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70
+ constant = np.log(np.exp(1 - min_derivative) - 1)
71
+ unnormalized_derivatives[..., 0] = constant
72
+ unnormalized_derivatives[..., -1] = constant
73
+
74
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
75
+ logabsdet[outside_interval_mask] = 0
76
+ else:
77
+ raise RuntimeError("{} tails are not implemented.".format(tails))
78
+
79
+ (
80
+ outputs[inside_interval_mask],
81
+ logabsdet[inside_interval_mask],
82
+ ) = rational_quadratic_spline(
83
+ inputs=inputs[inside_interval_mask],
84
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87
+ inverse=inverse,
88
+ left=-tail_bound,
89
+ right=tail_bound,
90
+ bottom=-tail_bound,
91
+ top=tail_bound,
92
+ min_bin_width=min_bin_width,
93
+ min_bin_height=min_bin_height,
94
+ min_derivative=min_derivative,
95
+ )
96
+
97
+ return outputs, logabsdet
98
+
99
+
100
+ def rational_quadratic_spline(
101
+ inputs,
102
+ unnormalized_widths,
103
+ unnormalized_heights,
104
+ unnormalized_derivatives,
105
+ inverse=False,
106
+ left=0.0,
107
+ right=1.0,
108
+ bottom=0.0,
109
+ top=1.0,
110
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
111
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
112
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
113
+ ):
114
+ if torch.min(inputs) < left or torch.max(inputs) > right:
115
+ raise ValueError("Input to a transform is not within its domain")
116
+
117
+ num_bins = unnormalized_widths.shape[-1]
118
+
119
+ if min_bin_width * num_bins > 1.0:
120
+ raise ValueError("Minimal bin width too large for the number of bins")
121
+ if min_bin_height * num_bins > 1.0:
122
+ raise ValueError("Minimal bin height too large for the number of bins")
123
+
124
+ widths = F.softmax(unnormalized_widths, dim=-1)
125
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
126
+ cumwidths = torch.cumsum(widths, dim=-1)
127
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
128
+ cumwidths = (right - left) * cumwidths + left
129
+ cumwidths[..., 0] = left
130
+ cumwidths[..., -1] = right
131
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
132
+
133
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
134
+
135
+ heights = F.softmax(unnormalized_heights, dim=-1)
136
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
137
+ cumheights = torch.cumsum(heights, dim=-1)
138
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
139
+ cumheights = (top - bottom) * cumheights + bottom
140
+ cumheights[..., 0] = bottom
141
+ cumheights[..., -1] = top
142
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
143
+
144
+ if inverse:
145
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
146
+ else:
147
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
148
+
149
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
150
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
151
+
152
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
153
+ delta = heights / widths
154
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
155
+
156
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
157
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
158
+
159
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
160
+
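+ # Inverting the rational-quadratic map reduces to a per-element quadratic
+ # a*root^2 + b*root + c = 0; the root in [0, 1] is taken via the numerically
+ # stable form 2c / (-b - sqrt(b^2 - 4ac)).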
161
+ if inverse:
162
+ a = (inputs - input_cumheights) * (
163
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
164
+ ) + input_heights * (input_delta - input_derivatives)
165
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
166
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
167
+ )
168
+ c = -input_delta * (inputs - input_cumheights)
169
+
170
+ discriminant = b.pow(2) - 4 * a * c
171
+ assert (discriminant >= 0).all()
172
+
173
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
174
+ outputs = root * input_bin_widths + input_cumwidths
175
+
176
+ theta_one_minus_theta = root * (1 - root)
177
+ denominator = input_delta + (
178
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
179
+ * theta_one_minus_theta
180
+ )
181
+ derivative_numerator = input_delta.pow(2) * (
182
+ input_derivatives_plus_one * root.pow(2)
183
+ + 2 * input_delta * theta_one_minus_theta
184
+ + input_derivatives * (1 - root).pow(2)
185
+ )
186
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
187
+
188
+ return outputs, -logabsdet
189
+ else:
190
+ theta = (inputs - input_cumwidths) / input_bin_widths
191
+ theta_one_minus_theta = theta * (1 - theta)
192
+
193
+ numerator = input_heights * (
194
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
195
+ )
196
+ denominator = input_delta + (
197
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
198
+ * theta_one_minus_theta
199
+ )
200
+ outputs = input_cumheights + numerator / denominator
201
+
202
+ derivative_numerator = input_delta.pow(2) * (
203
+ input_derivatives_plus_one * theta.pow(2)
204
+ + 2 * input_delta * theta_one_minus_theta
205
+ + input_derivatives * (1 - theta).pow(2)
206
+ )
207
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
208
+
209
+ return outputs, logabsdet
src/vc/my_utils.py ADDED
@@ -0,0 +1,21 @@
1
+ import ffmpeg
2
+ import numpy as np
3
+
4
+
5
+ def load_audio(file, sr):
6
+ try:
7
+ # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
8
+ # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
9
+ # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
10
+ file = (
11
+ file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
12
+ )  # strip stray spaces, quotes and newlines that users often copy along with the path
13
+ out, _ = (
14
+ ffmpeg.input(file, threads=0)
15
+ .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
16
+ .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
17
+ )
18
+ except Exception as e:
19
+ raise RuntimeError(f"Failed to load audio: {e}")
20
+
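+ # ffmpeg wrote raw 32-bit float PCM to stdout; reinterpret it as a mono waveform
+ # at the requested sample rate.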
21
+ return np.frombuffer(out, np.float32).flatten()
src/vc/rmvpe.py ADDED
@@ -0,0 +1,409 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from librosa.filters import mel
6
+
7
+
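+ # N_MELS/N_CLASS are referenced by the no-GRU branch of E2E below but were not defined
+ # in this file; the values here match the 128 mel bins and 360 pitch bins used elsewhere
+ # in this module.
+ N_MELS = 128
+ N_CLASS = 360
+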
8
+ class BiGRU(nn.Module):
9
+ def __init__(self, input_features, hidden_features, num_layers):
10
+ super(BiGRU, self).__init__()
11
+ self.gru = nn.GRU(
12
+ input_features,
13
+ hidden_features,
14
+ num_layers=num_layers,
15
+ batch_first=True,
16
+ bidirectional=True,
17
+ )
18
+
19
+ def forward(self, x):
20
+ return self.gru(x)[0]
21
+
22
+
23
+ class ConvBlockRes(nn.Module):
24
+ def __init__(self, in_channels, out_channels, momentum=0.01):
25
+ super(ConvBlockRes, self).__init__()
26
+ self.conv = nn.Sequential(
27
+ nn.Conv2d(
28
+ in_channels=in_channels,
29
+ out_channels=out_channels,
30
+ kernel_size=(3, 3),
31
+ stride=(1, 1),
32
+ padding=(1, 1),
33
+ bias=False,
34
+ ),
35
+ nn.BatchNorm2d(out_channels, momentum=momentum),
36
+ nn.ReLU(),
37
+ nn.Conv2d(
38
+ in_channels=out_channels,
39
+ out_channels=out_channels,
40
+ kernel_size=(3, 3),
41
+ stride=(1, 1),
42
+ padding=(1, 1),
43
+ bias=False,
44
+ ),
45
+ nn.BatchNorm2d(out_channels, momentum=momentum),
46
+ nn.ReLU(),
47
+ )
48
+ if in_channels != out_channels:
49
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
50
+ self.is_shortcut = True
51
+ else:
52
+ self.is_shortcut = False
53
+
54
+ def forward(self, x):
55
+ if self.is_shortcut:
56
+ return self.conv(x) + self.shortcut(x)
57
+ else:
58
+ return self.conv(x) + x
59
+
60
+
61
+ class Encoder(nn.Module):
62
+ def __init__(
63
+ self,
64
+ in_channels,
65
+ in_size,
66
+ n_encoders,
67
+ kernel_size,
68
+ n_blocks,
69
+ out_channels=16,
70
+ momentum=0.01,
71
+ ):
72
+ super(Encoder, self).__init__()
73
+ self.n_encoders = n_encoders
74
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
75
+ self.layers = nn.ModuleList()
76
+ self.latent_channels = []
77
+ for i in range(self.n_encoders):
78
+ self.layers.append(
79
+ ResEncoderBlock(
80
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
81
+ )
82
+ )
83
+ self.latent_channels.append([out_channels, in_size])
84
+ in_channels = out_channels
85
+ out_channels *= 2
86
+ in_size //= 2
87
+ self.out_size = in_size
88
+ self.out_channel = out_channels
89
+
90
+ def forward(self, x):
91
+ concat_tensors = []
92
+ x = self.bn(x)
93
+ for i in range(self.n_encoders):
94
+ _, x = self.layers[i](x)
95
+ concat_tensors.append(_)
96
+ return x, concat_tensors
97
+
98
+
99
+ class ResEncoderBlock(nn.Module):
100
+ def __init__(
101
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
102
+ ):
103
+ super(ResEncoderBlock, self).__init__()
104
+ self.n_blocks = n_blocks
105
+ self.conv = nn.ModuleList()
106
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
107
+ for i in range(n_blocks - 1):
108
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
109
+ self.kernel_size = kernel_size
110
+ if self.kernel_size is not None:
111
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
112
+
113
+ def forward(self, x):
114
+ for i in range(self.n_blocks):
115
+ x = self.conv[i](x)
116
+ if self.kernel_size is not None:
117
+ return x, self.pool(x)
118
+ else:
119
+ return x
120
+
121
+
122
+ class Intermediate(nn.Module):
123
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
124
+ super(Intermediate, self).__init__()
125
+ self.n_inters = n_inters
126
+ self.layers = nn.ModuleList()
127
+ self.layers.append(
128
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
129
+ )
130
+ for i in range(self.n_inters - 1):
131
+ self.layers.append(
132
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
133
+ )
134
+
135
+ def forward(self, x):
136
+ for i in range(self.n_inters):
137
+ x = self.layers[i](x)
138
+ return x
139
+
140
+
141
+ class ResDecoderBlock(nn.Module):
142
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
143
+ super(ResDecoderBlock, self).__init__()
144
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
145
+ self.n_blocks = n_blocks
146
+ self.conv1 = nn.Sequential(
147
+ nn.ConvTranspose2d(
148
+ in_channels=in_channels,
149
+ out_channels=out_channels,
150
+ kernel_size=(3, 3),
151
+ stride=stride,
152
+ padding=(1, 1),
153
+ output_padding=out_padding,
154
+ bias=False,
155
+ ),
156
+ nn.BatchNorm2d(out_channels, momentum=momentum),
157
+ nn.ReLU(),
158
+ )
159
+ self.conv2 = nn.ModuleList()
160
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
161
+ for i in range(n_blocks - 1):
162
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
163
+
164
+ def forward(self, x, concat_tensor):
165
+ x = self.conv1(x)
166
+ x = torch.cat((x, concat_tensor), dim=1)
167
+ for i in range(self.n_blocks):
168
+ x = self.conv2[i](x)
169
+ return x
170
+
171
+
172
+ class Decoder(nn.Module):
173
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
174
+ super(Decoder, self).__init__()
175
+ self.layers = nn.ModuleList()
176
+ self.n_decoders = n_decoders
177
+ for i in range(self.n_decoders):
178
+ out_channels = in_channels // 2
179
+ self.layers.append(
180
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
181
+ )
182
+ in_channels = out_channels
183
+
184
+ def forward(self, x, concat_tensors):
185
+ for i in range(self.n_decoders):
186
+ x = self.layers[i](x, concat_tensors[-1 - i])
187
+ return x
188
+
189
+
190
+ class DeepUnet(nn.Module):
191
+ def __init__(
192
+ self,
193
+ kernel_size,
194
+ n_blocks,
195
+ en_de_layers=5,
196
+ inter_layers=4,
197
+ in_channels=1,
198
+ en_out_channels=16,
199
+ ):
200
+ super(DeepUnet, self).__init__()
201
+ self.encoder = Encoder(
202
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
203
+ )
204
+ self.intermediate = Intermediate(
205
+ self.encoder.out_channel // 2,
206
+ self.encoder.out_channel,
207
+ inter_layers,
208
+ n_blocks,
209
+ )
210
+ self.decoder = Decoder(
211
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
212
+ )
213
+
214
+ def forward(self, x):
215
+ x, concat_tensors = self.encoder(x)
216
+ x = self.intermediate(x)
217
+ x = self.decoder(x, concat_tensors)
218
+ return x
219
+
220
+
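+ # End-to-end salience model: DeepUnet over the mel spectrogram, a small CNN head, and
+ # (optionally) a BiGRU, producing 360 pitch-bin activations per frame.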
221
+ class E2E(nn.Module):
222
+ def __init__(
223
+ self,
224
+ n_blocks,
225
+ n_gru,
226
+ kernel_size,
227
+ en_de_layers=5,
228
+ inter_layers=4,
229
+ in_channels=1,
230
+ en_out_channels=16,
231
+ ):
232
+ super(E2E, self).__init__()
233
+ self.unet = DeepUnet(
234
+ kernel_size,
235
+ n_blocks,
236
+ en_de_layers,
237
+ inter_layers,
238
+ in_channels,
239
+ en_out_channels,
240
+ )
241
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
242
+ if n_gru:
243
+ self.fc = nn.Sequential(
244
+ BiGRU(3 * 128, 256, n_gru),
245
+ nn.Linear(512, 360),
246
+ nn.Dropout(0.25),
247
+ nn.Sigmoid(),
248
+ )
249
+ else:
250
+ self.fc = nn.Sequential(
251
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
252
+ )
253
+
254
+ def forward(self, mel):
255
+ mel = mel.transpose(-1, -2).unsqueeze(1)
256
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
257
+ x = self.fc(x)
258
+ return x
259
+
260
+
261
+ class MelSpectrogram(torch.nn.Module):
262
+ def __init__(
263
+ self,
264
+ is_half,
265
+ n_mel_channels,
266
+ sampling_rate,
267
+ win_length,
268
+ hop_length,
269
+ n_fft=None,
270
+ mel_fmin=0,
271
+ mel_fmax=None,
272
+ clamp=1e-5,
273
+ ):
274
+ super().__init__()
275
+ n_fft = win_length if n_fft is None else n_fft
276
+ self.hann_window = {}
277
+ mel_basis = mel(
278
+ sr=sampling_rate,
279
+ n_fft=n_fft,
280
+ n_mels=n_mel_channels,
281
+ fmin=mel_fmin,
282
+ fmax=mel_fmax,
283
+ htk=True,
284
+ )
285
+ mel_basis = torch.from_numpy(mel_basis).float()
286
+ self.register_buffer("mel_basis", mel_basis)
287
+ self.n_fft = win_length if n_fft is None else n_fft
288
+ self.hop_length = hop_length
289
+ self.win_length = win_length
290
+ self.sampling_rate = sampling_rate
291
+ self.n_mel_channels = n_mel_channels
292
+ self.clamp = clamp
293
+ self.is_half = is_half
294
+
295
+ def forward(self, audio, keyshift=0, speed=1, center=True):
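+ # keyshift scales the FFT and window sizes by 2^(keyshift/12) to emulate pitch shifting;
+ # speed scales the hop length for time stretching. Magnitudes are rescaled accordingly
+ # when keyshift != 0.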
296
+ factor = 2 ** (keyshift / 12)
297
+ n_fft_new = int(np.round(self.n_fft * factor))
298
+ win_length_new = int(np.round(self.win_length * factor))
299
+ hop_length_new = int(np.round(self.hop_length * speed))
300
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
301
+ if keyshift_key not in self.hann_window:
302
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
303
+ audio.device
304
+ )
305
+ fft = torch.stft(
306
+ audio,
307
+ n_fft=n_fft_new,
308
+ hop_length=hop_length_new,
309
+ win_length=win_length_new,
310
+ window=self.hann_window[keyshift_key],
311
+ center=center,
312
+ return_complex=True,
313
+ )
314
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
315
+ if keyshift != 0:
316
+ size = self.n_fft // 2 + 1
317
+ resize = magnitude.size(1)
318
+ if resize < size:
319
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
320
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
321
+ mel_output = torch.matmul(self.mel_basis, magnitude)
322
+ if self.is_half:
323
+ mel_output = mel_output.half()
324
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
325
+ return log_mel_spec
326
+
327
+
328
+ class RMVPE:
329
+ def __init__(self, model_path, is_half, device=None):
330
+ self.resample_kernel = {}
331
+ model = E2E(4, 1, (2, 2))
332
+ ckpt = torch.load(model_path, map_location="cpu")
333
+ model.load_state_dict(ckpt)
334
+ model.eval()
335
+ if is_half:
336
+ model = model.half()
337
+ self.model = model
338
+ self.resample_kernel = {}
339
+ self.is_half = is_half
340
+ if device is None:
341
+ device = "cuda" if torch.cuda.is_available() else "cpu"
342
+ self.device = device
343
+ self.mel_extractor = MelSpectrogram(
344
+ is_half, 128, 16000, 1024, 160, None, 30, 8000
345
+ ).to(device)
346
+ self.model = self.model.to(device)
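+ # 360 pitch bins spaced 20 cents apart, expressed as cents above a 10 Hz reference;
+ # padded by 4 bins on each side (368 total) for the local weighted average in
+ # to_local_average_cents().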
347
+ cents_mapping = 20 * np.arange(360) + 1997.3794084376191
348
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
349
+
350
+ def mel2hidden(self, mel):
351
+ with torch.no_grad():
352
+ n_frames = mel.shape[-1]
353
+ mel = F.pad(
354
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
355
+ )
356
+ hidden = self.model(mel)
357
+ return hidden[:, :n_frames]
358
+
359
+ def decode(self, hidden, thred=0.03):
360
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
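+ # Convert cents (relative to a 10 Hz reference) back to Hz; unvoiced frames come out
+ # as exactly 10 Hz and are zeroed below.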
361
+ f0 = 10 * (2 ** (cents_pred / 1200))
362
+ f0[f0 == 10] = 0
363
+ # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
364
+ return f0
365
+
366
+ def infer_from_audio(self, audio, thred=0.03):
367
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
368
+ # torch.cuda.synchronize()
369
+ # t0=ttime()
370
+ mel = self.mel_extractor(audio, center=True)
371
+ # torch.cuda.synchronize()
372
+ # t1=ttime()
373
+ hidden = self.mel2hidden(mel)
374
+ # torch.cuda.synchronize()
375
+ # t2=ttime()
376
+ hidden = hidden.squeeze(0).cpu().numpy()
377
+ if self.is_half:
378
+ hidden = hidden.astype("float32")
379
+ f0 = self.decode(hidden, thred=thred)
380
+ # torch.cuda.synchronize()
381
+ # t3=ttime()
382
+ # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
383
+ return f0
384
+
385
+ def to_local_average_cents(self, salience, thred=0.05):
386
+ # t0 = ttime()
387
+ center = np.argmax(salience, axis=1)  # (n_frames,) index of the most salient pitch bin per frame
388
+ salience = np.pad(salience, ((0, 0), (4, 4)))  # (n_frames, 368) after padding 4 bins on each side
389
+ # t1 = ttime()
390
+ center += 4
391
+ todo_salience = []
392
+ todo_cents_mapping = []
393
+ starts = center - 4
394
+ ends = center + 5
395
+ for idx in range(salience.shape[0]):
396
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
397
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
398
+ # t2 = ttime()
399
+ todo_salience = np.array(todo_salience)  # (n_frames, 9)
400
+ todo_cents_mapping = np.array(todo_cents_mapping)  # (n_frames, 9)
401
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
402
+ weight_sum = np.sum(todo_salience, 1)  # (n_frames,)
403
+ devided = product_sum / weight_sum  # (n_frames,) salience-weighted average pitch in cents
404
+ # t3 = ttime()
405
+ maxx = np.max(salience, axis=1)  # (n_frames,) peak salience, used to gate unvoiced frames below
406
+ devided[maxx <= thred] = 0
407
+ # t4 = ttime()
408
+ # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
409
+ return devided
src/vc/rvc.py ADDED
@@ -0,0 +1,205 @@
1
+ from typing import Any
2
+ from typings.extra import F0Method
3
+ from multiprocessing import cpu_count
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from fairseq import checkpoint_utils
8
+ from scipy.io import wavfile
9
+
10
+ from vc.infer_pack.models import (
11
+ SynthesizerTrnMs256NSFsid,
12
+ SynthesizerTrnMs256NSFsid_nono,
13
+ SynthesizerTrnMs768NSFsid,
14
+ SynthesizerTrnMs768NSFsid_nono,
15
+ )
16
+ from vc.my_utils import load_audio
17
+ from vc.vc_infer_pipeline import VC
18
+
19
+ SRC_DIR = Path(__file__).resolve().parent.parent
20
+
21
+
22
+ class Config:
23
+ def __init__(self, device, is_half):
24
+ self.device = device
25
+ self.is_half = is_half
26
+ self.n_cpu = 0
27
+ self.gpu_name = None
28
+ self.gpu_mem = None
29
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
30
+
31
+ def device_config(self) -> tuple:
32
+ if torch.cuda.is_available():
33
+ i_device = int(self.device.split(":")[-1])
34
+ self.gpu_name = torch.cuda.get_device_name(i_device)
35
+ if (
36
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
37
+ or "P40" in self.gpu_name.upper()
38
+ or "1060" in self.gpu_name
39
+ or "1070" in self.gpu_name
40
+ or "1080" in self.gpu_name
41
+ ):
42
+ print("16-series/10-series/P40 GPU detected; forcing single precision")
43
+ self.is_half = False
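+ # On these GPUs, rewrite the bundled files in place: disable fp16 in the training
+ # configs ("true" -> "false") and shorten the preprocessing segment length ("3.7" -> "3.0").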
44
+ for config_file in ["32k.json", "40k.json", "48k.json"]:
45
+ with open(SRC_DIR / "vc" / "configs" / config_file, "r") as f:
46
+ strr = f.read().replace("true", "false")
47
+ with open(SRC_DIR / "vc" / "configs" / config_file, "w") as f:
48
+ f.write(strr)
49
+ with open(
50
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "r"
51
+ ) as f:
52
+ strr = f.read().replace("3.7", "3.0")
53
+ with open(
54
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "w"
55
+ ) as f:
56
+ f.write(strr)
57
+ else:
58
+ self.gpu_name = None
59
+ self.gpu_mem = int(
60
+ torch.cuda.get_device_properties(i_device).total_memory
61
+ / 1024
62
+ / 1024
63
+ / 1024
64
+ + 0.4
65
+ )
66
+ if self.gpu_mem <= 4:
67
+ with open(
68
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "r"
69
+ ) as f:
70
+ strr = f.read().replace("3.7", "3.0")
71
+ with open(
72
+ SRC_DIR / "vc" / "trainset_preprocess_pipeline_print.py", "w"
73
+ ) as f:
74
+ f.write(strr)
75
+ elif torch.backends.mps.is_available():
76
+ print("No supported NVIDIA GPU found; using MPS for inference")
77
+ self.device = "mps"
78
+ else:
79
+ print("No supported NVIDIA GPU found; using CPU for inference")
80
+ self.device = "cpu"
81
+ self.is_half = True
82
+
83
+ if self.n_cpu == 0:
84
+ self.n_cpu = cpu_count()
85
+
86
+ if self.is_half:
87
+ # 6G memory config
88
+ x_pad = 3
89
+ x_query = 10
90
+ x_center = 60
91
+ x_max = 65
92
+ else:
93
+ # 5G memory config
94
+ x_pad = 1
95
+ x_query = 6
96
+ x_center = 38
97
+ x_max = 41
98
+
99
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
100
+ x_pad = 1
101
+ x_query = 5
102
+ x_center = 30
103
+ x_max = 32
104
+
105
+ return x_pad, x_query, x_center, x_max
106
+
107
+
108
+ def load_hubert(device: str, is_half: bool, model_path: str) -> torch.nn.Module:
109
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
110
+ [model_path],
111
+ suffix="",
112
+ )
113
+ hubert = models[0]
114
+ hubert = hubert.to(device)
115
+
116
+ if is_half:
117
+ hubert = hubert.half()
118
+ else:
119
+ hubert = hubert.float()
120
+
121
+ hubert.eval()
122
+ return hubert
123
+
124
+
125
+ def get_vc(
126
+ device: str, is_half: bool, config: Config, model_path: str
127
+ ) -> tuple[dict[str, Any], str, torch.nn.Module, int, VC]:
128
+ cpt = torch.load(model_path, map_location="cpu")
129
+ if "config" not in cpt or "weight" not in cpt:
130
+ raise ValueError(
131
+ f"Incorrect format for {model_path}. Use a voice model trained using RVC v2 instead."
132
+ )
133
+
134
+ tgt_sr = cpt["config"][-1]
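+ # Patch the speaker count in the stored config to match the checkpoint's actual
+ # speaker-embedding table (emb_g).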
135
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
136
+ if_f0 = cpt.get("f0", 1)
137
+ version = cpt.get("version", "v1")
138
+
139
+ if version == "v1":
140
+ if if_f0 == 1:
141
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
142
+ else:
143
+ net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
144
+ elif version == "v2":
145
+ if if_f0 == 1:
146
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half)
147
+ else:
148
+ net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
149
+
150
+ del net_g.enc_q
151
+ print(net_g.load_state_dict(cpt["weight"], strict=False))
152
+ net_g.eval().to(device)
153
+
154
+ if is_half:
155
+ net_g = net_g.half()
156
+ else:
157
+ net_g = net_g.float()
158
+
159
+ vc = VC(tgt_sr, config)
160
+ return cpt, version, net_g, tgt_sr, vc
161
+
162
+
163
+ def rvc_infer(
164
+ index_path: str,
165
+ index_rate: float,
166
+ input_path: str,
167
+ output_path: str,
168
+ pitch_change: int,
169
+ f0_method: F0Method,
170
+ cpt: dict[str, Any],
171
+ version: str,
172
+ net_g: torch.nn.Module,
173
+ filter_radius: int,
174
+ tgt_sr: int,
175
+ rms_mix_rate: float,
176
+ protect: float,
177
+ crepe_hop_length: int,
178
+ vc: VC,
179
+ hubert_model: torch.nn.Module,
180
+ resample_sr: int,
181
+ ) -> None:
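+ # HuBERT features are extracted at 16 kHz, so decode/resample the input to that rate first.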
182
+ audio = load_audio(input_path, 16000)
183
+ times = [0, 0, 0]
184
+ if_f0 = cpt.get("f0", 1)
185
+ audio_opt, output_sr = vc.pipeline(
186
+ hubert_model,
187
+ net_g,
188
+ 0,
189
+ audio,
190
+ input_path,
191
+ times,
192
+ pitch_change,
193
+ f0_method,
194
+ index_path,
195
+ index_rate,
196
+ if_f0,
197
+ filter_radius,
198
+ tgt_sr,
199
+ resample_sr,
200
+ rms_mix_rate,
201
+ version,
202
+ protect,
203
+ crepe_hop_length,
204
+ )
205
+ wavfile.write(output_path, output_sr, audio_opt)
src/vc/trainset_preprocess_pipeline_print.py ADDED
@@ -0,0 +1,146 @@
1
+ import sys, os, multiprocessing
2
+ from scipy import signal
3
+
4
+ now_dir = os.getcwd()
5
+ sys.path.append(now_dir)
6
+
7
+ inp_root = sys.argv[1]
8
+ sr = int(sys.argv[2])
9
+ n_p = int(sys.argv[3])
10
+ exp_dir = sys.argv[4]
11
+ noparallel = sys.argv[5] == "True"
12
+ import numpy as np, os, traceback
13
+ from slicer2 import Slicer
14
+ import librosa, traceback
15
+ from scipy.io import wavfile
16
+ import multiprocessing
17
+ from vc.my_utils import load_audio
18
+ import tqdm
19
+
20
+ DoFormant = False
21
+ Quefrency = 1.0
22
+ Timbre = 1.0
23
+
24
+ mutex = multiprocessing.Lock()
25
+ f = open("%s/preprocess.log" % exp_dir, "a+")
26
+
27
+
28
+ def println(strr):
29
+ mutex.acquire()
30
+ print(strr)
31
+ f.write("%s\n" % strr)
32
+ f.flush()
33
+ mutex.release()
34
+
35
+
36
+ class PreProcess:
37
+ def __init__(self, sr, exp_dir):
38
+ self.slicer = Slicer(
39
+ sr=sr,
40
+ threshold=-42,
41
+ min_length=1500,
42
+ min_interval=400,
43
+ hop_size=15,
44
+ max_sil_kept=500,
45
+ )
46
+ self.sr = sr
47
+ self.bh, self.ah = signal.butter(N=5, Wn=48, btype="high", fs=self.sr)
48
+ self.per = 3.0
49
+ self.overlap = 0.3
50
+ self.tail = self.per + self.overlap
51
+ self.max = 0.9
52
+ self.alpha = 0.75
53
+ self.exp_dir = exp_dir
54
+ self.gt_wavs_dir = "%s/0_gt_wavs" % exp_dir
55
+ self.wavs16k_dir = "%s/1_16k_wavs" % exp_dir
56
+ os.makedirs(self.exp_dir, exist_ok=True)
57
+ os.makedirs(self.gt_wavs_dir, exist_ok=True)
58
+ os.makedirs(self.wavs16k_dir, exist_ok=True)
59
+
60
+ def norm_write(self, tmp_audio, idx0, idx1):
61
+ tmp_max = np.abs(tmp_audio).max()
62
+ if tmp_max > 2.5:
63
+ print("%s-%s-%s-filtered" % (idx0, idx1, tmp_max))
64
+ return
65
+ tmp_audio = (tmp_audio / tmp_max * (self.max * self.alpha)) + (
66
+ 1 - self.alpha
67
+ ) * tmp_audio
68
+ wavfile.write(
69
+ "%s/%s_%s.wav" % (self.gt_wavs_dir, idx0, idx1),
70
+ self.sr,
71
+ tmp_audio.astype(np.float32),
72
+ )
73
+ tmp_audio = librosa.resample(
74
+ tmp_audio, orig_sr=self.sr, target_sr=16000
75
+ ) # , res_type="soxr_vhq"
76
+ wavfile.write(
77
+ "%s/%s_%s.wav" % (self.wavs16k_dir, idx0, idx1),
78
+ 16000,
79
+ tmp_audio.astype(np.float32),
80
+ )
81
+
82
+ def pipeline(self, path, idx0):
83
+ try:
84
+ audio = load_audio(path, self.sr)  # this repo's load_audio only takes (file, sr); formant parameters are unused
85
+ # zero phased digital filter cause pre-ringing noise...
86
+ # audio = signal.filtfilt(self.bh, self.ah, audio)
87
+ audio = signal.lfilter(self.bh, self.ah, audio)
88
+
89
+ idx1 = 0
90
+ for audio in self.slicer.slice(audio):
91
+ i = 0
92
+ while 1:
93
+ start = int(self.sr * (self.per - self.overlap) * i)
94
+ i += 1
95
+ if len(audio[start:]) > self.tail * self.sr:
96
+ tmp_audio = audio[start : start + int(self.per * self.sr)]
97
+ self.norm_write(tmp_audio, idx0, idx1)
98
+ idx1 += 1
99
+ else:
100
+ tmp_audio = audio[start:]
101
+ idx1 += 1
102
+ break
103
+ self.norm_write(tmp_audio, idx0, idx1)
104
+ # println("%s->Suc." % path)
105
+ except Exception:
106
+ println("%s->%s" % (path, traceback.format_exc()))
107
+
108
+ def pipeline_mp(self, infos, thread_n):
109
+ for path, idx0 in tqdm.tqdm(
110
+ infos, position=thread_n, leave=True, desc="thread:%s" % thread_n
111
+ ):
112
+ self.pipeline(path, idx0)
113
+
114
+ def pipeline_mp_inp_dir(self, inp_root, n_p):
115
+ try:
116
+ infos = [
117
+ ("%s/%s" % (inp_root, name), idx)
118
+ for idx, name in enumerate(sorted(list(os.listdir(inp_root))))
119
+ ]
120
+ if noparallel:
121
+ for i in range(n_p):
122
+ self.pipeline_mp(infos[i::n_p], i)
123
+ else:
124
+ ps = []
125
+ for i in range(n_p):
126
+ p = multiprocessing.Process(
127
+ target=self.pipeline_mp, args=(infos[i::n_p], i)
128
+ )
129
+ ps.append(p)
130
+ p.start()
131
+ for i in range(n_p):
132
+ ps[i].join()
133
+ except Exception:
134
+ println("Fail. %s" % traceback.format_exc())
135
+
136
+
137
+ def preprocess_trainset(inp_root, sr, n_p, exp_dir):
138
+ pp = PreProcess(sr, exp_dir)
139
+ println("start preprocess")
140
+ println(sys.argv)
141
+ pp.pipeline_mp_inp_dir(inp_root, n_p)
142
+ println("end preprocess")
143
+
144
+
145
+ if __name__ == "__main__":
146
+ preprocess_trainset(inp_root, sr, n_p, exp_dir)