marioboy committed
Commit d2a588b · 1 Parent(s): 229302e

feat: add universal approach for multiple models

Files changed (3)
  1. .gitignore +1 -0
  2. app.py +14 -9
  3. demo_cli.py +4 -48
.gitignore ADDED
@@ -0,0 +1 @@
+README.md
app.py CHANGED
@@ -3,18 +3,23 @@ import os
 import shlex
 import random
 
-os.system("megadl https://mega.nz/folder/7d4xUIIa#TnvmAWa5Av7QGo6gAuQj7g")
+
+LINK = os.environ.get('link')
+ALIAS = os.environ.get('alias')
+TITLE = os.environ.get('title')
+DESCRIPTION = os.environ.get('description')
+
+
+os.system(f"megadl {LINK}")
 os.system("ls")
 
 
 def inference(text):
     os.system("python demo_cli.py --no_sound --cpu --text " + shlex.quote(text.strip()))
-    image_number = random.randint(2, len(os.listdir("pat_gifs/")))
-    return [f"pat_gifs/{image_number}.gif", "demo_output_1.wav"]
+    image_number = random.randint(2, len(os.listdir(f"images/{ALIAS}/")))
+    return [f"images/{ALIAS}/{image_number}.gif", "demo_output_1.wav"]
 
 
-title = "Pat NES Punk's Voice"
-description = "<center> Text-to-speech engine with Pat Contri's voice. </center>"
 article = "<p style='text-align: center'>Based on <a href='https://matheo.uliege.be/handle/2268.2/6801' target='_blank'>Real-Time Voice Cloning</a> | <a href='https://github.com/CorentinJ/Real-Time-Voice-Cloning' target='_blank'>Github Repo</a></p>"
 
 examples = [
@@ -25,7 +30,7 @@ examples = [
         "My name is Samantha Morris. I'm the editor of an internet news magazine exploring news most media shy away from."
     ],
     [
-        'I have a morning ritual that I need to share. I call it "the terminator". First I crouch down in the shower in the classic "naked terminator traveling through time" pose.'
+        'I have a morning ritual that I need to share. I call it - the terminator. First I crouch down in the shower in the classic naked terminator traveling through time pose.'
     ],
     [
         'With my eyes closed I crouch there for a minute, visualizing either Arnold or the guy from the second movie (not the chick in the third one because that one sucked) and I start to hum the terminator theme.'
@@ -44,12 +49,12 @@ gr.Interface(
     inference,
     inputs=["text"],
     outputs=[
-        gr.Image(show_label=False, shape=(20, 20), value="pat_gifs/1.gif"),
+        gr.Image(show_label=False, shape=(20, 20), value=f"images/{ALIAS}/1.gif"),
         gr.outputs.Audio(type="file", label="Speech"),
     ],
     enable_queue=True,
-    title=title,
-    description=description,
+    title=TITLE,
+    description=DESCRIPTION,
     article=article,
     examples=examples
 ).launch()
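The rewritten app.py is configured entirely through environment variables (on Hugging Face Spaces these would be set as Space secrets), so one codebase can host any voice. A minimal sketch of the same pattern, with made-up fallback values and a subprocess-based download in place of os.system, since interpolating an env var into a shell string is fragile:

import os
import subprocess

LINK = os.environ.get("link")            # mega.nz folder holding the model files
ALIAS = os.environ.get("alias")          # selects images/<alias>/ and pickles/<alias>.pickle
TITLE = os.environ.get("title", "Voice demo")        # fallback value is made up
DESCRIPTION = os.environ.get("description", "")      # fallback value is made up

if not LINK or not ALIAS:
    raise RuntimeError("Both 'link' and 'alias' must be set in the environment.")

# Passing an argument list keeps the shell out of the loop, so an unusual
# LINK value cannot be interpreted as extra shell commands.
subprocess.run(["megadl", LINK], check=True)

With secrets such as alias=pat (any name for which an images/<alias>/ folder and a matching pickle exist), the same unchanged code serves the original voice; pointing the secrets at another model and GIF set spins up a different Space with no code edits.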
demo_cli.py CHANGED
@@ -15,6 +15,8 @@ import os
 from audioread.exceptions import NoBackendError
 import pickle
 
+ALIAS = os.environ.get('alias', 'breen')
+
 if __name__ == '__main__':
     ## Info & args
     parser = argparse.ArgumentParser(
@@ -78,51 +80,7 @@ if __name__ == '__main__':
     encoder.load_model(args.enc_model_fpath)
     synthesizer = Synthesizer(args.syn_model_fpath)
     vocoder.load_model(args.voc_model_fpath)
-
-
-    ## Run a test
-    # print("Testing your configuration with small inputs.")
-    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # # sampling rate, which may differ.
-    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # # to an audio of 1 second.
-    # print(" Testing the encoder...")
-    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
-
-    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # # possible.
-    # embed = np.random.rand(speaker_embedding_size)
-    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # # embeddings it will be).
-    # embed /= np.linalg.norm(embed)
-    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # # illustrate that
-    # embeds = [embed, np.zeros(speaker_embedding_size)]
-    # texts = ["test 1", "test 2"]
-    # print(" Testing the synthesizer... (loading the model will output a lot of text)")
-    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
-
-    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # # can concatenate the mel spectrograms to a single one.
-    # mel = np.concatenate(mels, axis=1)
-    # # The vocoder can take a callback function to display the generation. More on that later. For
-    # # now we'll simply hide it like this:
-    # no_action = lambda *args: None
-    # print(" Testing the vocoder...")
-    # # For the sake of making this test short, we'll pass a short target length. The target length
-    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # # that has a detrimental effect on the quality of the audio. The default parameters are
-    # # recommended in general.
-    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
-
-    print("All test passed! You can now synthesize speech.\n\n")
-
+
 
     ## Interactive speech generation
     print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
@@ -142,9 +100,8 @@ if __name__ == '__main__':
     # The following two methods are equivalent:
     # - Directly load from the filepath:
 
-    with open('pat.pickle', 'rb') as handle:
+    with open(f'pickles/{ALIAS}.pickle', 'rb') as handle:
         preprocessed_wav = pickle.load(handle)
-    # - If the wav is already loaded:
 
     print("Loaded file succesfully")
@@ -198,4 +155,3 @@ if __name__ == '__main__':
     print(generated_wav.dtype)
     sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
     print("\nSaved output as %s\n\n" % filename)
-    print(os.environ)