import random as r

import gradio as gr
import torch
import whisper
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
from keybert import KeyBERT



# Load the Whisper speech-to-text model.
model = whisper.load_model("base")
print("Whisper model loaded on:", model.device)

# Diffusion checkpoint. Alternatives tried previously:
# "stabilityai/stable-diffusion-2", "TaiMingLu/diffusion-architecture"
model_id = 'prompthero/midjourney-v4-diffusion'

# The Euler scheduler is loaded here but only takes effect if passed to the
# pipeline (see the alternative setup below).
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
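# A possible alternative setup (a sketch, not active above): wire in the Euler
# scheduler explicitly and enable attention slicing to reduce VRAM usage on
# smaller GPUs.
# pipe = StableDiffusionPipeline.from_pretrained(
#     model_id, scheduler=scheduler, torch_dtype=torch.float16)
# pipe = pipe.to("cuda")
# pipe.enable_attention_slicing()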

def transcribe(audio, prompt_num, user_keywords):

    # Load the audio and pad/trim it to fit Whisper's 30-second decode window.
    audio1 = whisper.load_audio(audio)
    audio1 = whisper.pad_or_trim(audio1)

    # Make a log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio1).to(model.device)

    # Detect the spoken language.
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # Decode the first 30-second window (console preview only).
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    print(result.text)

    # Transcribe the full recording; this is what gets passed downstream.
    audio2 = whisper.load_audio(audio)
    final_result = model.transcribe(audio2)
    print(final_result["text"])
    return final_result["text"], int(prompt_num), user_keywords
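# Quick standalone check of the transcription step, assuming a local recording
# at a hypothetical path "sample.wav":
# text, n, kw = transcribe("sample.wav", 2, "architecture, concept sketch")
# print(text)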



def keywords(text, prompt_num, user_keywords):

    transcription = text

    # Build a diverse keyword pool with four KeyBERT strategies:
    # plain extraction, Max Sum similarity, and MMR at high/low diversity.
    kw_model = KeyBERT()
    a = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None)
    set_1 = [i[0] for i in a]
    b = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_maxsum=True, nr_candidates=20, top_n=5)
    set_2 = [i[0] for i in b]
    c = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_mmr=True, diversity=0.7)
    set_3 = [i[0] for i in c]
    d = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_mmr=True, diversity=0.2)
    set_4 = [i[0] for i in d]
    keyword_pool = set_1 + set_2 + set_3 + set_4
    print("keywords:", keyword_pool, "length:", len(keyword_pool))

    # Style "tail" fragments, one of which is appended to every prompt.
    style_prompts = [
        "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting",
        "detailed, colourful, unreal engine, octane render, blender effect",
        "70mm, Canon EOS 6D Mark II, 4k, 35mm (FX, Full-Frame), f/2.5, extremely detailed, very high details, photorealistic, hi res, hdr, UHD, hyper-detailed, ultra-realistic, vibrant, centered, vivid colors, Wide angle, zoom out",
        "detailed, soft ambiance, japanese influence, unreal engine 5, octane render",
        "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting --v 4",
    ]

    my_list = [k.strip() for k in user_keywords.split(',')]
    print(my_list)

    generated_prompts = []
    for _ in range(int(prompt_num)):

        # Every prompt starts with the trigger token for the midjourney-v4
        # checkpoint, followed by the user's own keywords.
        sentence = ["mdjrny-v4 style"]
        sentence.extend(my_list)

        # Pick one keyword set to emphasise (sampled twice), then sample one
        # phrase from each of the remaining sets.
        rand_1 = r.randint(1, 4)
        if rand_1 == 1:
            sentence.append(r.choice(set_1))
            sentence.append(r.choice(set_1))
            sentence.append(r.choice(set_2))
            sentence.append(r.choice(set_3))
            sentence.append(r.choice(set_4))
        elif rand_1 == 2:
            sentence.append(r.choice(set_2))
            sentence.append(r.choice(set_2))
            sentence.append(r.choice(set_1))
            sentence.append(r.choice(set_3))
            sentence.append(r.choice(set_4))
        elif rand_1 == 3:
            sentence.append(r.choice(set_3))
            sentence.append(r.choice(set_3))
            sentence.append(r.choice(set_1))
            sentence.append(r.choice(set_2))
            sentence.append(r.choice(set_4))
        else:
            sentence.append(r.choice(set_4))
            sentence.append(r.choice(set_4))
            sentence.append(r.choice(set_1))
            sentence.append(r.choice(set_2))
            sentence.append(r.choice(set_3))

        # Add a style tail, then format as a comma-delimited prompt.
        sentence.append(r.choice(style_prompts))
        myprompt = ', '.join(str(e) for e in sentence)
        print("prompt:", myprompt)
        generated_prompts.append(myprompt)

    print("no. of prompts:", len(generated_prompts))
    print("generated prompts:", generated_prompts)

    # Generate one image per prompt, clearing the CUDA cache between runs.
    images = []
    for prompt in generated_prompts:
        torch.cuda.empty_cache()
        image = pipe(prompt, height=768, width=768, guidance_scale=10).images[0]
        images.append(image)

    return images, transcription, keyword_pool, generated_prompts
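# Example of driving the generator directly from text (hypothetical inputs),
# bypassing the audio step:
# imgs, txt, pool, prompts = keywords(
#     "a timber pavilion by the waterfront with a green roof", 2, "pavilion, dusk")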

speech_text = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(source="microphone", type="filepath"),
            gr.Number(label="Number of Images to be generated (int): "),
            gr.Textbox(label="Additional keywords (comma delimited): ")],
    outputs=["text", "number", "text"],
    theme="darkhuggingface",
    title='Speech-to-Image-Generator',
)
text_prompts = gr.Interface(
    fn=keywords,
    inputs=["text", "number", "text"],
    outputs=[gr.Gallery(label="Generated image(s)", show_label=True,
                        elem_id="gallery").style(grid=[2], height="auto"),
             gr.TextArea(label="Transcription"),
             gr.TextArea(label="Keywords"),
             gr.TextArea(label="Generated Prompts")],
    theme="darkhuggingface",
    title='Speech-to-Image-Generator',
)

# Deployment variant with authentication (kept from the original):
# gr.Series(speech_text, text_prompts).queue().launch(
#     auth=('PWuser', 'speechtotextPW'),
#     auth_message="Welcome to Perkins&Will i/o's Synthesia Tool. Use cases: Ideation/Brainstorming tool - Have it running in the background in a conference, brainstorming session, discussion to create contextually relevant visualizations for moodboarding, to spark more conversations, interactions and inspiration. | Aprameya Pandit | February 2023 | ",
#     inline=False)

# Chain the two interfaces so the transcription output feeds the
# prompt/image generator, then serve with request queuing enabled.
gr.Series(speech_text, text_prompts).queue().launch(share=False)
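# With share=False the app is served locally only (Gradio defaults to
# http://127.0.0.1:7860); set share=True for a temporary public gradio.live link.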