Spaces:
Runtime error
Runtime error
File size: 8,837 Bytes
916d940 b43c1b7 7c5f1d9 c7880a2 6b1e6f7 eb5d22a 6b1e6f7 c7880a2 cdc31ba c7880a2 916d940 9648631 916d940 9648631 916d940 9648631 916d940 9648631 916d940 9648631 916d940 825c701 916d940 9648631 916d940 8542c64 916d940 9648631 916d940 9648631 916d940 c7880a2 9648631 ada31bd 916d940 9648631 916d940 9648631 916d940 9648631 a0b80e1 c7880a2 9648631 a0b80e1 a6b4595 b43c1b7 aec945b 916d940 9648631 c7880a2 825c701 916d940 99c91fc 916d940 99c91fc 2072f15 99c91fc 1cdcd10 916d940 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
import whisper
import gradio as gr
from keybert import KeyBERT
import random as r
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch
from PIL import Image
import time
import matplotlib.pyplot as plt
import numpy as np
import PIL
# Speech-to-text model shared by transcribe(). No device is passed, so Whisper
# picks one itself; transcribe() reads it back through `model.device`.
model = whisper.load_model("base")

# Text-to-image checkpoint. Alternatives that were tried:
#   "stabilityai/stable-diffusion-2"
#   "TaiMingLu/diffusion-architecture"
model_id = 'prompthero/midjourney-v4-diffusion'

# NOTE(review): this scheduler is loaded but not passed to the pipeline below,
# so the pipeline runs with whatever scheduler its own config declares. Pass
# `scheduler=scheduler` to from_pretrained() if Euler sampling is wanted.
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")

# Half precision is only practical on a GPU; fall back to float32 on CPU so the
# app still starts on hardware without CUDA instead of failing in `.to("cuda")`.
_sd_device = "cuda" if torch.cuda.is_available() else "cpu"
_sd_dtype = torch.float16 if _sd_device == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=_sd_dtype)
pipe = pipe.to(_sd_device)
def transcribe(audio, prompt_num, user_keywords):
    """Transcribe a recorded audio file with the module-level Whisper model.

    Args:
        audio: Path of the audio file to transcribe.
        prompt_num: Requested number of prompts/images; converted with ``int``.
        user_keywords: Extra keywords typed by the user; passed through as-is.

    Returns:
        A ``(text, prompt_count, user_keywords)`` tuple, where ``text`` is the
        transcription of the whole recording.

    Raises:
        ValueError: If ``audio`` is empty or ``None`` (nothing was recorded).
        TypeError: If ``prompt_num`` is ``None`` (``int(None)``).
    """
    if not audio:
        raise ValueError("No audio was provided; record a clip before submitting.")

    # Decode the file once and reuse the waveform for both passes below.
    waveform = whisper.load_audio(audio)

    # Diagnostic pass: pad/trim to Whisper's 30-second window, build the
    # log-Mel spectrogram on the model's device, and log the detected language
    # plus a quick decode of that window.
    clip = whisper.pad_or_trim(waveform)
    mel = whisper.log_mel_spectrogram(clip).to(model.device)
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")

    # DecodingOptions defaults to fp16, which fails on a CPU-hosted model, so
    # only request half precision when the model actually lives on a GPU.
    options = whisper.DecodingOptions(fp16=model.device.type == "cuda")
    result = whisper.decode(model, mel, options)
    print(result.text)

    # Full transcription of the complete recording; this is the returned text.
    final_result = model.transcribe(waveform)
    print(final_result["text"])
    return final_result["text"], int(prompt_num), user_keywords
# Style suffixes; one is chosen at random and appended to every prompt.
_STYLE_PROMPTS = (
    "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting",
    "detailed, colourful, unreal engine, octane render, blender effect",
    "70mm, Canon EOS 6D Mark II, 4k, 35mm (FX, Full-Frame), f/2.5, extremely detailed, very high details, photorealistic, hi res, hdr, UHD, hyper-detailed, ultra-realistic, vibrant, centered, vivid colors, Wide angle, zoom out",
    "detailed, soft ambiance, japanese influence, unreal engine 5, octane render",
    "perfect shading, soft studio lighting, ultra-realistic, photorealistic, octane render, cinematic lighting, hdr, in-frame, 4k, 8k, edge lighting --v 4",
)


def _extract_keyword_sets(text):
    """Return four keyphrase lists extracted from *text* with KeyBERT.

    The four extractions use 1-3 word n-grams and differ in strategy:
    no stop-word filtering, max-sum selection, MMR with diversity 0.7, and
    MMR with diversity 0.2. Any of the lists may be empty.
    """
    kw_model = KeyBERT()
    extractions = [
        kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words=None),
        kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_maxsum=True, nr_candidates=20, top_n=5),
        kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_mmr=True, diversity=0.7),
        kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 3), stop_words='english',
                                  use_mmr=True, diversity=0.2),
    ]
    # Each hit's first element is the keyphrase; the rest is dropped.
    return [[hit[0] for hit in hits] for hits in extractions]


def _compose_prompt(user_terms, keyword_sets):
    """Build one comma-delimited prompt from user terms and extracted phrases.

    One keyword set is picked at random as the "primary" and contributes two
    phrases; every other set contributes one, in set order. Empty sets are
    skipped instead of raising ``IndexError``.
    """
    parts = ["mdjrny-v4 style", *user_terms]

    primary = r.randrange(len(keyword_sets))
    draw_order = [primary, primary]
    draw_order.extend(i for i in range(len(keyword_sets)) if i != primary)
    parts.extend(r.choice(keyword_sets[i]) for i in draw_order if keyword_sets[i])

    # Style tail prompt.
    parts.append(r.choice(_STYLE_PROMPTS))
    return ', '.join(str(part) for part in parts)


def keywords(text, prompt_num, user_keywords):
    """Turn a transcription into image prompts and render one image per prompt.

    Args:
        text: Transcribed speech to mine for keyphrases.
        prompt_num: Number of prompts (and images) to generate. ``None`` and
            negative values are treated as zero.
        user_keywords: Comma-delimited extra terms added to every prompt.
            Blank entries are ignored; ``None`` is treated as empty.

    Returns:
        A ``(images, transcription, keyword_pool, generated_prompts)`` tuple:
        the images produced by the module-level ``pipe``, the input text, the
        concatenation of all extracted keyphrases, and the prompts used.
    """
    transcription = text

    keyword_sets = _extract_keyword_sets(text or "")
    keyword_pool = [phrase for group in keyword_sets for phrase in group]
    print("keywords: ", keyword_pool, "length: ", len(keyword_pool))

    # Drop blank entries so an empty textbox does not inject ", ," into prompts.
    user_terms = [term.strip() for term in (user_keywords or "").split(',') if term.strip()]
    print(user_terms)

    # NOTE(review): the original loop nesting was not recoverable from the
    # de-indented source; this builds exactly `prompt_num` prompts, each with
    # the style trigger once followed by the user terms. A bounded `range`
    # also avoids the endless loop the `count != n` form had for negative n.
    prompt_count = max(0, int(prompt_num or 0))
    generated_prompts = []
    for _ in range(prompt_count):
        myprompt = _compose_prompt(user_terms, keyword_sets)
        print("prompt: ", myprompt)
        generated_prompts.append(myprompt)
    print("no. of prompts: ", len(generated_prompts))
    print("generated prompts: ", generated_prompts)

    images = []
    for prompt in generated_prompts:
        print(prompt)
        # Release cached GPU memory before each 768x768 generation.
        torch.cuda.empty_cache()
        image = pipe(prompt, height=768, width=768, guidance_scale=10).images[0]
        images.append(image)
    return images, transcription, keyword_pool, generated_prompts
# Tool description carried over from the original (disabled) login banner:
# "Perkins&Will i/o's Synthesia Tool. Use cases: Ideation/Brainstorming tool -
# Have it running in the background in a conference, brainstorming session,
# discussion to create contextually relevant visualizations for moodboarding,
# to spark more conversations, interactions and inspiration.
# | Aprameya Pandit | February 2023"
#
# NOTE(review): if password protection is re-enabled via launch(auth=...),
# read the credentials from the environment rather than hard-coding them here.

# Stage 1: microphone recording -> (transcription, image count, user keywords).
speech_text = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Number(label = "Number of Images to be generated (int): "),
        gr.Textbox(label = "Additional keywords (comma delimitied): "),
    ],
    outputs=["text", "number", "text"],
    theme = "darkhuggingface",
    title = 'Speech-to-Image-Generator',
    enable_queue=True,
)

# Stage 2: stage-1 outputs -> gallery of images plus the intermediate text.
text_prompts = gr.Interface(
    fn=keywords,
    inputs=["text", "number", "text"],
    outputs=[
        gr.Gallery(label="Generated image(s)", show_label=True, elem_id="gallery").style(grid=[2], height="auto"),
        gr.TextArea(label="Transcription"),
        gr.TextArea(label="Keywords"),
        gr.TextArea(label="Generated Prompts"),
    ],
    theme = "darkhuggingface",
    title = 'Speech-to-Image-Generator',
    enable_queue=True,
)

# Chain the two stages. queue() has to be called on the app *before* launch():
# the original called it on launch()'s return value, which is not the app.
gr.Series(speech_text, text_prompts).queue().launch(share=False)
|