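# "Flying Shakespeare" -- Gradio app for a Hugging Face Space.
# Pipeline: handwritten image -> text (TrOCR) -> 4-line Shakespeare-style poem
# (LangChain + OpenAI) -> spoken recitation (SpeechT5 + HiFi-GAN vocoder).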
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain import PromptTemplate, LLMChain, OpenAI
from datasets import load_dataset
import torch
import soundfile as sf
import gradio as gr
from PIL import Image
import numpy as np
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())
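# The OpenAI LLM used in generate_story() reads OPENAI_API_KEY from the
# environment; it is assumed to be supplied via the .env file loaded above.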

def handwriting_to_text(url):
    # TrOCR handwriting OCR; the image-to-text pipeline returns a list of
    # dicts such as [{"generated_text": "..."}], so extract the string.
    model_1 = pipeline("image-to-text", model="microsoft/trocr-base-handwritten")
    output_1 = model_1(url)
    return output_1[0]["generated_text"]
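
# Note: with LangChain's legacy top-level imports, OpenAI(model_name="gpt-3.5-turbo")
# is either rerouted to the chat-model wrapper or emits a deprecation warning,
# depending on the installed version; ChatOpenAI is the usual choice for chat models.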
def generate_story(scenario):
    template = """
    Consider yourself as the famous poet "William Shakespeare";
    You can generate a poem in Shakespeare's tone based on a single word; the poem should be no more than 4 lines in length;
    CONTEXT: {scenario}
    POEM:
    """
    prompt = PromptTemplate(template=template, input_variables=["scenario"])
    story_llm = LLMChain(llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1), prompt=prompt, verbose=True)
    story = story_llm.predict(scenario=scenario)
    print(story)
    return story
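
# SpeechT5 needs a speaker embedding (a 512-dim x-vector) in addition to the text;
# index 7306 of the CMU ARCTIC x-vectors set is the speaker used in the official
# SpeechT5 examples. The model generates 16 kHz audio, hence samplerate=16000 below.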
def recite_the_poem(content):
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    inputs = processor(text=content, return_tensors="pt")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write("speech.wav", speech.numpy(), samplerate=16000)
    # Return the .wav path; Gradio's audio output component accepts a filepath.
    return "speech.wav"
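
# Gradio's "image" input delivers the upload as a numpy array; it is saved to a
# temporary PNG because the TrOCR pipeline expects a path, URL, or PIL image.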
def main_model(image):
    image = Image.fromarray(np.uint8(image))
    image_path = "temp_image.png"
    image.save(image_path)
    text = handwriting_to_text(image_path)
    poem = generate_story(text)
    audio_path = recite_the_poem(poem)
    return poem, audio_path
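
# "image", "text", and "audio" are Gradio component shortcuts; the audio output
# plays the .wav filepath returned by recite_the_poem().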
iface = gr.Interface(
    fn=main_model,
    inputs="image",
    outputs=["text", "audio"],
    title="Flying Shakespeare",
    description="Upload the image generated from Model-P101/M101.",
)

if __name__ == "__main__":
    iface.launch(share=True)
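    # share=True asks Gradio for a temporary public link; when the app already
    # runs as a hosted Space this is unnecessary and may simply be ignored.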