# Hugging Face Spaces page header (Space status when captured: Sleeping); not part of the program.
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
from langchain.prompts import PromptTemplate | |
from langchain.chains import LLMChain | |
from langchain.llms import OpenAI | |
import requests | |
import os | |
import io | |
from datasets import load_dataset | |
import torch | |
import soundfile as sf | |
import gradio as gr | |
from PIL import Image | |
import numpy as np | |
from dotenv import load_dotenv, find_dotenv | |
load_dotenv(find_dotenv()) | |
def handwriting_to_text(image):
    """Send an image file to the hosted TrOCR handwriting model.

    Args:
        image: Path of the image file to upload; it is opened in binary mode.

    Returns:
        The decoded JSON body of the API response, returned unchanged.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        OSError: If the image file cannot be read.
    """
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"

    # Read the access token from the environment instead of sending an empty
    # "Bearer " credential. The module calls load_dotenv() at import time, so
    # a .env file can supply it.
    # NOTE(review): the variable names below are assumed -- confirm which one
    # the deployment actually defines.
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
    # With no token configured, omit the header rather than send a malformed one.
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    with open(image, "rb") as f:
        data = f.read()

    # A timeout keeps a stalled request from blocking the caller forever.
    response = requests.post(API_URL, headers=headers, data=data, timeout=60)
    # Surface HTTP failures here instead of returning an error payload that
    # the caller would mistake for a transcription.
    response.raise_for_status()
    return response.json()
def generate_story(scenario):
    """Generate a short Shakespeare-style poem about ``scenario``.

    Args:
        scenario: Text substituted into the ``{scenario}`` slot of the prompt.

    Returns:
        The text produced by the language model. It is also printed to stdout.
    """
    # "gpt-3.5-turbo" is a chat model, so it is driven through the chat
    # wrapper rather than the completion-style ``OpenAI`` class.
    # Imported locally so this function does not rely on a new file-level import.
    from langchain.chat_models import ChatOpenAI

    # Built from adjacent literals so the prompt text does not depend on the
    # indentation of this function body.
    template = (
        "\n"
        'Consider yourself as the famous poet "William Shakespeare";\n'
        "You can generate a poem in Shakespeare's tone based on a single word, "
        "the poem should be no more than 4 lines in length;\n"
        "CONTEXT: {scenario}\n"
        "POEM:\n"
    )
    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)
    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

    story = story_llm.predict(scenario=scenario)
    print(story)
    return story
# Lazily populated tuple: (processor, model, vocoder, speaker_embeddings).
_TTS_COMPONENTS = None


def _load_tts_components():
    """Load the SpeechT5 processor, model, vocoder and speaker embedding once."""
    global _TTS_COMPONENTS
    if _TTS_COMPONENTS is None:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Entry 7306 of the x-vector dataset selects the speaker voice;
        # unsqueeze(0) adds the leading batch dimension.
        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        _TTS_COMPONENTS = (processor, model, vocoder, speaker_embeddings)
    return _TTS_COMPONENTS


def recite_the_poem(content):
    """Synthesize ``content`` as speech and return it as WAV-encoded bytes.

    The models and speaker embedding are loaded on the first call and reused
    afterwards, instead of being reloaded for every request.

    Args:
        content: Text to be spoken.

    Returns:
        bytes: The audio encoded as a WAV file, written at 16000 Hz.
    """
    processor, model, vocoder, speaker_embeddings = _load_tts_components()

    inputs = processor(text=content, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Encode in memory: a fixed "speech.wav" on disk would be overwritten by
    # any concurrent request. The format must be given explicitly because a
    # buffer has no file extension to infer it from.
    buffer = io.BytesIO()
    sf.write(buffer, speech.numpy(), samplerate=16000, format="WAV")
    return buffer.getvalue()
def _extract_recognized_text(result):
    """Return the transcription string contained in a recognition response."""
    # NOTE(review): the hosted image-to-text endpoint is expected to answer
    # with [{"generated_text": "..."}] on success and {"error": "..."} on
    # failure -- confirm against the Inference API documentation.
    if isinstance(result, str):
        return result
    if isinstance(result, list) and result:
        result = result[0]
    if isinstance(result, dict):
        if "generated_text" in result:
            return result["generated_text"]
        if "error" in result:
            raise gr.Error(f"Handwriting recognition failed: {result['error']}")
    raise gr.Error("Handwriting recognition returned an unexpected response.")


def main_model(image):
    """Run the full pipeline: handwriting image -> text -> poem -> speech.

    Args:
        image: Pixel data supplied by the Gradio image input; it is converted
            with ``np.uint8`` and ``Image.fromarray``.

    Returns:
        tuple: ``(poem, audio_data)`` -- the generated poem and the value
        returned by ``recite_the_poem`` for it.

    Raises:
        gr.Error: If no image was supplied or no text could be extracted
            from the recognition response.
    """
    import tempfile

    if image is None:
        raise gr.Error("Please upload an image first.")

    picture = Image.fromarray(np.uint8(image))

    # A unique temporary file avoids concurrent requests overwriting each
    # other's upload, which a fixed "temp_image.png" would allow.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        image_path = tmp.name
    try:
        picture.save(image_path)
        # Hand only the recognized string to the poem prompt, not the raw
        # JSON structure.
        text = _extract_recognized_text(handwriting_to_text(image_path))
    finally:
        os.remove(image_path)

    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
# Web UI: one image input is passed to main_model, and its two return values
# are rendered as a text output and an audio output.
iface = gr.Interface(
    title="Flying Shakespeare",
    description="Upload the image generated from the Model:O101-M101/2",
    inputs="image",
    outputs=["text", "audio"],
    fn=main_model,
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()