# Source: app.py by ahmad4raza -- commit 79a2b62 ("Update app.py"), 3.29 kB.
# (The hosting page's "raw" / "history" / "blame" links were captured with the file.)
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain import PromptTemplate, LLMChain, OpenAI
import requests
import os
import io
from datasets import load_dataset
import torch
import soundfile as sf
import gradio as gr
from PIL import Image
import numpy as np
import OpenAI
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
def handwriting_to_text(image):
    """Run handwriting OCR on an image file via the Hugging Face Inference API.

    Args:
        image: Path of the image file. The file is read in binary mode and
            its bytes are POSTed to the ``microsoft/trocr-base-handwritten``
            inference endpoint.

    Returns:
        The decoded JSON body of the API response, unmodified.

    Raises:
        RuntimeError: If no API token is available in the environment.
        requests.HTTPError: If the API replies with an error status code.
    """
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"

    # SECURITY: an API token was previously hard-coded on this line. A token
    # that has been committed must be treated as leaked and revoked; read the
    # replacement from the environment (the module already loads a .env file).
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "No Hugging Face API token configured: set HUGGINGFACEHUB_API_TOKEN "
            "(or HF_TOKEN) in the environment or in the .env file."
        )
    headers = {"Authorization": f"Bearer {token}"}

    with open(image, "rb") as f:
        data = f.read()

    # A timeout keeps a stalled connection from hanging the UI indefinitely.
    response = requests.post(API_URL, headers=headers, data=data, timeout=60)
    # Fail loudly on 4xx/5xx instead of handing an error payload to the caller.
    response.raise_for_status()
    return response.json()
def generate_story(scenario):
    """Ask an OpenAI model, through a LangChain ``LLMChain``, for a short poem.

    Args:
        scenario: Value substituted into the ``{scenario}`` placeholder of the
            prompt; the prompt describes it as a single word.

    Returns:
        The text produced by ``LLMChain.predict`` for the filled-in prompt.
    """
    # Imported locally on purpose: the module-level ``import OpenAI`` rebinds
    # the name ``OpenAI`` after the langchain import, so the global name is not
    # guaranteed to be the LangChain LLM class.
    from langchain import OpenAI as LangChainOpenAI

    # Same layout as before (leading and trailing newline included); only the
    # spelling of "Shakespeare" in the first sentence has been corrected.
    template = (
        "\n"
        'Consider yourself as the famous poet "William Shakespeare";\n'
        "You can generate a poem in Shakespeare's tone based on a single word, "
        "the poem should be no more than 4 lines in length;\n"
        "CONTEXT: {scenario}\n"
        "POEM:\n"
    )
    prompt = PromptTemplate(template=template, input_variables=["scenario"])
    story_llm = LLMChain(
        llm=LangChainOpenAI(model_name="gpt-3.5-turbo", temperature=1),
        prompt=prompt,
        verbose=True,
    )
    story = story_llm.predict(scenario=scenario)
    print(story)
    return story
# Populated on first use by _load_tts_components() and reused afterwards, so
# the models and the speaker-embedding dataset are not reloaded per request.
_TTS_COMPONENTS = {}


def _load_tts_components():
    """Return the SpeechT5 processor, model, vocoder and speaker embedding, loading them once."""
    if not _TTS_COMPONENTS:
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        _TTS_COMPONENTS.update(
            processor=SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
            model=SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
            vocoder=SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
            # Entry 7306 of the validation split supplies the voice; unsqueeze
            # adds the leading dimension before it is passed to generate_speech.
            speaker_embeddings=torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0),
        )
    return _TTS_COMPONENTS


def recite_the_poem(content):
    """Synthesise speech for ``content`` with SpeechT5 and return it as WAV bytes.

    This replaces two consecutive definitions of the same name. The first
    (which returned the path ``"speech.wav"``) was always overridden by the
    second (which returned the file's bytes), so the bytes-returning behaviour
    is the one kept.

    Args:
        content: Text to convert to speech.

    Returns:
        bytes: The audio encoded as a WAV file with a 16000 Hz sample rate.
    """
    tts = _load_tts_components()
    inputs = tts["processor"](text=content, return_tensors="pt")
    speech = tts["model"].generate_speech(
        inputs["input_ids"], tts["speaker_embeddings"], vocoder=tts["vocoder"]
    )

    # Encode in memory rather than through a shared "speech.wav" on disk, so
    # overlapping requests cannot overwrite each other's audio. ``format`` must
    # be given explicitly because there is no file extension to infer it from.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, speech.numpy(), samplerate=16000, format="WAV")
    # NOTE(review): callers pass these bytes to a Gradio "audio" output --
    # confirm the installed Gradio version accepts raw bytes there.
    return wav_buffer.getvalue()
def _extract_ocr_text(ocr_response):
    """Reduce the OCR API's JSON payload to the recognised text as a plain string."""
    if isinstance(ocr_response, str):
        return ocr_response.strip()
    if isinstance(ocr_response, dict):
        if "error" in ocr_response:
            raise ValueError(f"Handwriting recognition failed: {ocr_response['error']}")
        if "generated_text" in ocr_response:
            return str(ocr_response["generated_text"]).strip()
        # Unknown shape: fall back to the textual form, as the old code did.
        return str(ocr_response)
    if isinstance(ocr_response, (list, tuple)):
        pieces = (_extract_ocr_text(item) for item in ocr_response)
        return " ".join(piece for piece in pieces if piece)
    return str(ocr_response)


def main_model(image):
    """Turn a handwritten-word image into a poem and its spoken rendition.

    Pipeline: save the image to a temporary PNG, run OCR on it, generate a
    poem from the recognised text, then synthesise audio for the poem.

    Args:
        image: Pixel data from the Gradio "image" input; it is cast to
            ``uint8`` and wrapped with ``PIL.Image.fromarray``.

    Returns:
        tuple: ``(poem, audio_data)`` for the "text" and "audio" outputs.

    Raises:
        ValueError: If no image was supplied, or the OCR payload carries an
            ``"error"`` entry.
    """
    import tempfile

    if image is None:
        raise ValueError("No image was provided.")

    # A per-call temporary directory replaces the fixed "temp_image.png" path,
    # which overlapping requests would overwrite and which was never removed.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "temp_image.png")
        Image.fromarray(np.uint8(image)).save(image_path)
        ocr_response = handwriting_to_text(image_path)

    # Previously the raw JSON payload was interpolated into the prompt.
    # Presumably the API answers with [{"generated_text": "..."}] -- TODO
    # confirm; other shapes fall back to their string form.
    text = _extract_ocr_text(ocr_response)

    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
# Gradio front end: one image input, routed through main_model, producing the
# poem text and its audio rendition.
iface = gr.Interface(
    main_model,
    "image",
    ["text", "audio"],
    description="Upload the image generated from Model-P101/M101.",
    title="Flying Shakespeare",
)

# Start the web server only when run as a script; share=True also requests a
# public share link from Gradio.
if __name__ == "__main__":
    iface.launch(share=True)