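# Hosting-page header captured together with this file (not part of the program):
#   uploader avatar: ahmad4raza
#   commit: "Create main.py" (8acab6d)
#   file size: 3.16 kB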
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain import PromptTemplate, LLMChain, OpenAI
import requests
import os
import io
from datasets import load_dataset
import torch
import soundfile as sf
import gradio as gr
from PIL import Image
import numpy as np
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
# Lazily created OCR pipeline. It is kept at module level so the TrOCR
# weights are loaded once per process instead of once per request.
_HANDWRITING_PIPELINE = None


def handwriting_to_text(url):
    """Run handwriting recognition on an image.

    Args:
        url: Image reference handed unchanged to the Hugging Face
            ``image-to-text`` pipeline. Within this file the caller passes
            a local file path.

    Returns:
        The pipeline's raw output, unmodified. NOTE(review): for
        ``image-to-text`` this is presumably a list of dicts carrying a
        ``"generated_text"`` key -- confirm against the installed
        transformers version.
    """
    global _HANDWRITING_PIPELINE
    if _HANDWRITING_PIPELINE is None:
        # Building the pipeline loads the model, so only do it on first use.
        _HANDWRITING_PIPELINE = pipeline(
            "image-to-text", model="microsoft/trocr-base-handwritten"
        )
    return _HANDWRITING_PIPELINE(url)
def generate_story(scenario):
    """Ask the LLM for a short Shakespeare-style poem about ``scenario``.

    Args:
        scenario: Text substituted into the ``{scenario}`` slot of the prompt.

    Returns:
        The text produced by the chain's ``predict`` call. The same text is
        also printed to stdout.
    """
    # The prompt text is sent to the model verbatim; keep it flush-left so no
    # extra indentation leaks into the request.
    poem_prompt = PromptTemplate(
        template="""
Consider yourself as the famous poet "William Shakespere";
You can generate a poem in Shakespeare's tone based on a single word, the poem should be no more than 4 lines in length;
CONTEXT: {scenario}
POEM:
""",
        input_variables=["scenario"],
    )
    language_model = OpenAI(model_name="gpt-3.5-turbo", temperature=1)
    chain = LLMChain(llm=language_model, prompt=poem_prompt, verbose=True)

    poem = chain.predict(scenario=scenario)
    print(poem)
    return poem
# Text-to-speech resources, filled in on first use so the SpeechT5 models and
# the speaker-embedding dataset are loaded once per process, not per request.
_TTS_RESOURCES = {}

# Row of the x-vector dataset whose embedding is used as the speaker voice.
_TTS_SPEAKER_INDEX = 7306

# Sample rate handed to the WAV writer.
_TTS_SAMPLE_RATE = 16000


def _load_tts_resources():
    """Return the cached SpeechT5 processor, model, vocoder and speaker embedding."""
    if not _TTS_RESOURCES:
        xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        loaded = {
            "processor": SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
            "model": SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
            "vocoder": SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
            "speaker_embeddings": torch.tensor(
                xvectors[_TTS_SPEAKER_INDEX]["xvector"]
            ).unsqueeze(0),
        }
        # Publish everything in one step so a concurrent caller never sees a
        # half-populated cache.
        _TTS_RESOURCES.update(loaded)
    return _TTS_RESOURCES


def recite_the_poem(content):
    """Synthesize speech for ``content`` and return it as WAV-encoded bytes.

    This replaces two same-named definitions; the earlier one (which returned
    the path ``"speech.wav"``) was shadowed by the later one and never ran.
    The return value matches the later, effective definition.

    Args:
        content: Text to be spoken.

    Returns:
        bytes: The contents of a WAV file written at 16000 Hz.
    """
    resources = _load_tts_resources()
    inputs = resources["processor"](text=content, return_tensors="pt")
    speech = resources["model"].generate_speech(
        inputs["input_ids"],
        resources["speaker_embeddings"],
        vocoder=resources["vocoder"],
    )

    # Encode in memory instead of via a fixed "speech.wav" in the working
    # directory: a shared file name lets concurrent requests overwrite each
    # other's audio and leaves a stray file behind.
    wav_buffer = io.BytesIO()
    # soundfile cannot infer the container from a file-like object, so the
    # format has to be named explicitly.
    sf.write(wav_buffer, speech.numpy(), samplerate=_TTS_SAMPLE_RATE, format="WAV")
    return wav_buffer.getvalue()
def _extract_generated_text(recognised):
    """Flatten OCR output into a plain string suitable for the poem prompt."""
    if isinstance(recognised, str):
        return recognised
    fragments = []
    for entry in recognised:
        if isinstance(entry, dict):
            fragment = str(entry.get("generated_text", ""))
        else:
            fragment = str(entry)
        fragment = fragment.strip()
        if fragment:
            fragments.append(fragment)
    return " ".join(fragments)


def main_model(image):
    """Turn an image of handwriting into a poem and its spoken rendition.

    Pipeline: image -> recognised text -> generated poem -> synthesized audio.

    Args:
        image: Array-like pixel data; it is cast with ``np.uint8`` and
            converted to a PIL image.

    Returns:
        tuple: ``(poem, audio_data)`` -- the generated poem and whatever
        ``recite_the_poem`` returns for it.

    Raises:
        ValueError: If ``image`` is ``None`` (nothing was supplied).
    """
    import tempfile  # local import keeps this block self-contained

    if image is None:
        raise ValueError("No image was provided; please upload an image.")

    picture = Image.fromarray(np.uint8(image))

    # Use a per-call temporary directory rather than a fixed "temp_image.png"
    # so concurrent requests cannot overwrite each other's input, and so the
    # file is removed once recognition has finished.
    with tempfile.TemporaryDirectory() as workdir:
        image_path = os.path.join(workdir, "temp_image.png")
        picture.save(image_path)
        recognised = handwriting_to_text(image_path)

    # The recogniser may hand back a structured result (presumably a list of
    # {"generated_text": ...} dicts); feed only the recognised words to the
    # prompt instead of the Python repr of that structure.
    text = _extract_generated_text(recognised)

    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
# Gradio front end: one image input, wired to main_model, which yields the
# poem text and the recited audio shown in the two output components.
iface = gr.Interface(
    main_model,
    "image",
    ["text", "audio"],
    title="Handwriting to Shakespearean Poem",
    description="Upload an image containing handwritten text to generate a Shakespearean poem and play the recited poem.",
)

# Start the web UI only when run as a script. share=True is passed to
# launch() to request a shareable link in addition to the local server.
if __name__ == "__main__":
    iface.launch(share=True)