"""Gradio app: handwritten word -> Shakespeare-style poem -> spoken audio.

Pipeline, in order:
  1. ``handwriting_to_text`` sends the image to the hosted TrOCR model.
  2. ``generate_story`` asks an OpenAI model (via LangChain) for a short poem.
  3. ``recite_the_poem`` synthesises the poem with SpeechT5 and returns WAV bytes.

Credentials are read from the environment (a ``.env`` file is loaded at import
time); nothing secret is stored in this file.
"""

import functools
import io
import os
import tempfile

import gradio as gr
import numpy as np
import requests
import soundfile as sf
import torch
from datasets import load_dataset
from dotenv import find_dotenv, load_dotenv
# `OpenAI` must stay bound to the LangChain LLM wrapper: generate_story()
# instantiates it as a class.
from langchain import LLMChain, OpenAI, PromptTemplate
from PIL import Image
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
    pipeline,
)

load_dotenv(find_dotenv())

TROCR_API_URL = (
    "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"
)
# Name of the environment variable holding the Hugging Face API token.
HF_TOKEN_ENV_VAR = "HUGGINGFACEHUB_API_TOKEN"
REQUEST_TIMEOUT_SECONDS = 60
TTS_SAMPLE_RATE = 16000
# Row of the "Matthijs/cmu-arctic-xvectors" validation split used as the voice.
SPEAKER_EMBEDDING_INDEX = 7306


def handwriting_to_text(image):
    """Send an image file to the hosted TrOCR model and return its JSON reply.

    Args:
        image: Path of the image file to upload.

    Returns:
        The decoded JSON body of the Inference API response, unmodified.

    Raises:
        RuntimeError: If the Hugging Face token is not set in the environment.
        requests.HTTPError: If the API answers with an error status.
    """
    token = os.getenv(HF_TOKEN_ENV_VAR)
    if not token:
        raise RuntimeError(
            f"Set the {HF_TOKEN_ENV_VAR} environment variable (e.g. in .env) "
            "to a Hugging Face API token."
        )

    with open(image, "rb") as f:
        data = f.read()

    response = requests.post(
        TROCR_API_URL,
        headers={"Authorization": f"Bearer {token}"},
        data=data,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail loudly instead of handing an error payload to the poem prompt.
    response.raise_for_status()
    return response.json()


def _extract_generated_text(result):
    """Return the recognised text from an Inference API reply.

    The reply is expected to look like ``[{"generated_text": "..."}]``; any
    other shape is stringified so the caller still gets a usable prompt value.
    """
    if isinstance(result, list) and result and isinstance(result[0], dict):
        text = result[0].get("generated_text")
        if text is not None:
            return text
    return str(result)


def generate_story(scenario):
    """Generate a short Shakespeare-style poem for ``scenario``.

    Args:
        scenario: Text substituted into the prompt's CONTEXT slot.

    Returns:
        The poem text produced by the language model.
    """
    template = """
    Consider yourself as the famous poet "William Shakespere";
    You can generate a poem in Shakespeare's tone based on a single word, the poem should be no more than 4 lines in length;

    CONTEXT: {scenario}
    POEM:
    """
    prompt = PromptTemplate(template=template, input_variables=["scenario"])
    story_llm = LLMChain(
        llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1),
        prompt=prompt,
        verbose=True,
    )
    story = story_llm.predict(scenario=scenario)
    print(story)
    return story


@functools.lru_cache(maxsize=1)
def _load_tts_components():
    """Load the SpeechT5 processor, model, vocoder and speaker embedding once.

    Cached so that repeated requests reuse the already-loaded objects instead
    of reloading them on every call.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings_dataset = load_dataset(
        "Matthijs/cmu-arctic-xvectors", split="validation"
    )
    speaker_embeddings = torch.tensor(
        embeddings_dataset[SPEAKER_EMBEDDING_INDEX]["xvector"]
    ).unsqueeze(0)
    return processor, model, vocoder, speaker_embeddings


def recite_the_poem(content):
    """Synthesise ``content`` as speech and return the audio as WAV bytes.

    Args:
        content: Text to speak.

    Returns:
        The bytes of a WAV file sampled at ``TTS_SAMPLE_RATE`` Hz.
    """
    processor, model, vocoder, speaker_embeddings = _load_tts_components()
    inputs = processor(text=content, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Encode in memory: a fixed "speech.wav" on disk would be shared (and
    # overwritten) by every concurrent request.
    buffer = io.BytesIO()
    sf.write(buffer, speech.numpy(), samplerate=TTS_SAMPLE_RATE, format="WAV")
    return buffer.getvalue()


def main_model(image):
    """Run the full pipeline for one uploaded image.

    Args:
        image: Image pixel data as delivered by the Gradio "image" input
            (converted with ``np.uint8`` before saving).

    Returns:
        A ``(poem, audio_data)`` tuple: the poem text and the WAV bytes of its
        recitation.

    Raises:
        ValueError: If no image was supplied.
    """
    if image is None:
        raise ValueError("No image was provided.")

    # A per-request temporary directory avoids clashes between concurrent
    # requests and removes the scratch file afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "temp_image.png")
        Image.fromarray(np.uint8(image)).save(image_path)
        recognised = handwriting_to_text(image_path)

    poem = generate_story(_extract_generated_text(recognised))
    audio_data = recite_the_poem(poem)
    return poem, audio_data


iface = gr.Interface(
    fn=main_model,
    inputs="image",
    outputs=["text", "audio"],
    title="Flying Shakespeare",
    description="Upload the image generated from Model-P101/M101.",
)

if __name__ == "__main__":
    iface.launch()