Spaces:
Sleeping
Sleeping
File size: 2,616 Bytes
8acab6d 46e71b6 8acab6d 18a1106 3db53e9 18a1106 8acab6d 94dcd08 f0b4393 8acab6d 6b9ee95 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI
import requests
import os
import io
from datasets import load_dataset
import torch
import soundfile as sf
import gradio as gr
from PIL import Image
import numpy as np
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
def handwriting_to_text(image):
    """Transcribe a handwritten image with the hosted TrOCR model.

    Reads the file at ``image`` and posts its raw bytes to the Hugging Face
    Inference API endpoint for ``microsoft/trocr-base-handwritten``.

    Args:
        image: Path of the image file to upload.

    Returns:
        The decoded JSON body of the API response, unmodified.

    Raises:
        OSError: If the image file cannot be read.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"
    # Take the token from the environment (the module loads .env at import
    # time) instead of sending a hard-coded empty bearer token. When the
    # variable is unset the header is "Bearer ", exactly as before.
    # NOTE(review): the variable name is an assumption -- confirm it matches
    # the secret configured for this deployment.
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
    headers = {"Authorization": f"Bearer {token}"}
    with open(image, "rb") as f:
        data = f.read()
    # requests has no default timeout; without one a stalled endpoint would
    # hang the calling request indefinitely.
    response = requests.post(API_URL, headers=headers, data=data, timeout=60)
    # Surface API failures (bad token, model still loading, ...) instead of
    # returning the error payload as if it were a transcription.
    response.raise_for_status()
    return response.json()
def generate_story(scenario):
    """Generate a short Shakespeare-style poem about ``scenario``.

    Fills a fixed prompt with ``scenario`` and runs it through an
    ``LLMChain`` backed by the ``gpt-3.5-turbo`` model at temperature 1.

    Args:
        scenario: Text inserted into the prompt's ``CONTEXT`` slot.

    Returns:
        The text predicted by the chain. It is also printed to stdout.
    """
    # Assembled from explicit literals so the prompt text does not depend on
    # how this function body happens to be indented. The poet's name is
    # spelled "Shakespeare" (the prompt previously read "Shakespere").
    template = (
        "\n"
        'Consider yourself as the famous poet "William Shakespeare";\n'
        "You can generate a poem in Shakespeare's tone based on a single word, "
        "the poem should be no more than 4 lines in length;\n"
        "CONTEXT: {scenario}\n"
        "POEM:\n"
    )
    prompt = PromptTemplate(template=template, input_variables=["scenario"])
    llm = OpenAI(model_name="gpt-3.5-turbo", temperature=1)
    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)
    story = story_llm.predict(scenario=scenario)
    print(story)
    return story
# Filled on first use so the SpeechT5 components are loaded once per process
# rather than on every request.
_TTS_COMPONENTS = {}


def _load_tts_components():
    """Return (processor, model, vocoder, speaker_embeddings), loading them once."""
    if not _TTS_COMPONENTS:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # Populate the cache only after every load succeeded, so a failed
        # load is retried on the next call instead of leaving it half-filled.
        _TTS_COMPONENTS.update(
            processor=processor,
            model=model,
            vocoder=vocoder,
            speaker_embeddings=speaker_embeddings,
        )
    return (
        _TTS_COMPONENTS["processor"],
        _TTS_COMPONENTS["model"],
        _TTS_COMPONENTS["vocoder"],
        _TTS_COMPONENTS["speaker_embeddings"],
    )


def recite_the_poem(content):
    """Synthesize ``content`` as speech and return it as WAV bytes.

    Tokenizes the text with the SpeechT5 processor, generates a waveform
    with the SpeechT5 model and HiFi-GAN vocoder using the x-vector at index
    7306 of the ``Matthijs/cmu-arctic-xvectors`` validation split, and
    encodes the result as a WAV file with a sample rate of 16000.

    Args:
        content: Text to be spoken.

    Returns:
        bytes: The complete contents of the encoded WAV file.
    """
    processor, model, vocoder, speaker_embeddings = _load_tts_components()
    inputs = processor(text=content, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Encode in memory: a fixed "speech.wav" on disk would be overwritten by
    # any other request being served at the same time. The format has to be
    # given explicitly because a buffer has no file extension to infer it from.
    buffer = io.BytesIO()
    sf.write(buffer, speech.numpy(), samplerate=16000, format="WAV")
    return buffer.getvalue()
def _extract_generated_text(ocr_result):
    """Pull the transcription out of an OCR payload, else return its string form."""
    if isinstance(ocr_result, str):
        return ocr_result
    # The hosted image-to-text endpoint is documented to answer with
    # [{"generated_text": "..."}]; a bare dict of the same shape is accepted too.
    if isinstance(ocr_result, list) and ocr_result:
        candidate = ocr_result[0]
    else:
        candidate = ocr_result
    if isinstance(candidate, dict) and "generated_text" in candidate:
        return candidate["generated_text"]
    # Unknown shape: fall back to the payload's string form, which is what
    # the prompt formatting would have produced from the raw payload anyway.
    return str(ocr_result)


def main_model(image):
    """Run the full pipeline: handwriting image -> text -> poem -> speech.

    Args:
        image: Pixel data of the uploaded image; it is converted with
            ``np.uint8`` and ``Image.fromarray`` before being saved as PNG.

    Returns:
        tuple: ``(poem, audio_data)`` -- the generated poem and the value
        returned by ``recite_the_poem`` for it.
    """
    import tempfile

    # A private temporary directory keeps simultaneous requests from
    # overwriting each other's image and removes the file afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = os.path.join(tmp_dir, "temp_image.png")
        Image.fromarray(np.uint8(image)).save(image_path)
        ocr_result = handwriting_to_text(image_path)
    # Feed only the transcribed text into the prompt, not the raw JSON payload.
    text = _extract_generated_text(ocr_result)
    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
# Gradio UI: takes one image and shows the generated poem together with its
# spoken rendition, both produced by main_model.
iface = gr.Interface(
    fn=main_model,
    inputs="image",
    outputs=["text", "audio"],
    title="Flying Shakespeare",
    description="Upload the image generated from the Model:O101-M101/2",
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()