File size: 2,590 Bytes
8acab6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18a1106
 
942040a
18a1106
 
 
 
8acab6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94dcd08
f0b4393
8acab6d
 
 
6b9ee95
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain import PromptTemplate, LLMChain, OpenAI
import requests
import os
import io
from datasets import load_dataset
import torch
import soundfile as sf
import gradio as gr
from PIL import Image
import numpy as np

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

def handwriting_to_text(image):
    """Send a handwritten-text image to the hosted TrOCR model.

    Args:
        image: Path of the image file to upload; its raw bytes form the
            request body.

    Returns:
        The decoded JSON body of the inference API response, unmodified.

    Raises:
        RuntimeError: If no Hugging Face token is set in the environment.
        requests.HTTPError: If the API replies with an error status code.
    """
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"

    # Credentials must not live in source control. The token is read from the
    # environment (populated from .env by load_dotenv at import time).
    token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "No Hugging Face API token found; set HUGGINGFACEHUB_API_TOKEN "
            "or HF_TOKEN in the environment or in your .env file."
        )
    headers = {"Authorization": f"Bearer {token}"}

    with open(image, "rb") as f:
        data = f.read()

    # A timeout keeps a stalled connection from hanging the UI forever.
    response = requests.post(API_URL, headers=headers, data=data, timeout=60)
    # Surface HTTP failures here instead of handing an error body downstream.
    response.raise_for_status()
    return response.json()

def generate_story(scenario):
    """Ask the LLM for a short Shakespeare-style poem about *scenario*.

    Args:
        scenario: Text substituted into the prompt's CONTEXT slot.

    Returns:
        The poem text produced by the language model.
    """
    # The poet's name is spelled consistently with "Shakespeare's" below so the
    # model is not primed with a misspelling.
    template = """
    Consider yourself as the famous poet "William Shakespeare";
    You can generate a poem in Shakespeare's tone based on a single word, the poem should be no more than 4 lines in length;

    CONTEXT: {scenario}
    POEM:
    """

    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    # temperature=1 is kept from the original configuration.
    story_llm = LLMChain(llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1), prompt=prompt, verbose=True)
    story = story_llm.predict(scenario=scenario)

    print(story)
    return story

# Holds the lazily loaded text-to-speech components so they are fetched and
# initialised once per process instead of once per request.
_TTS_CACHE = {}


def _load_tts_components():
    """Return (processor, model, vocoder, speaker_embeddings), loading them once."""
    if "components" not in _TTS_CACHE:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Entry 7306 is the speaker embedding used for every utterance;
        # unsqueeze(0) adds a leading dimension before it is passed to the model.
        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

        # Store the finished tuple in one assignment so a concurrent caller
        # never observes a half-populated cache.
        _TTS_CACHE["components"] = (processor, model, vocoder, speaker_embeddings)
    return _TTS_CACHE["components"]


def recite_the_poem(content):
    """Synthesise speech for *content* and return it as WAV bytes.

    Args:
        content: The text to be spoken.

    Returns:
        bytes: A WAV-encoded recording written at a 16000 Hz sample rate.
    """
    processor, model, vocoder, speaker_embeddings = _load_tts_components()

    inputs = processor(text=content, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Encode in memory rather than via a fixed "speech.wav" on disk, which
    # simultaneous requests would overwrite. soundfile needs an explicit
    # format when the target is a file-like object.
    buffer = io.BytesIO()
    sf.write(buffer, speech.numpy(), samplerate=16000, format="WAV")
    return buffer.getvalue()

def _extract_recognized_text(api_result):
    """Pull the recognised string out of the OCR endpoint's JSON payload.

    Per the Hugging Face Inference API documentation, an image-to-text model
    replies with a list such as ``[{"generated_text": "..."}]`` and reports
    failures as ``{"error": "..."}``.
    """
    if isinstance(api_result, str):
        return api_result.strip()
    if isinstance(api_result, dict):
        if "error" in api_result:
            raise RuntimeError(f"Handwriting recognition failed: {api_result['error']}")
        api_result = [api_result]
    if isinstance(api_result, list) and api_result and isinstance(api_result[0], dict):
        recognized = api_result[0].get("generated_text")
        if recognized is not None:
            return str(recognized).strip()
    raise RuntimeError(f"Unexpected handwriting recognition response: {api_result!r}")


def main_model(image):
    """Run the full pipeline: handwriting image -> text -> poem -> speech.

    Args:
        image: The uploaded image as an array-like of pixel values, as
            delivered by the Gradio "image" input.

    Returns:
        tuple: ``(poem, audio_data)`` - the generated poem and whatever
        ``recite_the_poem`` returns for it.

    Raises:
        ValueError: If no image was supplied.
        RuntimeError: If the OCR response carries an error or no text.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    if image is None:
        raise ValueError("No image was provided.")

    pil_image = Image.fromarray(np.uint8(image))

    # Use a unique temporary file per request, since a fixed name would be
    # shared between concurrent users and left behind on disk. The handle is
    # closed before saving so the path can be reopened on every platform.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        image_path = tmp.name
    try:
        pil_image.save(image_path)
        # Feed only the recognised words to the LLM, not the raw JSON payload.
        text = _extract_recognized_text(handwriting_to_text(image_path))
    finally:
        os.remove(image_path)

    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data

# Gradio front end: one image input wired to main_model, whose two return
# values are shown as a text box (the poem) and an audio player (the recital).
iface = gr.Interface(
    main_model,
    "image",
    ["text", "audio"],
    title="Flying Shakespeare",
    description="Upload the image generated from the Model:O101-M101/2",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()