ahmad4raza committed on
Commit
8acab6d
·
1 Parent(s): bab4993

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +91 -0
main.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
2
+ from langchain import PromptTemplate, LLMChain, OpenAI
3
+ import requests
4
+ import os
5
+ import io
6
+ from datasets import load_dataset
7
+ import torch
8
+ import soundfile as sf
9
+ import gradio as gr
10
+ from PIL import Image
11
+ import numpy as np
12
+
13
+ from dotenv import load_dotenv, find_dotenv
14
+ load_dotenv(find_dotenv())
15
+
16
def handwriting_to_text(url):
    """Run handwriting recognition on an image and return the pipeline output.

    Args:
        url: Image reference handed unchanged to the ``image-to-text``
            pipeline (the caller in this file passes a local file path).

    Returns:
        Whatever the pipeline call returns, unmodified.
    """
    # The TrOCR pipeline is constructed on every call.
    recognizer = pipeline(
        "image-to-text",
        model="microsoft/trocr-base-handwritten",
    )
    return recognizer(url)
20
+
21
def generate_story(scenario):
    """Generate a short Shakespeare-styled poem for the given context.

    The context is substituted into a prompt template and sent through an
    ``LLMChain`` backed by the ``gpt-3.5-turbo`` model at temperature 1.

    Args:
        scenario: Text inserted at the ``{scenario}`` placeholder of the
            prompt.

    Returns:
        The text produced by the chain's ``predict`` call. It is also printed
        to stdout before being returned.
    """
    # The poet's name was misspelled ("Shakespere") in the prompt sent to the
    # model; it is spelled correctly here.
    template = """
    Consider yourself as the famous poet "William Shakespeare";
    You can generate a poem in Shakespeare's tone based on a single word, the poem should be no more than 4 lines in length;

    CONTEXT: {scenario}
    POEM:
    """

    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    story_llm = LLMChain(
        llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1),
        prompt=prompt,
        verbose=True,
    )
    story = story_llm.predict(scenario=scenario)

    print(story)
    return story
37
+
38
def recite_the_poem(content):
    """Synthesize speech for ``content`` and return the WAV file's bytes.

    This function used to be defined twice in a row; the first definition
    (which returned the path ``"speech.wav"``) was silently replaced by the
    second (which returned the file contents). Only the effective, second
    behaviour is kept.

    Args:
        content: Text to be spoken.

    Returns:
        bytes: Contents of ``speech.wav`` after the synthesized audio has
        been written to it.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=content, return_tensors="pt")

    # Speaker voice: entry 7306 of the x-vector dataset, with a leading
    # dimension added by unsqueeze(0).
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # The audio is written to a fixed file name in the working directory and
    # then read back as raw bytes.
    sf.write("speech.wav", speech.numpy(), samplerate=16000)

    with open("speech.wav", "rb") as audio_file:
        audio_data = audio_file.read()

    return audio_data
72
+
73
def main_model(image):
    """Turn an image of handwriting into a poem and its spoken audio.

    Steps: save the image to ``temp_image.png``, recognize the handwriting,
    generate a poem from the recognized text, then synthesize the poem.

    Args:
        image: Image data convertible with ``np.uint8`` and
            ``Image.fromarray`` (presumably the array Gradio supplies for an
            ``"image"`` input -- verify against the installed Gradio version).

    Returns:
        tuple: ``(poem, audio_data)`` -- the generated poem and the value
        returned by ``recite_the_poem`` for it.
    """
    picture = Image.fromarray(np.uint8(image))
    image_path = "temp_image.png"
    picture.save(image_path)

    recognized = handwriting_to_text(image_path)

    # The image-to-text pipeline yields a list of {"generated_text": ...}
    # dicts. Previously that whole structure was formatted into the prompt;
    # pull out the recognized string so the prompt receives plain text.
    # Anything of an unexpected shape is passed through unchanged.
    text = recognized
    if isinstance(recognized, (list, tuple)) and recognized:
        first = recognized[0]
        if isinstance(first, dict) and "generated_text" in first:
            text = first["generated_text"]

    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
81
+
82
# Gradio UI: one image input, a text output and an audio output, all served
# by main_model.
iface = gr.Interface(
    main_model,
    "image",
    ["text", "audio"],
    title="Handwriting to Shakespearean Poem",
    description="Upload an image containing handwritten text to generate a Shakespearean poem and play the recited poem.",
)

# Launch the app only when this file is run as a script; share=True is passed
# to launch().
if __name__ == "__main__":
    iface.launch(share=True)