Spaces:
Sleeping
Sleeping
app and requirements uploaded
Browse files- app.py +92 -0
- requirements.txt +95 -0
app.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import find_dotenv, load_dotenv # get the API keys
|
2 |
+
from transformers import pipeline # download huggingface model to our machine
|
3 |
+
from langchain_core.prompts import PromptTemplate
|
4 |
+
from langchain_community.chat_models import ChatOpenAI
|
5 |
+
from langchain.chains import LLMChain
|
6 |
+
import requests
|
7 |
+
import os
|
8 |
+
import streamlit as st
|
9 |
+
|
10 |
+
load_dotenv(find_dotenv())  # load API keys (OpenAI / HF) from a local .env file
# Token used to authenticate against the Hugging Face Inference API in text2speech().
# NOTE(review): may be None if the .env file is missing — verify before deploying.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
12 |
+
|
13 |
+
# img2text
|
14 |
+
|
15 |
+
def img2text(url):
    """Caption an image with the BLIP image-captioning model.

    Args:
        url: Path to a local image file (or an image URL the pipeline accepts).

    Returns:
        The generated caption string.
    """
    # Building the pipeline is expensive (downloads/loads model weights), so
    # cache it on the function object and reuse it across calls instead of
    # re-creating it for every captioned image.
    if not hasattr(img2text, "_pipeline"):
        img2text._pipeline = pipeline(
            "image-to-text", model="Salesforce/blip-image-captioning-large")

    text = img2text._pipeline(url)[0]["generated_text"]

    print(text)
    return text
|
22 |
+
|
23 |
+
|
24 |
+
# llm
|
25 |
+
def generate_story(scenario):
    """Spin a very short story (<= 20 words) from an image caption.

    Args:
        scenario: One-line scene description, typically from img2text().

    Returns:
        The generated story text.
    """
    # Prompt instructing the model to act as a storyteller.
    template = """
You are a story teller;
You can generate a short story based on a single narrative, the story should be no more than 20 words;

CONTEXT: {scenario}
STORY:
"""
    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    # Chain the prompt into GPT-3.5 at temperature 1 for creative output.
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)
    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

    story = story_llm.predict(scenario=scenario)

    print(story)
    return story
|
45 |
+
|
46 |
+
|
47 |
+
# text to speech
|
48 |
+
def text2speech(message):
    """Synthesize `message` to speech via the HF Inference API.

    Writes the resulting audio bytes to ``audio.wav`` in the working directory.

    Args:
        message: Text to synthesize.

    Raises:
        requests.HTTPError: If the Inference API returns an error status.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payloads = {"inputs": message}
    # Timeout guards against the Streamlit worker hanging on a stalled request.
    response = requests.post(API_URL, headers=headers, json=payloads, timeout=60)
    # Fail loudly on API errors: the original wrote response.content
    # unconditionally, so a 4xx/5xx JSON error body silently became a
    # corrupt "audio.wav" that st.audio could not play.
    response.raise_for_status()

    with open("audio.wav", 'wb') as file:  # for me .wav worked instead of .flac
        file.write(response.content)
|
56 |
+
|
57 |
+
# scenario = img2text("photo.jpg")
|
58 |
+
# story = generate_story(scenario)
|
59 |
+
# text2speech(story)
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
+
# main function for UI layer
|
66 |
+
# main function for UI layer
def main():
    """Streamlit UI: upload an image, caption it, generate a story, play audio.

    Pipeline: img2text (BLIP caption) -> generate_story (GPT-3.5) ->
    text2speech (HF Inference API) -> st.audio playback.
    """
    st.set_page_config(page_title="Image 2 Audio Story", page_icon="🩵")

    st.header("Turn image into a short audio story")
    # Accept the common raster formats; BLIP captions jpg/jpeg/png alike.
    # (Previously only "jpg" was allowed, which rejected .jpeg/.png uploads.)
    uploaded_file = st.file_uploader("Choose an image...",
                                     type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        print(uploaded_file)
        bytes_data = uploaded_file.getvalue()
        # Persist the upload so the captioning pipeline can read it from disk.
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)
        st.image(uploaded_file, caption="Uploaded Image.",
                 use_container_width=True)

        scenario = img2text(uploaded_file.name)
        story = generate_story(scenario)
        text2speech(story)

        # Show the intermediate caption and story for transparency/debugging.
        with st.expander("scenario"):
            st.write(scenario)
        with st.expander("story"):
            st.write(story)

        st.audio("audio.wav")
|
90 |
+
|
91 |
+
# Script entry point: run the Streamlit UI when executed directly.
if __name__ == '__main__':
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohappyeyeballs==2.4.4
|
2 |
+
aiohttp==3.11.9
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==5.5.0
|
5 |
+
annotated-types==0.7.0
|
6 |
+
anyio==4.6.2.post1
|
7 |
+
async-timeout==4.0.3
|
8 |
+
attrs==24.2.0
|
9 |
+
blinker==1.9.0
|
10 |
+
cachetools==5.5.0
|
11 |
+
certifi==2024.8.30
|
12 |
+
charset-normalizer==3.4.0
|
13 |
+
click==8.1.7
|
14 |
+
dataclasses-json==0.6.7
|
15 |
+
distro==1.9.0
|
16 |
+
exceptiongroup==1.2.2
|
17 |
+
filelock==3.16.1
|
18 |
+
frozenlist==1.5.0
|
19 |
+
fsspec==2024.10.0
|
20 |
+
gitdb==4.0.11
|
21 |
+
GitPython==3.1.43
|
22 |
+
h11==0.14.0
|
23 |
+
httpcore==1.0.7
|
24 |
+
httpx==0.28.0
|
25 |
+
httpx-sse==0.4.0
|
26 |
+
huggingface-hub==0.26.3
|
27 |
+
idna==3.10
|
28 |
+
Jinja2==3.1.4
|
29 |
+
jiter==0.8.0
|
30 |
+
jsonpatch==1.33
|
31 |
+
jsonpointer==3.0.0
|
32 |
+
jsonschema==4.23.0
|
33 |
+
jsonschema-specifications==2024.10.1
|
34 |
+
langchain==0.3.9
|
35 |
+
langchain-community==0.3.9
|
36 |
+
langchain-core==0.3.21
|
37 |
+
langchain-openai==0.2.11
|
38 |
+
langchain-text-splitters==0.3.2
|
39 |
+
langsmith==0.1.147
|
40 |
+
markdown-it-py==3.0.0
|
41 |
+
MarkupSafe==3.0.2
|
42 |
+
marshmallow==3.23.1
|
43 |
+
mdurl==0.1.2
|
44 |
+
mpmath==1.3.0
|
45 |
+
multidict==6.1.0
|
46 |
+
mypy-extensions==1.0.0
|
47 |
+
narwhals==1.15.2
|
48 |
+
networkx==3.4.2
|
49 |
+
numpy==1.26.4
|
50 |
+
openai==1.56.2
|
51 |
+
orjson==3.10.12
|
52 |
+
packaging==24.2
|
53 |
+
pandas==2.2.3
|
54 |
+
pillow==11.0.0
|
55 |
+
propcache==0.2.1
|
56 |
+
protobuf==5.29.1
|
57 |
+
pyarrow==18.1.0
|
58 |
+
pydantic==2.10.3
|
59 |
+
pydantic-settings==2.6.1
|
60 |
+
pydantic_core==2.27.1
|
61 |
+
pydeck==0.9.1
|
62 |
+
pydub==0.25.1
|
63 |
+
Pygments==2.18.0
|
64 |
+
python-dateutil==2.9.0.post0
|
65 |
+
python-dotenv==1.0.1
|
66 |
+
pytz==2024.2
|
67 |
+
PyYAML==6.0.2
|
68 |
+
referencing==0.35.1
|
69 |
+
regex==2024.11.6
|
70 |
+
requests==2.32.3
|
71 |
+
requests-toolbelt==1.0.0
|
72 |
+
rich==13.9.4
|
73 |
+
rpds-py==0.22.3
|
74 |
+
safetensors==0.4.5
|
75 |
+
six==1.17.0
|
76 |
+
smmap==5.0.1
|
77 |
+
sniffio==1.3.1
|
78 |
+
SQLAlchemy==2.0.36
|
79 |
+
streamlit==1.40.2
|
80 |
+
sympy==1.13.1
|
81 |
+
tenacity==9.0.0
|
82 |
+
tiktoken==0.8.0
|
83 |
+
tokenizers==0.20.3
|
84 |
+
toml==0.10.2
|
85 |
+
torch==2.5.1
|
86 |
+
torchaudio==2.5.1
|
87 |
+
torchvision==0.20.1
|
88 |
+
tornado==6.4.2
|
89 |
+
tqdm==4.67.1
|
90 |
+
transformers==4.46.3
|
91 |
+
typing-inspect==0.9.0
|
92 |
+
typing_extensions==4.12.2
|
93 |
+
tzdata==2024.2
|
94 |
+
urllib3==2.2.3
|
95 |
+
yarl==1.18.3
|