vsanchezn committed on
Commit
82e3a35
·
verified ·
1 Parent(s): 7fef7c2

app and requirements uploaded

Browse files
Files changed (2) hide show
  1. app.py +92 -0
  2. requirements.txt +95 -0
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import find_dotenv, load_dotenv # get the API keys
2
+ from transformers import pipeline # download huggingface model to our machine
3
+ from langchain_core.prompts import PromptTemplate
4
+ from langchain_community.chat_models import ChatOpenAI
5
+ from langchain.chains import LLMChain
6
+ import requests
7
+ import os
8
+ import streamlit as st
9
+
10
+ load_dotenv(find_dotenv())
11
+ HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
12
+
13
+ # img2text
14
+
15
def img2text(url):
    """Caption an image with the BLIP image-captioning model.

    Parameters
    ----------
    url : str
        Path (or URL) of the image file to caption.

    Returns
    -------
    str
        The generated caption text.
    """
    # Build the pipeline once and memoize it on the function object: the
    # original recreated the (large) BLIP model on every call, so each
    # Streamlit upload re-initialized the weights from scratch.
    if not hasattr(img2text, "_captioner"):
        img2text._captioner = pipeline(
            "image-to-text", model="Salesforce/blip-image-captioning-large")

    text = img2text._captioner(url)[0]["generated_text"]

    print(text)
    return text
22
+
23
+
24
+ # llm
25
def generate_story(scenario):
    """Generate a very short story (<= 20 words) from an image caption.

    Parameters
    ----------
    scenario : str
        The narrative seed, e.g. a caption produced by ``img2text``.

    Returns
    -------
    str
        The story text produced by the chat model.
    """
    # Prompt that constrains the model to a 20-word story.
    template = """
    You are a story teller;
    You can generate a short story based on a single narrative, the story should be no more than 20 words;

    CONTEXT: {scenario}
    STORY:
    """
    story_prompt = PromptTemplate(template=template, input_variables=["scenario"])

    # Chat model + chain; verbose=True echoes the rendered prompt for debugging.
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)
    chain = LLMChain(llm=chat_model, prompt=story_prompt, verbose=True)

    story = chain.predict(scenario=scenario)

    print(story)
    return story
45
+
46
+
47
+ # text to speech
48
def text2speech(message):
    """Synthesize *message* to speech via the Hugging Face inference API.

    Posts the text to the hosted VITS TTS model and writes the returned
    audio bytes to ``audio.wav`` in the current working directory.

    Parameters
    ----------
    message : str
        Text to synthesize.

    Raises
    ------
    requests.HTTPError
        If the inference API responds with an error status. (Previously the
        error body — JSON, not audio — was silently written to audio.wav.)
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payloads = {"inputs": message}
    response = requests.post(API_URL, headers=headers, json=payloads)
    # Fail loudly on API errors instead of saving an error payload as audio.
    response.raise_for_status()

    with open("audio.wav", 'wb') as file:  # for me .wav worked instead of .flac
        file.write(response.content)
56
+
57
+ # scenario = img2text("photo.jpg")
58
+ # story = generate_story(scenario)
59
+ # text2speech(story)
60
+
61
+
62
+
63
+
64
+
65
+ # main function for UI layer
66
def main():
    """Streamlit UI: upload an image, caption it, generate a story, play audio.

    Side effects: writes the uploaded image to the working directory and
    (via ``text2speech``) writes ``audio.wav`` next to it.
    """
    st.set_page_config(page_title="Image 2 Audio Story", page_icon="🩵")

    st.header("Turn image into a short audio story")
    # Generalized: accept common image extensions, not only ".jpg".
    uploaded_file = st.file_uploader("Choose an image...",
                                     type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        print(uploaded_file)
        bytes_data = uploaded_file.getvalue()
        # basename() strips any path components from the client-supplied
        # filename so we never write outside the working directory.
        local_name = os.path.basename(uploaded_file.name)
        with open(local_name, "wb") as file:
            file.write(bytes_data)
        st.image(uploaded_file, caption="Uploaded Image.",
                 use_container_width=True)

        # Pipeline: image -> caption -> short story -> spoken audio.
        scenario = img2text(local_name)
        story = generate_story(scenario)
        text2speech(story)

        with st.expander("scenario"):
            st.write(scenario)
        with st.expander("story"):
            st.write(story)

        st.audio("audio.wav")


if __name__ == '__main__':
    main()
requirements.txt ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.4.4
2
+ aiohttp==3.11.9
3
+ aiosignal==1.3.1
4
+ altair==5.5.0
5
+ annotated-types==0.7.0
6
+ anyio==4.6.2.post1
7
+ async-timeout==4.0.3
8
+ attrs==24.2.0
9
+ blinker==1.9.0
10
+ cachetools==5.5.0
11
+ certifi==2024.8.30
12
+ charset-normalizer==3.4.0
13
+ click==8.1.7
14
+ dataclasses-json==0.6.7
15
+ distro==1.9.0
16
+ exceptiongroup==1.2.2
17
+ filelock==3.16.1
18
+ frozenlist==1.5.0
19
+ fsspec==2024.10.0
20
+ gitdb==4.0.11
21
+ GitPython==3.1.43
22
+ h11==0.14.0
23
+ httpcore==1.0.7
24
+ httpx==0.28.0
25
+ httpx-sse==0.4.0
26
+ huggingface-hub==0.26.3
27
+ idna==3.10
28
+ Jinja2==3.1.4
29
+ jiter==0.8.0
30
+ jsonpatch==1.33
31
+ jsonpointer==3.0.0
32
+ jsonschema==4.23.0
33
+ jsonschema-specifications==2024.10.1
34
+ langchain==0.3.9
35
+ langchain-community==0.3.9
36
+ langchain-core==0.3.21
37
+ langchain-openai==0.2.11
38
+ langchain-text-splitters==0.3.2
39
+ langsmith==0.1.147
40
+ markdown-it-py==3.0.0
41
+ MarkupSafe==3.0.2
42
+ marshmallow==3.23.1
43
+ mdurl==0.1.2
44
+ mpmath==1.3.0
45
+ multidict==6.1.0
46
+ mypy-extensions==1.0.0
47
+ narwhals==1.15.2
48
+ networkx==3.4.2
49
+ numpy==1.26.4
50
+ openai==1.56.2
51
+ orjson==3.10.12
52
+ packaging==24.2
53
+ pandas==2.2.3
54
+ pillow==11.0.0
55
+ propcache==0.2.1
56
+ protobuf==5.29.1
57
+ pyarrow==18.1.0
58
+ pydantic==2.10.3
59
+ pydantic-settings==2.6.1
60
+ pydantic_core==2.27.1
61
+ pydeck==0.9.1
62
+ pydub==0.25.1
63
+ Pygments==2.18.0
64
+ python-dateutil==2.9.0.post0
65
+ python-dotenv==1.0.1
66
+ pytz==2024.2
67
+ PyYAML==6.0.2
68
+ referencing==0.35.1
69
+ regex==2024.11.6
70
+ requests==2.32.3
71
+ requests-toolbelt==1.0.0
72
+ rich==13.9.4
73
+ rpds-py==0.22.3
74
+ safetensors==0.4.5
75
+ six==1.17.0
76
+ smmap==5.0.1
77
+ sniffio==1.3.1
78
+ SQLAlchemy==2.0.36
79
+ streamlit==1.40.2
80
+ sympy==1.13.1
81
+ tenacity==9.0.0
82
+ tiktoken==0.8.0
83
+ tokenizers==0.20.3
84
+ toml==0.10.2
85
+ torch==2.5.1
86
+ torchaudio==2.5.1
87
+ torchvision==0.20.1
88
+ tornado==6.4.2
89
+ tqdm==4.67.1
90
+ transformers==4.46.3
91
+ typing-inspect==0.9.0
92
+ typing_extensions==4.12.2
93
+ tzdata==2024.2
94
+ urllib3==2.2.3
95
+ yarl==1.18.3