Spaces:
Sleeping
Sleeping
app and requirements uploaded
Browse files- app.py +92 -0
- requirements.txt +95 -0
app.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import find_dotenv, load_dotenv # get the API keys
|
2 |
+
from transformers import pipeline # download huggingface model to our machine
|
3 |
+
from langchain_core.prompts import PromptTemplate
|
4 |
+
from langchain_community.chat_models import ChatOpenAI
|
5 |
+
from langchain.chains import LLMChain
|
6 |
+
import requests
|
7 |
+
import os
|
8 |
+
import streamlit as st
|
9 |
+
|
10 |
+
load_dotenv(find_dotenv())  # load API keys (OpenAI / HF) from a local .env file
# Token used to authenticate against the Hugging Face Inference API in text2speech().
# NOTE(review): may be None if the .env file is missing — verify before deploying.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
|
12 |
+
|
13 |
+
# img2text
|
14 |
+
|
15 |
+
def img2text(url):
    """Caption an image with the BLIP image-captioning model.

    Args:
        url: Path to a local image file (or an image URL the pipeline accepts).

    Returns:
        The generated caption string.
    """
    # Building the pipeline is expensive (downloads/loads model weights), so
    # cache it on the function object and reuse it across calls instead of
    # re-creating it for every captioned image.
    if not hasattr(img2text, "_pipeline"):
        img2text._pipeline = pipeline(
            "image-to-text", model="Salesforce/blip-image-captioning-large")

    text = img2text._pipeline(url)[0]["generated_text"]

    print(text)
    return text
|
22 |
+
|
23 |
+
|
24 |
+
# llm
|
25 |
+
def generate_story(scenario):
    """Spin a very short story (<= 20 words) from an image caption.

    Args:
        scenario: One-line scene description, typically from img2text().

    Returns:
        The generated story text.
    """
    # Prompt instructing the model to act as a storyteller.
    template = """
You are a story teller;
You can generate a short story based on a single narrative, the story should be no more than 20 words;

CONTEXT: {scenario}
STORY:
"""
    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    # Chain the prompt into GPT-3.5 at temperature 1 for creative output.
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)
    story_llm = LLMChain(llm=llm, prompt=prompt, verbose=True)

    story = story_llm.predict(scenario=scenario)

    print(story)
    return story
|
45 |
+
|
46 |
+
|
47 |
+
# text to speech
|
48 |
+
def text2speech(message):
    """Synthesize `message` to speech via the HF Inference API.

    Writes the resulting audio bytes to ``audio.wav`` in the working directory.

    Args:
        message: Text to synthesize.

    Raises:
        requests.HTTPError: If the Inference API returns an error status.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payloads = {"inputs": message}
    # Timeout guards against the Streamlit worker hanging on a stalled request.
    response = requests.post(API_URL, headers=headers, json=payloads, timeout=60)
    # Fail loudly on API errors: the original wrote response.content
    # unconditionally, so a 4xx/5xx JSON error body silently became a
    # corrupt "audio.wav" that st.audio could not play.
    response.raise_for_status()

    with open("audio.wav", 'wb') as file:  # for me .wav worked instead of .flac
        file.write(response.content)
|
56 |
+
|
57 |
+
# scenario = img2text("photo.jpg")
|
58 |
+
# story = generate_story(scenario)
|
59 |
+
# text2speech(story)
|
60 |
+
|
61 |
+
|
62 |
+
|
63 |
+
|
64 |
+
|
65 |
+
# main function for UI layer
|
66 |
+
# main function for UI layer
def main():
    """Streamlit UI: upload an image, caption it, generate a story, play audio.

    Pipeline: img2text (BLIP caption) -> generate_story (GPT-3.5) ->
    text2speech (HF Inference API) -> st.audio playback.
    """
    st.set_page_config(page_title="Image 2 Audio Story", page_icon="🩵")

    st.header("Turn image into a short audio story")
    # Accept the common raster formats; BLIP captions jpg/jpeg/png alike.
    # (Previously only "jpg" was allowed, which rejected .jpeg/.png uploads.)
    uploaded_file = st.file_uploader("Choose an image...",
                                     type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        print(uploaded_file)
        bytes_data = uploaded_file.getvalue()
        # Persist the upload so the captioning pipeline can read it from disk.
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)
        st.image(uploaded_file, caption="Uploaded Image.",
                 use_container_width=True)

        scenario = img2text(uploaded_file.name)
        story = generate_story(scenario)
        text2speech(story)

        # Show the intermediate caption and story for transparency/debugging.
        with st.expander("scenario"):
            st.write(scenario)
        with st.expander("story"):
            st.write(story)

        st.audio("audio.wav")
|
90 |
+
|
91 |
+
# Script entry point: run the Streamlit UI when executed directly.
if __name__ == '__main__':
    main()
|
requirements.txt
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohappyeyeballs==2.4.4
|
2 |
+
aiohttp==3.11.9
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==5.5.0
|
5 |
+
annotated-types==0.7.0
|
6 |
+
anyio==4.6.2.post1
|
7 |
+
async-timeout==4.0.3
|
8 |
+
attrs==24.2.0
|
9 |
+
blinker==1.9.0
|
10 |
+
cachetools==5.5.0
|
11 |
+
certifi==2024.8.30
|
12 |
+
charset-normalizer==3.4.0
|
13 |
+
click==8.1.7
|
14 |
+
dataclasses-json==0.6.7
|
15 |
+
distro==1.9.0
|
16 |
+
exceptiongroup==1.2.2
|
17 |
+
filelock==3.16.1
|
18 |
+
frozenlist==1.5.0
|
19 |
+
fsspec==2024.10.0
|
20 |
+
gitdb==4.0.11
|
21 |
+
GitPython==3.1.43
|
22 |
+
h11==0.14.0
|
23 |
+
httpcore==1.0.7
|
24 |
+
httpx==0.28.0
|
25 |
+
httpx-sse==0.4.0
|
26 |
+
huggingface-hub==0.26.3
|
27 |
+
idna==3.10
|
28 |
+
Jinja2==3.1.4
|
29 |
+
jiter==0.8.0
|
30 |
+
jsonpatch==1.33
|
31 |
+
jsonpointer==3.0.0
|
32 |
+
jsonschema==4.23.0
|
33 |
+
jsonschema-specifications==2024.10.1
|
34 |
+
langchain==0.3.9
|
35 |
+
langchain-community==0.3.9
|
36 |
+
langchain-core==0.3.21
|
37 |
+
langchain-openai==0.2.11
|
38 |
+
langchain-text-splitters==0.3.2
|
39 |
+
langsmith==0.1.147
|
40 |
+
markdown-it-py==3.0.0
|
41 |
+
MarkupSafe==3.0.2
|
42 |
+
marshmallow==3.23.1
|
43 |
+
mdurl==0.1.2
|
44 |
+
mpmath==1.3.0
|
45 |
+
multidict==6.1.0
|
46 |
+
mypy-extensions==1.0.0
|
47 |
+
narwhals==1.15.2
|
48 |
+
networkx==3.4.2
|
49 |
+
numpy==1.26.4
|
50 |
+
openai==1.56.2
|
51 |
+
orjson==3.10.12
|
52 |
+
packaging==24.2
|
53 |
+
pandas==2.2.3
|
54 |
+
pillow==11.0.0
|
55 |
+
propcache==0.2.1
|
56 |
+
protobuf==5.29.1
|
57 |
+
pyarrow==18.1.0
|
58 |
+
pydantic==2.10.3
|
59 |
+
pydantic-settings==2.6.1
|
60 |
+
pydantic_core==2.27.1
|
61 |
+
pydeck==0.9.1
|
62 |
+
pydub==0.25.1
|
63 |
+
Pygments==2.18.0
|
64 |
+
python-dateutil==2.9.0.post0
|
65 |
+
python-dotenv==1.0.1
|
66 |
+
pytz==2024.2
|
67 |
+
PyYAML==6.0.2
|
68 |
+
referencing==0.35.1
|
69 |
+
regex==2024.11.6
|
70 |
+
requests==2.32.3
|
71 |
+
requests-toolbelt==1.0.0
|
72 |
+
rich==13.9.4
|
73 |
+
rpds-py==0.22.3
|
74 |
+
safetensors==0.4.5
|
75 |
+
six==1.17.0
|
76 |
+
smmap==5.0.1
|
77 |
+
sniffio==1.3.1
|
78 |
+
SQLAlchemy==2.0.36
|
79 |
+
streamlit==1.40.2
|
80 |
+
sympy==1.13.1
|
81 |
+
tenacity==9.0.0
|
82 |
+
tiktoken==0.8.0
|
83 |
+
tokenizers==0.20.3
|
84 |
+
toml==0.10.2
|
85 |
+
torch==2.5.1
|
86 |
+
torchaudio==2.5.1
|
87 |
+
torchvision==0.20.1
|
88 |
+
tornado==6.4.2
|
89 |
+
tqdm==4.67.1
|
90 |
+
transformers==4.46.3
|
91 |
+
typing-inspect==0.9.0
|
92 |
+
typing_extensions==4.12.2
|
93 |
+
tzdata==2024.2
|
94 |
+
urllib3==2.2.3
|
95 |
+
yarl==1.18.3
|