import os

import requests
import streamlit as st
from dotenv import find_dotenv, load_dotenv  # load the API keys from a .env file
from transformers import pipeline  # downloads the Hugging Face model to our machine
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Loads HUGGINGFACEHUB_API_TOKEN and OPENAI_API_KEY (read by ChatOpenAI) from .env.
load_dotenv(find_dotenv())
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")


# img2text: caption the image with a local BLIP model
def img2text(image_path):
    image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    text = image_to_text(image_path)[0]["generated_text"]
    print(text)
    return text


# llm: turn the caption into a short story
def generate_story(scenario):
    # prompt template used to generate the story
    template = """
    You are a storyteller;
    you can generate a short story based on a single narrative.
    The story should be no more than 20 words.

    CONTEXT: {scenario}
    STORY:
    """
    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    # llm chain
    story_llm = LLMChain(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1),
        prompt=prompt,
        verbose=True,
    )

    story = story_llm.predict(scenario=scenario)
    print(story)
    return story


# text to speech via the Hugging Face Inference API
def text2speech(message):
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    payload = {"inputs": message}

    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # avoid writing an API error response into the audio file
    with open("audio.wav", "wb") as file:  # for me .wav worked instead of .flac
        file.write(response.content)


# Quick local test without the UI:
# scenario = img2text("photo.jpg")
# story = generate_story(scenario)
# text2speech(story)


# main function for the UI layer
def main():
    st.set_page_config(page_title="Image 2 Audio Story", page_icon="🩵")
    st.header("Turn an image into a short audio story")

    uploaded_file = st.file_uploader("Choose an image...", type="jpg")
    if uploaded_file is not None:
        print(uploaded_file)  # debug: inspect the UploadedFile object
        # Save the upload to disk so the captioning pipeline can read it by path.
        bytes_data = uploaded_file.getvalue()
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)

        st.image(uploaded_file, caption="Uploaded Image.", use_container_width=True)

        scenario = img2text(uploaded_file.name)
        story = generate_story(scenario)
        text2speech(story)

        with st.expander("scenario"):
            st.write(scenario)
        with st.expander("story"):
            st.write(story)

        st.audio("audio.wav")


if __name__ == "__main__":
    main()
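
# --- Usage notes (a sketch of the assumed setup, not part of the app logic) ---
# Assuming this script is saved as app.py, with a .env file alongside it defining
# both keys the app needs:
#   OPENAI_API_KEY=...              # used by ChatOpenAI
#   HUGGINGFACEHUB_API_TOKEN=...    # used for the Inference API TTS call
# run the UI with:
#   streamlit run app.py

# --- Optional hardening (a sketch, not part of the original app) ---
# The hosted Inference API can answer HTTP 503 while a model is cold-loading,
# which raise_for_status() above will surface as an error. A small retry
# wrapper around the TTS call handles that; the name `text2speech_with_retry`
# and the retry parameters are my own choices.
import time


def text2speech_with_retry(message, retries=3, wait_seconds=20):
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACEHUB_API_TOKEN}"}
    for attempt in range(retries):
        response = requests.post(API_URL, headers=headers, json={"inputs": message})
        if response.ok:
            with open("audio.wav", "wb") as file:
                file.write(response.content)
            return
        time.sleep(wait_seconds)  # model may still be loading (HTTP 503)
    response.raise_for_status()  # give up and surface the last error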