# Import packages
import gradio as gr
import copy
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
from chromadb.config import Settings
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import shutil
import os
from chromadb.utils import embedding_functions
from PIL import Image
import requests
from io import BytesIO
from transformers import pipeline
from bark import SAMPLE_RATE, generate_audio, preload_models
import json

# Initialize the Llama model
llm = Llama(
    ## original model
    # model_path=hf_hub_download(
    #     repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    #     filename="Phi-3-mini-4k-instruct-q4.gguf",
    # ),
    ## compressed model
    model_path=hf_hub_download(
        repo_id="TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
        filename="capybarahermes-2.5-mistral-7b.Q2_K.gguf",
    ),
    n_ctx=2048,
    n_gpu_layers=50,  # adjust based on your VRAM
)

# Use the OpenCLIP model to embed images and text into a shared vector space
client = chromadb.PersistentClient(path="DB")
embedding_function = OpenCLIPEmbeddingFunction()
image_loader = ImageLoader()  # required when records are added and loaded via URIs

# Initialize separate collections for image and text data
# (create_collection raises if the collections already exist in the "DB" directory;
# client.get_or_create_collection makes re-runs idempotent)
def create_collection(name_image_collection, name_text_collection):
    collection_images = client.create_collection(
        name=name_image_collection,
        embedding_function=embedding_function,
        data_loader=image_loader,
    )
    collection_text = client.create_collection(
        name=name_text_collection,
        embedding_function=embedding_function,
    )
    return collection_images, collection_text

collection_images, collection_text = create_collection(
    name_image_collection="collection_images",
    name_text_collection="collection_text",
)

# Collect the URIs of the images and add them to the image collection
IMAGE_FOLDER = 'images'
image_uris = sorted([os.path.join(IMAGE_FOLDER, image_name)
                     for image_name in os.listdir(IMAGE_FOLDER)
                     if not image_name.endswith('.txt')])
ids = [str(i) for i in range(len(image_uris))]
collection_images.add(ids=ids, uris=image_uris)

# Add the text documents to the text collection
default_ef = embedding_functions.DefaultEmbeddingFunction()  # unused below; the text collection uses the CLIP embedding function above
TEXT_FOLDER = "text"
text_pth = sorted([os.path.join(TEXT_FOLDER, file_name)
                   for file_name in os.listdir(TEXT_FOLDER)
                   if file_name.endswith('.txt')])
list_of_text = []
for text_path in text_pth:
    with open(text_path, 'r') as f:
        list_of_text.append(f.read())
ids_txt_list = ['id' + str(i) for i in range(len(list_of_text))]
collection_text.add(
    documents=list_of_text,
    ids=ids_txt_list,
)

# Initialize the transcriber (speech-to-text)
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Preload the Bark TTS models
preload_models()

# Background image for the Gradio page
image_path = "dom_bremen.jpg"
absolute_path = os.path.abspath(image_path)


def transcribe(audio):
    sr, y = audio
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
    return transcriber({"sampling_rate": sr, "raw": y})["text"]


# Bark voice preset used for all generated speech
fixed_prompt = "en_speaker_5"


def generate_audio_output(text):
    audio_arr = generate_audio(text, history_prompt=fixed_prompt)
    audio_arr = (audio_arr * 32767).astype(np.int16)
    return (SAMPLE_RATE, audio_arr)


# Retrieve context and an image from the vector store, then generate a response
def generate_text(message, max_tokens=150, temperature=0.2, top_p=0.9):
    try:
        # Retrieve the most relevant image and text chunk for the query
        # ('uris' must be listed in include, otherwise the result's 'uris' field is None)
        retrieved_image = collection_images.query(query_texts=message, include=['uris'], n_results=1)
        context_text = collection_text.query(query_texts=message, n_results=1)

        context_docs = context_text['documents'][0] if context_text['documents'] else []
        context = context_docs[0] if context_docs else "No relevant context found."
        retrieved_uris = retrieved_image['uris'][0] if retrieved_image['uris'] else []
        image_url = retrieved_uris[0] if retrieved_uris else None

        # Log the image URL for debugging
        print(f"Retrieved image URL: {image_url}")

        # Create the prompt for the LLM
        prompt_template = (
            f"Context: {context}\n\n"
            f"Question: {message}\n\n"
            f"You are a tour guide for the city of Bremen, Germany. Answer the question based on the context above."
        )

        # Generate text using the language model
        output = llm(
            prompt_template,
            temperature=temperature,
            top_p=top_p,
            top_k=50,
            repeat_penalty=1.1,
            max_tokens=max_tokens,
        )

        # Clean up the raw model output
        input_string = output['choices'][0]['text'].strip()
        cleaned_text = input_string.strip("[]'").replace('\\n', '\n')
        return cleaned_text, image_url
    except Exception as e:
        return f"Error: {str(e)}", None


# Load an image from a file path for display
def load_image_from_path(file_path):
    try:
        return Image.open(file_path)
    except Exception as e:
        print(f"Error loading image: {str(e)}")
        return None


def process_audio(audio):
    # Transcribe the spoken question
    message = transcribe(audio)
    # message = "Bremen Schnoor"  # fixed test query, useful when no microphone is available

    text_output, image_path = generate_text(message)
    if image_path:
        image_output = load_image_from_path(image_path)
    else:
        image_output = None  # no image was retrieved

    # Generate the spoken answer
    audio_output = generate_audio_output(text_output)
    return text_output, audio_output, image_output


# Define the Gradio interface
demo = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["microphone"], label="Input Audio"),
    outputs=[
        gr.Textbox(label="Generated Text"),
        gr.Audio(label="Generated Audio"),
        gr.Image(label="Retrieved Image"),  # output component for the retrieved image
    ],
    title="moinBremen - Your Personal Tour Guide for our City of Bremen",
    description="Ask your question about Bremen by speaking into the microphone. The system will transcribe your question, generate a response, and read it out loud.",
    css=f""".gradio-container {{ background: url('file={absolute_path}') no-repeat center center fixed; background-size: cover; }}""",
    cache_examples=False,
)

demo.launch(allowed_paths=[absolute_path])