"""Streamlit app: semantic search over movie summaries.

Summaries from the movie CSV are embedded with a SentenceTransformer model
and stored in a FAISS L2 index; a user query is embedded the same way and
the nearest summaries are displayed.
"""

import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import streamlit as st

# Load CSV data using a relative path.
csv_file = "Hydra-Movie-Scrape.csv"  # Ensure this file is in the same directory as app.py
df = pd.read_csv(csv_file)

# 'Summary' is the source text for the document embeddings; rows without a
# summary get a placeholder so every row can still be encoded.
df['Summary'] = df['Summary'].fillna("No summary available.")
documents = df['Summary'].tolist()


@st.cache_resource
def _load_model():
    """Load the sentence-embedding model once per server process.

    Streamlit re-executes this script on every widget interaction; without
    caching, the model would be re-instantiated on each rerun.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Initialize the SentenceTransformer model (cached across reruns).
model = _load_model()


@st.cache_resource
def create_embeddings(documents):
    """Encode ``documents`` with the module-level model and cache the result.

    Args:
        documents: List of summary strings to embed.

    Returns:
        The embeddings produced by ``model.encode`` for ``documents``.
    """
    embeddings = model.encode(documents, show_progress_bar=True)
    return embeddings


# Generate and cache embeddings.
doc_embeddings = create_embeddings(documents)

# Convert to a NumPy array (FAISS requires float32).
embedding_matrix = np.array(doc_embeddings).astype("float32")

# Build a flat L2 FAISS index sized to the embedding dimension.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)


def retrieve(query, top_k=10):
    """Return the row positions of the summaries closest to ``query``.

    Args:
        query: Free-text query string.
        top_k: Number of neighbours to request from the index.

    Returns:
        A 1-D array of ``top_k`` positions into ``df``. Callers should skip
        entries equal to ``-1``, which mark slots with no valid neighbour.
    """
    query_embedding = model.encode(query)
    query_vector = np.array(query_embedding).astype("float32")
    # FAISS searches a batch of vectors, so wrap the single query in a batch.
    _distances, indices = index.search(np.array([query_vector]), top_k)
    return indices[0]


_DISPLAY_FIELDS = ("Title", "Year", "Director", "Cast", "Summary")


def _format_movie(movie_details):
    """Render one movie row as a Markdown paragraph, one field per line."""
    # Two trailing spaces before the newline force a Markdown hard line
    # break; a bare "\n" would collapse all fields into one running line.
    return "  \n".join(
        f"**{field}**: {movie_details[field]}" for field in _DISPLAY_FIELDS
    )


# Streamlit application layout.
st.title("Movie Dataset RAG Application")

query = st.text_input("Ask a question about movies:")

if st.button("Submit"):
    # Treat a whitespace-only query the same as an empty one.
    if query and query.strip():
        indices = retrieve(query)

        # One Markdown paragraph per valid hit, separated by blank lines.
        response = "\n\n".join(
            _format_movie(df.iloc[idx]) for idx in indices if idx != -1
        )

        if response:
            st.write("Here are some movies that match your query:\n")
            st.markdown(response)
        else:
            st.write("No relevant documents found.")
    else:
        st.write("Please enter a query.")