tarrasyed19472007 committed on
Commit
a713626
1 Parent(s): 7ef5343

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sentence_transformers import SentenceTransformer
3
+ import faiss
4
+ import numpy as np
5
+ import streamlit as st
6
+
# Load the movie dataset from a path relative to the working directory.
csv_file = "Hydra-Movie-Scrape.csv"  # Ensure this file is in the same directory as app.py
df = pd.read_csv(csv_file)

# The 'Summary' column is the text that gets embedded. Rows without a summary
# receive a placeholder so that every row still yields a document string.
df['Summary'] = df['Summary'].fillna("No summary available.")
documents = df['Summary'].tolist()


@st.cache_resource
def _load_model(model_name):
    """Return the SentenceTransformer for *model_name*, constructing it once.

    Streamlit re-executes this script from the top on every widget
    interaction. Caching the model as a resource means the constructor runs
    only on the first execution instead of on every rerun.
    """
    return SentenceTransformer(model_name)


# Initialize the SentenceTransformer model (cached across reruns)
model = _load_model('all-MiniLM-L6-v2')
18
+
# Embed the documents; the decorator lets Streamlit cache the result
@st.cache_resource
def create_embeddings(documents):
    """Encode *documents* with the module-level model and return the result.

    A progress bar is displayed while the encoding runs.
    """
    return model.encode(documents, show_progress_bar=True)
24
+
# Embed every summary (goes through the cached create_embeddings helper)
doc_embeddings = create_embeddings(documents)

# FAISS requires float32 input, so materialise the embeddings in that dtype
embedding_matrix = np.array(doc_embeddings, dtype=np.float32)

# Flat L2 index sized to the embedding width, then populated with all rows
embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)
34
+
# Look up the documents closest to a free-text query
def retrieve(query, top_k=10):
    """Return the index positions of the *top_k* nearest documents to *query*.

    The query is encoded with the module-level model, cast to float32 and
    searched against the FAISS index as a batch of one. Only the positions
    for that single query are returned; the distances are discarded.
    """
    query_vector = np.array(model.encode(query)).astype("float32")
    # index.search expects a batch, so add a leading axis for the one query
    _, neighbours = index.search(query_vector[np.newaxis], top_k)
    return neighbours[0]
41
+
# Streamlit application layout
st.title("Movie Dataset RAG Application")
query = st.text_input("Ask a question about movies:")
if st.button("Submit"):
    # Treat a whitespace-only query the same as an empty one.
    if query.strip():
        indices = retrieve(query)

        # Build one Markdown entry per hit. Fields are joined with "  \n"
        # (two trailing spaces + newline, a Markdown hard line break) because
        # a bare newline is folded into a space and would run every field of
        # a movie together on a single rendered line.
        detail_fields = ("Title", "Year", "Director", "Cast", "Summary")
        entries = []
        for idx in indices:
            if idx == -1:  # Not a valid row position; skip it
                continue
            movie_details = df.iloc[idx]
            entries.append(
                "  \n".join(
                    f"**{field}**: {movie_details[field]}"
                    for field in detail_fields
                )
            )

        if entries:
            st.write("Here are some movies that match your query:\n")
            # A blank line between entries renders each movie as its own paragraph
            st.markdown("\n\n".join(entries))
        else:
            st.write("No relevant documents found.")
    else:
        st.write("Please enter a query.")