Spaces:
Runtime error
Runtime error
tarrasyed19472007
committed on
Commit
•
a713626
1
Parent(s):
7ef5343
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from sentence_transformers import SentenceTransformer
|
3 |
+
import faiss
|
4 |
+
import numpy as np
|
5 |
+
import streamlit as st
|
6 |
+
|
7 |
+
# Load the movie dataset. The path is relative, so it is resolved against the
# working directory of the process that runs app.py.
csv_file = "Hydra-Movie-Scrape.csv"  # Ensure this file is in the same directory as app.py

# 'Summary' is the column that gets embedded. Rows with a missing summary
# receive a placeholder so that every row still yields a document string.
df = pd.read_csv(csv_file).fillna({'Summary': "No summary available."})
documents = df['Summary'].tolist()  # One document per row, in row order
|
15 |
+
|
16 |
+
# Initialize the SentenceTransformer model.
# Streamlit re-executes this script from the top on every user interaction, so
# the loader is wrapped in st.cache_resource: the model is constructed once and
# the same instance is reused on later reruns instead of being rebuilt.
@st.cache_resource
def _load_model():
    """Return the shared 'all-MiniLM-L6-v2' SentenceTransformer instance."""
    return SentenceTransformer('all-MiniLM-L6-v2')


model = _load_model()
|
18 |
+
|
19 |
+
# Create and cache embeddings
|
20 |
+
@st.cache_resource
def create_embeddings(documents):
    """Encode ``documents`` with the module-level ``model``.

    A progress bar is shown while encoding, and the result is memoized by
    ``st.cache_resource``.

    Args:
        documents: The texts to embed, passed straight to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``documents``.
    """
    return model.encode(documents, show_progress_bar=True)
|
24 |
+
|
25 |
+
# Embed every document; create_embeddings is wrapped in st.cache_resource.
doc_embeddings = create_embeddings(documents)

# FAISS requires float32 input, so cast while building the matrix.
embedding_matrix = np.array(doc_embeddings, dtype="float32")

# Flat L2 index whose dimensionality is the width (second axis) of the matrix.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)
|
34 |
+
|
35 |
+
# Function to retrieve the most relevant documents
|
36 |
+
def retrieve(query, top_k=10):  # Retrieve up to 10 movies
    """Return the ids of the ``top_k`` indexed documents nearest to ``query``.

    Args:
        query: Text to encode with the module-level ``model``.
        top_k: Number of neighbours requested from the module-level ``index``.

    Returns:
        The first row of the id array returned by ``index.search``, i.e. the
        neighbour ids for this single query.
    """
    query_vector = np.array(model.encode(query)).astype("float32")
    # index.search is given a batch, so add a leading axis for the one query.
    query_batch = query_vector[np.newaxis, ...]
    _, neighbour_ids = index.search(query_batch, top_k)
    return neighbour_ids[0]
|
41 |
+
|
42 |
+
# Streamlit application layout
st.title("Movie Dataset RAG Application")
query = st.text_input("Ask a question about movies:")

if st.button("Submit"):
    # Treat a whitespace-only query the same as an empty one.
    if query and query.strip():
        indices = retrieve(query)

        # Build one Markdown entry per retrieved movie.
        entries = []
        for idx in indices:
            if idx == -1:  # Skip the -1 placeholder ids that mark "no result"
                continue
            movie_details = df.iloc[idx]
            # Markdown treats a lone "\n" as a soft break and renders it as a
            # space; two trailing spaces before the newline force a hard line
            # break so that each field appears on its own line.
            entries.append(
                f"**Title**: {movie_details['Title']}  \n"
                f"**Year**: {movie_details['Year']}  \n"
                f"**Director**: {movie_details['Director']}  \n"
                f"**Cast**: {movie_details['Cast']}  \n"
                f"**Summary**: {movie_details['Summary']}"
            )

        if entries:
            st.write("Here are some movies that match your query:\n")
            # A blank line between entries makes each movie its own paragraph.
            st.markdown("\n\n".join(entries))
        else:
            st.write("No relevant documents found.")
    else:
        st.write("Please enter a query.")
|