# vector_search / app.py
# Commit a6e68b0 ("dual function") by Michelangiolo
import os
# Install runtime dependencies at startup (HuggingFace Spaces pattern;
# normally these would live in requirements.txt instead).
os.system('pip install openpyxl')
os.system('pip install scikit-learn')
os.system('pip install sentence-transformers')
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
# Sentence embedding model used to vectorize incoming queries.
model = SentenceTransformer('all-mpnet-base-v2') #all-MiniLM-L6-v2 #all-mpnet-base-v2
# Pre-computed corpora shipped with the Space:
# df  - full paragraph-level text; df2/df3 carry a 'text_vector_' embedding
# column plus 'text' (presumably paragraph- and sentence-level — confirm
# against the data-prep notebook).
df = pd.read_parquet('df.parquet')
df2 = pd.read_parquet('df2.parquet')
df3 = pd.read_parquet('df3.parquet')
#prepare model
# Fit one 3-NN index per corpus on the stored embedding vectors.
nbrs1 = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df2['text_vector_'].values.tolist())
nbrs2 = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(df3['text_vector_'].values.tolist())
def search1(query, nbrs, full_df, cleaned_df):
    """Semantic search: embed *query*, find its nearest neighbors, and return
    the best match together with its surrounding context.

    Parameters
    ----------
    query : str
        Free-text search query; embedded with the module-level `model`.
    nbrs : sklearn.neighbors.NearestNeighbors
        Fitted neighbor index over `cleaned_df['text_vector_']`.
    full_df : pandas.DataFrame
        Corpus providing the context rows (must share `cleaned_df`'s
        positional integer index — assumed default RangeIndex).
    cleaned_df : pandas.DataFrame
        Corpus the index was fitted on; rows align with `nbrs`.

    Returns
    -------
    str
        The matched row's text plus its neighboring rows, joined by blank lines.
    """
    query_vector = model.encode(query).tolist()
    # kneighbors expects a 2-D array; indices[0] are row positions of the matches
    distances, indices = nbrs.kneighbors([query_vector])
    output = cleaned_df.iloc[indices[0]][['text']]
    best = output.index[0]
    # Clamp the context window: the original `range(best-1, best+2)` raised
    # KeyError when the top match was the first or last row of full_df.
    start = max(best - 1, 0)
    stop = min(best + 2, len(full_df))
    full_text = full_df.loc[range(start, stop)]['text'].values.tolist()
    return '\n\n'.join(full_text)
def search_sentences(df):
    """Run a fixed demo query ('how to speed up data movement') against the
    paragraph corpus and return the matched context.

    Fixes two defects in the original: a dead expression (a df2 sentence-split
    whose result was discarded) and a missing return (the computed output was
    referenced as a bare no-op expression, so the function returned None).

    Parameters
    ----------
    df : pandas.DataFrame
        Full-text corpus passed through to `search1` as `full_df`.

    Returns
    -------
    str
        Matched text with surrounding context, joined by blank lines.
    """
    return search1('how to speed up data movement', nbrs=nbrs1, full_df=df, cleaned_df=df2)
import gradio as gr
import os
#the first module becomes text1, the second module file1
def greet(type, text1):
    """Gradio callback: route *text1* to sentence- or paragraph-level search.

    `type` is the radio-button choice ("sentence" or "paragraph"); any other
    value yields None. (Parameter name kept for interface compatibility even
    though it shadows the builtin.)
    """
    # Dispatch table: choice -> (neighbor index, full corpus, cleaned corpus)
    routes = {
        "sentence": (nbrs2, df3, df3),
        "paragraph": (nbrs1, df, df2),
    }
    if type in routes:
        index, full_corpus, cleaned_corpus = routes[type]
        return search1(text1, index, full_corpus, cleaned_corpus)
    # Unrecognized choice: fall through, returning None like the original.
# Wire the search callback into a simple Gradio UI: a radio button selects
# the search granularity, a textbox takes the query, output is plain text.
iface = gr.Interface(
    fn=greet,
    inputs=[
        gr.Radio(["sentence", "paragraph"]),
        gr.Textbox(label="text")
    ],
    outputs=["text"]
)
# share=False: serve locally only (Spaces provides its own public URL).
iface.launch(share=False)