Spaces:

Michelangiolo
/

vector_search

Sleeping

File size: 1,910 Bytes

import os
# Install runtime dependencies before the imports below need them.
# The install is run as `<this interpreter> -m pip` with an argument list so
# that packages land in the environment of the running Python and no shell
# string is built. Failures are not raised here (check=False), matching the
# previous best-effort behaviour; a missing package surfaces at import time.
import subprocess
import sys

for _package in ('openpyxl', 'scikit-learn', 'sentence-transformers'):
    subprocess.run([sys.executable, '-m', 'pip', 'install', _package], check=False)

from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode queries.
# ('all-MiniLM-L6-v2' was noted next to it, presumably as an alternative.)
model = SentenceTransformer('all-mpnet-base-v2')

# Load the three parquet tables shipped with the app, in a fixed order.
df, df2, df3 = (
    pd.read_parquet(path)
    for path in ('df.parquet', 'df2.parquet', 'df3.parquet')
)

# Build one 3-neighbour ball-tree index per table of stored vectors:
# nbrs1 over df2['text_vector_'], nbrs2 over df3['text_vector_'].
nbrs1 = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')
nbrs1.fit(df2['text_vector_'].values.tolist())
nbrs2 = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')
nbrs2.fit(df3['text_vector_'].values.tolist())

def search1(query, nbrs, full_df, cleaned_df):
    """Return the text around the entry that best matches ``query``.

    The query is encoded with the module-level ``model`` and looked up in
    ``nbrs``. The closest hit is a row position in ``cleaned_df``; that row's
    index label is then used to pull the rows of ``full_df`` labelled one
    before, at, and one after it.

    Args:
        query: Search text to encode.
        nbrs: Fitted neighbour index exposing ``kneighbors``; the positions it
            returns are used as row positions in ``cleaned_df``.
        full_df: DataFrame with a ``text`` column, addressed by index label.
        cleaned_df: DataFrame whose index labels are looked up in ``full_df``.

    Returns:
        The ``text`` values of the surrounding rows joined by blank lines.
        Labels of the three-row window that do not exist in ``full_df``
        (e.g. when the match is the first or last row) are skipped instead
        of raising ``KeyError``.
    """
    query_vector = model.encode(query).tolist()

    # kneighbors expects a 2-D input, hence the single-element list.
    _, indices = nbrs.kneighbors([query_vector])

    # Only the closest neighbour is used; map its position to an index label.
    best_label = cleaned_df.index[indices[0][0]]

    # Keep the window in ascending label order but drop labels that are not
    # present, since .loc with a list containing missing labels raises.
    window = [
        label
        for label in range(best_label - 1, best_label + 2)
        if label in full_df.index
    ]
    full_text = full_df.loc[window, 'text'].values.tolist()
    return '\n\n'.join(full_text)

def search_sentences(df):
    """Split the ``text`` column of ``df`` into ``.``-separated fragments.

    Every value of ``df['text']`` is split on ``.``; the pieces are flattened
    row by row into a single Series named ``B`` with a fresh 0-based index,
    and only the first 50 pieces are kept.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        A Series holding at most 50 fragments.
    """
    # One column per fragment, then stack the columns back into one long Series.
    pieces = df['text'].str.split('.', expand=True).stack()
    sentences = (
        pieces
        .reset_index(level=1, drop=True)
        .rename('B')
        .reset_index(drop=True)
    )
    return sentences[0:50]

# Run one fixed query against the paragraph index at start-up and keep the
# result in `output` (a bare `output` expression would do nothing in a script).
output = search1('how to speed up data movement', nbrs1, df, df2)

import gradio as gr
import os

# Gradio passes the inputs in order: the Radio choice is the first argument, the Textbox content is text1
def greet(type, text1):
    """Run the search that matches the selected granularity.

    Args:
        type: ``"sentence"`` or ``"paragraph"``; selects which index and
            tables are handed to ``search1``.
        text1: Query text forwarded to ``search1``.

    Returns:
        Whatever ``search1`` returns for the selected mode, or ``None`` when
        ``type`` is neither of the two known values.
    """
    if type == "sentence":
        index, full_table, matched_table = nbrs2, df3, df3
    elif type == "paragraph":
        index, full_table, matched_table = nbrs1, df, df2
    else:
        return None
    return search1(text1, index, full_table, matched_table)

# Input widgets, in the order `greet` receives them: mode selector, then query.
mode_selector = gr.Radio(["sentence", "paragraph"])
query_box = gr.Textbox(label="text")

# Wire the widgets to `greet` and show its return value as plain text.
iface = gr.Interface(
    fn=greet,
    inputs=[mode_selector, query_box],
    outputs=["text"],
)

# Start the app without requesting a public share link.
iface.launch(share=False)