File size: 1,747 Bytes
5dc52fb
 
 
 
 
 
8636daf
 
 
0ca033d
8636daf
 
0636ee3
8636daf
0636ee3
8636daf
 
0636ee3
8636daf
 
0636ee3
 
8636daf
 
 
 
 
0636ee3
 
8636daf
 
5dc52fb
8636daf
5dc52fb
 
e0e0b77
 
d593d0c
5dc52fb
53ff92d
5dc52fb
8636daf
5dc52fb
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr

from sentence_transformers import SentenceTransformer
import duckdb
from huggingface_hub import get_token

from sentence_transformers import SentenceTransformer
import duckdb

# Embedding model; also used at query time to encode the search text.
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1")

# Source dataset (read through the hf:// path below), the name of its
# embedding column, and the name of the local DuckDB table built from it.
dataset_name = "smol-blueprint/fineweb-bbc-news-text-embeddings"
embedding_column = "embedding"
table_name = "fineweb"

# Width of the vectors produced by `model`; used to cast the stored
# embeddings to a fixed-size float array.
_embedding_dim = model.get_sentence_embedding_dimension()

# One-off setup: load the vss extension, copy the parquet files into a local
# table with an extra fixed-size `embedding_float` column, and build a cosine
# HNSW index on that column.
_setup_sql = f"""
    INSTALL vss;
    LOAD vss;
    CREATE TABLE {table_name} AS 
    SELECT *, {embedding_column}::float[{_embedding_dim}] as embedding_float 
    FROM 'hf://datasets/{dataset_name}/**/*.parquet';
    CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
"""
duckdb.sql(query=_setup_sql)

def similarity_search(query: str, k: int = 5):
    """Return the stored chunks closest to ``query`` by cosine distance.

    The query text is encoded with the module-level ``model`` and compared
    against the ``<embedding_column>_float`` column of ``table_name``.

    Args:
        query: Free-text search string to embed.
        k: Maximum number of rows to return. It is coerced to ``int``
            (truncating any fractional part) because the value is spliced
            into a SQL ``LIMIT`` clause and UI widgets such as sliders may
            deliver a float.

    Returns:
        The DuckDB result converted with ``.to_df()``; columns are
        ``chunk``, ``url`` and ``distance``, ordered by ascending distance.
    """
    # LIMIT needs a whole number; never interpolate a raw float from the UI.
    limit = int(k)
    dim = model.get_sentence_embedding_dimension()
    embedding = model.encode(query).tolist()
    return duckdb.sql(
        query=f"""
        SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{dim}]) as distance 
        FROM {table_name}
        ORDER BY distance 
        LIMIT {limit};
    """
    ).to_df()

# Gradio front end: a query box and a result-count slider feed
# similarity_search, whose DataFrame is rendered in the table below them.
with gr.Blocks() as demo:
    gr.Markdown("""# Vector Search Hub Datasets
                
                Part of [smol blueprint](https://github.com/huggingface/smol-blueprint) - a smol blueprint for AI development, focusing on applied examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
    query = gr.Textbox(label="Query")
    # step=1 keeps the slider on whole numbers: the value ends up in a SQL
    # LIMIT clause, where a fractional result count makes no sense.
    k = gr.Slider(1, 50, value=5, step=1, label="Number of results")
    btn = gr.Button("Search")
    # Header order mirrors the columns selected by similarity_search
    # (chunk, url, distance) so the empty table matches the populated one.
    results = gr.Dataframe(headers=["chunk", "url", "distance"])
    btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])


demo.launch()