Spaces:
Running
Running
File size: 1,747 Bytes
5dc52fb 8636daf 0ca033d 8636daf 0636ee3 8636daf 0636ee3 8636daf 0636ee3 8636daf 0636ee3 8636daf 0636ee3 8636daf 5dc52fb 8636daf 5dc52fb e0e0b77 d593d0c 5dc52fb 53ff92d 5dc52fb 8636daf 5dc52fb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
import gradio as gr
from sentence_transformers import SentenceTransformer
import duckdb
from huggingface_hub import get_token
from sentence_transformers import SentenceTransformer
import duckdb
# Embedding model shared by the index build below and by query-time encoding.
model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1")

# Hub dataset to load, the column holding its embeddings, and the name of
# the DuckDB table the dataset is materialised into.
dataset_name = "smol-blueprint/fineweb-bbc-news-text-embeddings"
embedding_column = "embedding"
table_name = "fineweb"

# The HNSW index is created on a fixed-size FLOAT array column, so the stored
# embeddings are cast to an array whose length is the model's dimension.
_vector_type = f"float[{model.get_sentence_embedding_dimension()}]"
_parquet_glob = f"hf://datasets/{dataset_name}/**/*.parquet"

# One-off setup script: load the vss extension, copy the dataset's parquet
# files into a table (adding the fixed-size `embedding_float` column), and
# build a cosine-metric HNSW index over that column.
_setup_sql = f"""
INSTALL vss;
LOAD vss;
CREATE TABLE {table_name} AS
SELECT *, {embedding_column}::{_vector_type} as embedding_float
FROM '{_parquet_glob}';
CREATE INDEX my_hnsw_index ON {table_name} USING HNSW (embedding_float) WITH (metric = 'cosine');
"""
duckdb.sql(query=_setup_sql)
def similarity_search(query: str, k: int = 5):
    """Return the rows of the indexed table closest to ``query``.

    The query text is embedded with the module-level ``model`` and compared
    against the ``{embedding_column}_float`` column using cosine distance.

    Args:
        query: Free-text search string to embed.
        k: Maximum number of rows to return. UI widgets such as sliders may
            deliver this as a float, so it is coerced to an integer and
            clamped to at least 1 before being placed in the ``LIMIT`` clause.

    Returns:
        The DuckDB result converted with ``.to_df()``, holding the columns
        ``chunk``, ``url`` and ``distance`` ordered by ascending distance.
    """
    # Coerce before interpolation so the LIMIT clause is always a positive
    # integer literal, whatever numeric type the caller passes.
    limit = max(1, int(k))
    dimension = model.get_sentence_embedding_dimension()
    embedding = model.encode(query).tolist()
    # Only model-produced floats and the integer `limit` are interpolated
    # into the SQL text; the raw user query string never reaches the database.
    return duckdb.sql(
        query=f"""
        SELECT chunk, url, array_cosine_distance({embedding_column}_float, {embedding}::FLOAT[{dimension}]) as distance
        FROM {table_name}
        ORDER BY distance
        LIMIT {limit};
        """
    ).to_df()
# Gradio UI: a query box and a result-count slider feed `similarity_search`,
# whose returned table is rendered in a dataframe component.
with gr.Blocks() as demo:
    gr.Markdown("""# Vector Search Hub Datasets
Part of [smol blueprint](https://github.com/huggingface/smol-blueprint) - a smol blueprint for AI development, focusing on applied examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """)
    query = gr.Textbox(label="Query")
    # step=1 keeps the slider on whole numbers, since the value is used as a
    # row count.
    k = gr.Slider(1, 50, value=5, step=1, label="Number of results")
    btn = gr.Button("Search")
    # Placeholder headers follow the column order that `similarity_search`
    # selects (chunk, url, distance).
    results = gr.Dataframe(headers=["chunk", "url", "distance"])
    btn.click(fn=similarity_search, inputs=[query, k], outputs=[results])

demo.launch()