Spaces:
Running
Running
import gradio as gr | |
from sentence_transformers import SentenceTransformer | |
import duckdb | |
from huggingface_hub import get_token | |
from sentence_transformers import SentenceTransformer | |
from sentence_transformers.models import StaticEmbedding | |
import duckdb | |
# Initialize a StaticEmbedding module | |
static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M") | |
model = SentenceTransformer(modules=[static_embedding]) | |
dataset_name = "smol-blueprint/fineweb-bbc-news-text-embeddings" | |
embedding_column = "embedding" | |
duckdb.sql( | |
query=f""" | |
INSTALL vss; | |
LOAD vss; | |
CREATE TABLE embeddings AS | |
SELECT *, {embedding_column}::float[{model.get_sentence_embedding_dimension()}] as embedding_float | |
FROM 'hf://datasets/{dataset_name}/**/*.parquet'; | |
CREATE INDEX my_hnsw_index ON embeddings USING HNSW (embedding_float) WITH (metric = 'cosine'); | |
""" | |
) | |
def similarity_search(query: str, k: int = 5): | |
embedding = model.encode(query).tolist() | |
return duckdb.sql( | |
query=f""" | |
SELECT url, chunk, array_cosine_distance(embedding_float, {embedding}::FLOAT[{model.get_sentence_embedding_dimension()}]) as distance | |
FROM embeddings | |
ORDER BY distance | |
LIMIT {k}; | |
""" | |
).to_df() | |
with gr.Blocks() as demo: | |
gr.Markdown("""# Vector Search Hub Datasets | |
Part of [smol blueprint](https://github.com/huggingface/smol-blueprint) - a smol blueprint for AI development, focusing on applied examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs. """) | |
query = gr.Textbox(label="Query") | |
k = gr.Slider(1, 50, value=5, label="Number of results") | |
btn = gr.Button("Search") | |
results = gr.Dataframe(headers=["url", "chunk", "distance"]) | |
btn.click(fn=similarity_search, inputs=[query, k], outputs=[results]) | |
demo.launch() |