"""Helpers for indexing documents in Chroma, querying them, and reranking hits."""

import json
import logging
from functools import cache

import chromadb
from sentence_transformers import CrossEncoder, SentenceTransformer

logger = logging.getLogger(__name__)

EMBEDDING_MODEL_NAME = "mixedbread-ai/mxbai-embed-large-v1"
RERANK_MODEL_NAME = "mixedbread-ai/mxbai-rerank-xsmall-v1"


@cache
def _get_sentence_model(model_name: str = EMBEDDING_MODEL_NAME):
    """Return a process-wide SentenceTransformer, constructed once per name."""
    return SentenceTransformer(model_name)


@cache
def _get_rerank_model(model_name: str = RERANK_MODEL_NAME):
    """Return a process-wide CrossEncoder, constructed once per name."""
    return CrossEncoder(model_name)


def _as_float_lists(vectors) -> list[list[float]]:
    """Convert an iterable of vectors into plain nested lists of floats.

    Chroma receives built-in floats this way instead of numpy scalars or
    arrays, which keeps the payload independent of the encoder's array type.
    """
    return [[float(value) for value in vector] for vector in vectors]


def chroma_client_setup():
    """Create the ``food_collection`` collection on a fresh ``chromadb.Client``.

    The collection is configured for cosine distance.

    Returns:
        The newly created collection object.
    """
    chroma_client = chromadb.Client()
    collection = chroma_client.create_collection(
        name="food_collection",
        metadata={"hnsw:space": "cosine"},  # l2 is the default
    )
    return collection


def load_data(path: str = "test_json.json"):
    """Load and return the JSON content of *path*.

    Args:
        path: JSON file to read. Defaults to ``"test_json.json"``, the file
            this function originally read unconditionally.

    Returns:
        The decoded JSON value.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def embedding_function(items_to_embed: list[str]):
    """Encode *items_to_embed* with the sentence-embedding model.

    The model is loaded on first use and reused afterwards, so repeated calls
    do not reload it.

    Args:
        items_to_embed: Texts to embed.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the input.
    """
    return _get_sentence_model().encode(items_to_embed)


def chroma_upserting(collection, payload: list[dict]):
    """Embed every payload item's ``'doc'`` text and add it to *collection*.

    All documents are encoded in one batch and written with a single ``add``
    call. Ids are ``id_<index>``, where the index is the item's position in
    *payload*.

    Args:
        collection: Chroma collection to write to.
        payload: Items to index; each must contain a ``'doc'`` key.

    Raises:
        KeyError: If an item has no ``'doc'`` key.
    """
    if not payload:
        # Nothing to encode or add.
        return

    documents = [item["doc"] for item in payload]
    ids = [f"id_{idx}" for idx in range(len(documents))]
    embeddings = _as_float_lists(embedding_function(documents))

    logger.debug("Adding %d documents to the collection", len(documents))
    # NOTE(review): despite the function name this calls ``add``, not
    # ``upsert``, exactly as before; confirm which duplicate-id behaviour is
    # wanted. Item metadata is still not stored.
    collection.add(
        documents=documents,
        embeddings=embeddings,
        ids=ids,
    )


def search_chroma(collection, query: str, n_results: int = 5):
    """Query *collection* for the entries nearest to *query*.

    Args:
        collection: Chroma collection to search.
        query: Free-text query; it is embedded with ``embedding_function``.
        n_results: Number of results to request. Defaults to 5.

    Returns:
        The value returned by ``collection.query``.
    """
    query_embeddings = _as_float_lists(embedding_function([query]))
    return collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results,
    )


def reranking_results(query: str, top_k_results: list[str]):
    """Rerank *top_k_results* against *query* with the cross-encoder.

    The reranker (the xsmall mxbai model) is loaded on first use and reused.

    Args:
        query: The search query.
        top_k_results: Candidate documents to rerank.

    Returns:
        The value returned by ``CrossEncoder.rank`` with
        ``return_documents=True``, or an empty list when there are no
        candidates.
    """
    if not top_k_results:
        # Avoid loading or invoking the model for an empty candidate set.
        return []
    return _get_rerank_model().rank(query, top_k_results, return_documents=True)