davanstrien HF staff committed on
Commit
93f267c
1 Parent(s): 52369d8

update embedding model

Browse files
Files changed (1) hide show
  1. load_data.py +7 -4
load_data.py CHANGED
@@ -24,10 +24,10 @@ load_dotenv()
24
 
25
 
26
  HF_TOKEN = os.getenv("HF_TOKEN")
27
- EMBEDDING_MODEL_NAME = "Snowflake/snowflake-arctic-embed-m-long"
28
- EMBEDDING_MODEL_REVISION = "ac9d0cb43661ee1f7d67b3aa63614d65a6c86463"
29
  INFERENCE_MODEL_URL = (
30
- "https://pqzap00ebpl1ydt4.us-east-1.aws.endpoints.huggingface.cloud"
31
  )
32
  DATASET_PARQUET_URL = (
33
  "hf://datasets/librarian-bots/dataset_cards_with_metadata/data/train-*.parquet"
@@ -62,7 +62,10 @@ def get_embedding_function():
62
  def get_collection(chroma_client, embedding_function):
63
  logger.info(f"Getting or creating collection: {COLLECTION_NAME}")
64
  return chroma_client.create_collection(
65
- name=COLLECTION_NAME, get_or_create=True, embedding_function=embedding_function
 
 
 
66
  )
67
 
68
 
 
24
 
25
 
26
  HF_TOKEN = os.getenv("HF_TOKEN")
27
+ EMBEDDING_MODEL_NAME = "Alibaba-NLP/gte-large-en-v1.5"
28
+ EMBEDDING_MODEL_REVISION = "104333d6af6f97649377c2afbde10a7704870c7b"
29
  INFERENCE_MODEL_URL = (
30
+ "https://spwy1g6626yhjhpr.us-east-1.aws.endpoints.huggingface.cloud"
31
  )
32
  DATASET_PARQUET_URL = (
33
  "hf://datasets/librarian-bots/dataset_cards_with_metadata/data/train-*.parquet"
 
62
  def get_collection(chroma_client, embedding_function):
63
  logger.info(f"Getting or creating collection: {COLLECTION_NAME}")
64
  return chroma_client.create_collection(
65
+ name=COLLECTION_NAME,
66
+ get_or_create=True,
67
+ embedding_function=embedding_function,
68
+ metadata={"hnsw:space": "cosine"},
69
  )
70
 
71