Spaces:

tim-sanders
/

JScholar_RAG_Prototype

Runtime error

App Files Files Community

tim-sanders commited on Jul 22, 2024

Commit

85d4c3b

verified ·

1 Parent(s): e6b4543

Upload folder using huggingface_hub

Browse files

Files changed (42) hide show

.gitattributes +1 -0
.gitignore +10 -0
README.md +3 -9
chat-ui.py +93 -0
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/data_level0.bin +3 -0
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/header.bin +3 -0
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/index_metadata.pickle +3 -0
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/length.bin +3 -0
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/link_lists.bin +3 -0
chroma/chroma.sqlite3 +3 -0
chroma_mgmt_queries.py +34 -0
config/default_config.yaml +8 -0
create_vector_db.py +142 -0
fine_tune_model.py +14 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/8b3219a92973c328a8e22fadcfa821b5dc75636a/added_tokens.json +0 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/added_tokens.json +0 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +1 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/1_Pooling/config.json +7 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/README.md +177 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config.json +24 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config_sentence_transformers.json +7 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/model.safetensors +3 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/modules.json +20 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/sentence_bert_config.json +4 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/special_tokens_map.json +1 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer.json +0 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer_config.json +1 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/vocab.txt +0 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/1_Pooling/config.json +7 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/README.md +177 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config.json +24 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config_sentence_transformers.json +7 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/model.safetensors +3 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/modules.json +20 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/sentence_bert_config.json +4 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/special_tokens_map.json +1 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer.json +0 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer_config.json +1 -0
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/vocab.txt +0 -0
query.py +89 -0
requirements.txt +7 -0
test.py +8 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+*.xml
+*.iml
+.idea/
+.venv
+.venv2
+./config/default_config.yaml
+./model_cache
+./raw-data
+./chroma
+hf_*/

README.md CHANGED Viewed

@@ -1,12 +1,6 @@
 ---
-title: JScholar RAG Prototype
-emoji: 🐢
-colorFrom: red
-colorTo: gray
 sdk: gradio
-sdk_version: 4.39.0
-app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: JScholar_RAG_Prototype
+app_file: chat-ui.py
 sdk: gradio
+sdk_version: 4.38.1
 ---

chat-ui.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import os
+import gradio as gr
+import yaml
+from langchain.chains.llm import LLMChain
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
+from langchain_community.vectorstores import Chroma
+from langchain_core.prompts import PromptTemplate
+CONFIG_PATH = os.path.join('config', 'default_config.yaml')
+def main():
+    config = load_config()
+    title = "Ask a Johns Hopkins Librarian!"
+    description = """
+    This chat bot is an expert on the <a href=https://jscholarship.library.jhu.edu/handle/1774.2/35703>Edward St. John
+     Real Estate Program - Practicum Projects</a> collection. It will answer any question regarding these papers!
+    <img src="https://jscholarship.library.jhu.edu/assets/j10p/images/libraries.logo.small.horizontal.white.png"
+    width=200px>
+    """
+    article = """
+    This Retrieval Augmented Retrieval (RAG) chat bot is designed to answer questions only on the
+    <a href="https://jscholarship.library.jhu.edu/handle/1774.2/35703">Edward St. John Real Estate Program - Practicum Projects</a> collection
+    """
+    os.environ["HUGGINGFACEHUB_API_TOKEN"] = config['tokens']['hugging_face']
+    gr.Interface(fn=predict,
+                 inputs="text",
+                 title=title,
+                 description=description,
+                 article=article,
+                 examples=[
+                     ["What plans did Baltimore have to transition from an industrial city to tourism?"],
+                     ["What type of market analysis and feasibility studies were performed for the 9 St. Marys Street project in Annapolis Maryland?"],
+                     ["What are the feasibility studies of moving Johns Hopkins Medicine admin departments back to the East Baltimore campus?"]
+                 ],
+                 outputs="text").launch(share=False)
+def predict(prompt):
+    config = load_config()
+    prompt_template = """
+    Answer the question based only on the following context:
+    {context}
+    ---
+    Answer the question based on the above context: {question}
+    """
+    hf_embed_func = HuggingFaceEmbeddings(
+        model_name="all-MiniLM-L6-v2",
+        model_kwargs={'device': 'cpu'},
+        encode_kwargs={'normalize_embeddings': False},
+        cache_folder=config['models']['model_cache_path']
+    )
+    db = Chroma(persist_directory=config['chroma_db']['chroma_path'],
+                embedding_function=hf_embed_func,
+                collection_name="jscholar_rag")
+    results = db.similarity_search_with_relevance_scores(prompt, k=7)
+    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+    llm = HuggingFaceEndpoint(
+        repo_id="HuggingFaceH4/zephyr-7b-beta",
+        task="text-generation",
+        top_k=30,
+        temperature=0.1,
+        repetition_penalty=1.03,
+        max_new_tokens=512,
+    )
+    prompt_template_filled = PromptTemplate(
+        input_variables=[context_text, prompt], template=prompt_template
+    )
+    chat_model = LLMChain(llm=llm, prompt=prompt_template_filled)
+    response_text = chat_model.invoke({'question': prompt, 'context': context_text})
+    formatted_response = f"{response_text.get('text')}"
+    return formatted_response
+def load_config():
+    with open(CONFIG_PATH, 'r') as file:
+        loaded_data = yaml.safe_load(file)
+    return loaded_data
+if __name__ == "__main__":
+    main()

chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79962aaa138d440f9bf87ad8d22caf0520a21298473cf867fffc33dfbecb0124
+size 232964000

chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8e96a61d978f213efbfae84ab2dc0a67fb3cb08162b50eb1ac269e532ce320d
+size 100

chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/index_metadata.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9cddee6aa3f6cd2497c61e78e9cac2a943753223e5ab2a22c2a47a864bcf600d
+size 8502757

chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0bf567f0d48a89d8ac48d52e5de92e086b5fa699ebfe3e7e18f7d33927cbddb2
+size 556000

chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b557aa42d5a8a941fc09cbf993bb8539ef2bfaa4761e94934c383f2b00986aa0
+size 1194112

chroma/chroma.sqlite3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e49dc61d09537b88cc8b221cd447a74281fcd4fbf82a8926e4959019841acea
+size 893464576

chroma_mgmt_queries.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import chromadb
+from langchain_community.embeddings import HuggingFaceEmbeddings
+CHROMA_PATH = "chroma"
+def main():
+    client = chromadb.PersistentClient(path=CHROMA_PATH)
+    collection = client.get_collection(name="jscholar_rag")
+    print(f"Total Embeddings: {collection.count()}")
+    print(collection.peek())
+    # Retrieve all documents
+    documents = collection.get()
+    pdfs = documents['metadatas']
+    pdfs_source = [item['source'] for item in pdfs]
+    prefix_to_remove = "C:\\Users\\tsande16\\PycharmProjects\\jscholar-rag\\raw-data\\add_data\\"
+    # Remove the prefix from each path in the list
+    pdf_names = sorted(list(set([path.replace(prefix_to_remove, '') for path in pdfs_source if path.startswith(prefix_to_remove)])))
+    # Sort PDFs by their name in alphabetical order
+    # pdfs_sorted = sorted(pdfs, key=lambda x: x['metadatas'])
+    # Print sorted PDF names
+    print("List of PDFs in the database (ordered by name):")
+    for pdf in pdf_names:
+        print(pdf)
+if __name__ == "__main__":
+    main()

config/default_config.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+chroma_db:
+  chroma_path: "chroma"
+data_load:
+  data_load_path: "C:\\Users\\tsande16\\PycharmProjects\\jscholar-rag\\raw-data\\edward-john-real-estate-program"
+models:
+  model_cache_path: "model_cache"

create_vector_db.py ADDED Viewed

	@@ -0,0 +1,142 @@

+import argparse
+import chromadb
+from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema import Document
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores.chroma import Chroma
+import os
+import shutil
+import yaml
+CONFIG_PATH = os.path.join('config', 'default_config.yaml')
+def main():
+    config = load_config()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--action",
+                        dest="db_action",
+                        help="Values are: (C)reate New or (A)dd to Existing",
+                        required=True,
+                        choices=['c', 'a'])
+    parser.add_argument("--add-path",
+                        dest="add_path",
+                        help="Path to the documents you wish to add to the vector DB")
+    args = parser.parse_args()
+    if args.db_action == 'c':
+        generate_data_store(config['data_load']['data_load_path'])
+    elif args.db_action == 'a':
+        add_documents_to_data_store(args.add_path)
+def add_documents_to_data_store(data_path):
+    file_paths = traverse_directory(data_path)
+    for file in file_paths:
+        documents = load_documents(file)
+        chunks = split_text(documents)
+        add_to_chroma(chunks)
+def generate_data_store(data_path):
+    documents = load_documents(data_path)
+    chunks = split_text(documents)
+    save_to_chroma(chunks)
+def load_documents(document_data_path):
+    # loader = DirectoryLoader(document_data_path, silent_errors=True, glob="*.pdf")
+    loader = UnstructuredFileLoader(document_data_path)
+    documents = loader.load()
+    return documents
+def traverse_directory(directory):
+    file_paths = []
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            file_path = os.path.join(root, file)
+            file_paths.append(file_path)
+    return file_paths
+def split_text(documents: list[Document]):
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+        length_function=len,
+        add_start_index=True,
+    )
+    chunks = text_splitter.split_documents(documents)
+    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
+    document = chunks[10]
+    print(document.page_content)
+    print(document.metadata)
+    return chunks
+def save_to_chroma(chunks: list[Document]):
+    config = load_config()
+    # Bert Sentence Transformer Embeddings
+    # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+    hf_embed = HuggingFaceEmbeddings(
+        model_name="all-MiniLM-L6-v2",
+        model_kwargs={'device': 'cpu'},
+        encode_kwargs={'normalize_embeddings': False},
+        cache_folder=config['models']['model_cache_path']
+    )
+    # Create a new DB from the documents.
+    db = Chroma.from_documents(
+        documents=chunks,
+        embedding=hf_embed,
+        collection_name="jscholar_rag",
+        persist_directory=config['chroma_db']['chroma_path']
+    )
+    db.persist()
+    print(f"Saved {len(chunks)} chunks to {config['chroma_db']['chroma_path']}.")
+    # test the database:
+    query = "Why is Maryland appealing for solar users?"
+    docs = db.similarity_search(query)
+    # print results
+    print(docs[0].page_content)
+def add_to_chroma(chunks: list[Document]):
+    config = load_config()
+    hf_embed = HuggingFaceEmbeddings(
+        model_name="all-MiniLM-L6-v2",
+        model_kwargs={'device': 'cpu'},
+        encode_kwargs={'normalize_embeddings': False},
+        cache_folder=config['models']['model_cache_path']
+    )
+    db = Chroma(
+        embedding_function=hf_embed, collection_name="jscholar_rag", persist_directory=config['chroma_db']['chroma_path']
+    )
+    db.add_documents(documents=chunks)
+    db.persist()
+    print(f"Added {len(chunks)} new chunks to {config['chroma_db']['chroma_path']}.")
+def load_config():
+    with open(CONFIG_PATH, 'r') as file:
+        loaded_data = yaml.safe_load(file)
+    return loaded_data
+def setup_tokens():
+    config = load_config()
+    os.environ["HF_TOKEN"] = config['tokens']['hugging_face']
+if __name__ == "__main__":
+    main()

fine_tune_model.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from datasets import load_dataset
+from transformers import AutoTokenizer, DataCollatorWithPadding
+raw_datasets = load_dataset("glue", "mrpc")
+checkpoint = "bert-base-uncased"
+tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+def tokenize_function(example):
+    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
+data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/8b3219a92973c328a8e22fadcfa821b5dc75636a/added_tokens.json ADDED Viewed

File without changes

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/added_tokens.json ADDED Viewed

File without changes

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main ADDED Viewed

	@@ -0,0 +1 @@


1	+ 8b3219a92973c328a8e22fadcfa821b5dc75636a

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "word_embedding_dimension": 384,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/README.md ADDED Viewed

	@@ -0,0 +1,177 @@

+---
+language: en
+license: apache-2.0
+library_name: sentence-transformers
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+- transformers
+datasets:
+- s2orc
+- flax-sentence-embeddings/stackexchange_xml
+- ms_marco
+- gooaq
+- yahoo_answers_topics
+- code_search_net
+- search_qa
+- eli5
+- snli
+- multi_nli
+- wikihow
+- natural_questions
+- trivia_qa
+- embedding-data/sentence-compression
+- embedding-data/flickr30k-captions
+- embedding-data/altlex
+- embedding-data/simple-wiki
+- embedding-data/QQP
+- embedding-data/SPECTER
+- embedding-data/PAQ_pairs
+- embedding-data/WikiAnswers
+pipeline_tag: sentence-similarity
+---
+# all-MiniLM-L6-v2
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Usage (HuggingFace Transformers)
+Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
+```python
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+# Sentences we want sentence embeddings for
+sentences = ['This is an example sentence', 'Each sentence is converted']
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+# Tokenize sentences
+encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+# Compute token embeddings
+with torch.no_grad():
+    model_output = model(**encoded_input)
+# Perform pooling
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+print("Sentence embeddings:")
+print(sentence_embeddings)
+```
+## Evaluation Results
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L6-v2)
+------
+## Background
+The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
+contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
+1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
+We developed this model during the
+[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+organized by Hugging Face. We developed this model as part of the project:
+[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
+## Intended uses
+Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+By default, input text longer than 256 word pieces is truncated.
+## Training procedure
+### Pre-training
+We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+### Fine-tuning
+We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
+We then apply the cross entropy loss by comparing with true pairs.
+#### Hyper parameters
+We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
+We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
+#### Training data
+We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
+We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
+| Dataset                                                  | Paper                                    | Number of training tuples  |
+|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs  | - | 25,316,456 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs  | - | 21,396,559 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs  | - | 21,396,559 |
+| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
+| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
+| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
+| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+| **Total** | | **1,170,060,424** |

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.8.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.0.0",
+    "transformers": "4.6.1",
+    "pytorch": "1.8.1"
+  }
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
+size 90868376

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 256,
+  "do_lower_case": false
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "word_embedding_dimension": 384,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/README.md ADDED Viewed

	@@ -0,0 +1,177 @@

+---
+language: en
+license: apache-2.0
+library_name: sentence-transformers
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+- transformers
+datasets:
+- s2orc
+- flax-sentence-embeddings/stackexchange_xml
+- ms_marco
+- gooaq
+- yahoo_answers_topics
+- code_search_net
+- search_qa
+- eli5
+- snli
+- multi_nli
+- wikihow
+- natural_questions
+- trivia_qa
+- embedding-data/sentence-compression
+- embedding-data/flickr30k-captions
+- embedding-data/altlex
+- embedding-data/simple-wiki
+- embedding-data/QQP
+- embedding-data/SPECTER
+- embedding-data/PAQ_pairs
+- embedding-data/WikiAnswers
+pipeline_tag: sentence-similarity
+---
+# all-MiniLM-L6-v2
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Usage (HuggingFace Transformers)
+Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
+```python
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+# Sentences we want sentence embeddings for
+sentences = ['This is an example sentence', 'Each sentence is converted']
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+# Tokenize sentences
+encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+# Compute token embeddings
+with torch.no_grad():
+    model_output = model(**encoded_input)
+# Perform pooling
+sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+print("Sentence embeddings:")
+print(sentence_embeddings)
+```
+## Evaluation Results
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L6-v2)
+------
+## Background
+The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
+contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
+1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
+We developed this model during the
+[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+organized by Hugging Face. We developed this model as part of the project:
+[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
+## Intended uses
+Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+By default, input text longer than 256 word pieces is truncated.
+## Training procedure
+### Pre-training
+We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+### Fine-tuning
+We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
+We then apply the cross entropy loss by comparing with true pairs.
+#### Hyper parameters
+We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
+We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
+#### Training data
+We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
+We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
+| Dataset                                                  | Paper                                    | Number of training tuples  |
+|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs  | - | 25,316,456 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs  | - | 21,396,559 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs  | - | 21,396,559 |
+| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
+| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
+| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
+| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
+| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+| **Total** | | **1,170,060,424** |

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.8.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.0.0",
+    "transformers": "4.6.1",
+    "pytorch": "1.8.1"
+  }
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
+size 90868376

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 256,
+  "do_lower_case": false
+}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}

model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

query.py ADDED Viewed

	@@ -0,0 +1,89 @@

+import argparse
+import os
+from dataclasses import dataclass
+import chromadb
+import yaml
+from langchain.chains.llm import LLMChain
+from langchain.vectorstores.chroma import Chroma
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
+from langchain_core.prompts import PromptTemplate
+CONFIG_PATH = os.path.join('config', 'default_config.yaml')
+CHROMA_PATH = "chroma"
+MODEL_CACHE = "model_cache"
+PROMPT_TEMPLATE = """
+Answer the question based only on the following context:
+{context}
+---
+Answer the question based on the above context: {question}
+"""
+def main():
+    # Create CLI.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("query_text", type=str, help="The query text.")
+    args = parser.parse_args()
+    query_text = args.query_text
+    # Prepare the DB.
+    hf_embed_func = HuggingFaceEmbeddings(
+        model_name="all-MiniLM-L6-v2",
+        model_kwargs={'device': 'cpu'},
+        encode_kwargs={'normalize_embeddings': False},
+        cache_folder=MODEL_CACHE
+    )
+    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hf_embed_func, collection_name="jscholar_rag")
+    client = chromadb.PersistentClient(path=CHROMA_PATH)
+    collection = client.get_collection(name="jscholar_rag")
+    print(f"Total Embeddings: {collection.count()}")
+    print(collection.peek())
+    # Search the DB.
+    results = db.similarity_search_with_relevance_scores(query_text, k=5)
+    # results = db.similarity_search(query_text)
+    if len(results) == 0 or results[0][1] < 0.1:
+        print(f"Unable to find matching results.")
+        return
+    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
+    # prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
+    prompt = PromptTemplate(
+        input_variables=[context_text, query_text], template=PROMPT_TEMPLATE
+    )
+    #prompt = prompt_template.format(context=context_text, question=query_text)
+    llm = HuggingFaceEndpoint(
+        repo_id="HuggingFaceH4/zephyr-7b-beta",
+        task="text-generation",
+        top_k=30,
+        temperature=0.1,
+        repetition_penalty=1.03,
+        max_new_tokens=512,
+    )
+    chat_model = LLMChain(prompt=prompt, llm=llm)
+    response_text = chat_model.invoke({'question': query_text, 'context': context_text})
+    sources = [doc.metadata.get("source", None) for doc, _score in results]
+    formatted_response = f"{response_text.get('text')}"
+    formatted_sources = f"Citations: {sources}"
+    print(formatted_response)
+    print(formatted_sources)
+def load_config():
+    with open(CONFIG_PATH, 'r') as file:
+        loaded_data = yaml.safe_load(file)
+    return loaded_data
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+langchain
+langchain-community
+unstructured[pdf]
+unstructured
+chromadb
+openai
+tiktoken

test.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from pdf2image import convert_from_path
+pdf = "C:\\Users\\tsande16\\PycharmProjects\\jscholar-rag\\raw-data\\edward-john-real-estate-program\\Abernathy_LibertyCenterVA_2002_Mueller.pdf"
+images = convert_from_path(pdf, 500,poppler_path=r'C:\Program Files (x86)\poppler-24.02.0\Library\bin')
+for i, image in enumerate(images):
+    fname = 'image'+str(i)+'.png'
+    image.save(fname, "PNG")