Spaces:
Runtime error
Runtime error
tim-sanders
commited on
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- .gitignore +10 -0
- README.md +3 -9
- chat-ui.py +93 -0
- chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/data_level0.bin +3 -0
- chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/header.bin +3 -0
- chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/index_metadata.pickle +3 -0
- chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/length.bin +3 -0
- chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/link_lists.bin +3 -0
- chroma/chroma.sqlite3 +3 -0
- chroma_mgmt_queries.py +34 -0
- config/default_config.yaml +8 -0
- create_vector_db.py +142 -0
- fine_tune_model.py +14 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/8b3219a92973c328a8e22fadcfa821b5dc75636a/added_tokens.json +0 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/added_tokens.json +0 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main +1 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/1_Pooling/config.json +7 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/README.md +177 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config.json +24 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config_sentence_transformers.json +7 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/model.safetensors +3 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/modules.json +20 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/sentence_bert_config.json +4 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/special_tokens_map.json +1 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer.json +0 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer_config.json +1 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/vocab.txt +0 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/1_Pooling/config.json +7 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/README.md +177 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config.json +24 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config_sentence_transformers.json +7 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/model.safetensors +3 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/modules.json +20 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/sentence_bert_config.json +4 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/special_tokens_map.json +1 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer.json +0 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer_config.json +1 -0
- model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/vocab.txt +0 -0
- query.py +89 -0
- requirements.txt +7 -0
- test.py +8 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.xml
|
2 |
+
*.iml
|
3 |
+
.idea/
|
4 |
+
.venv
|
5 |
+
.venv2
|
6 |
+
./config/default_config.yaml
|
7 |
+
./model_cache
|
8 |
+
./raw-data
|
9 |
+
./chroma
|
10 |
+
hf_*/
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: red
|
5 |
-
colorTo: gray
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 4.
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: JScholar_RAG_Prototype
|
3 |
+
app_file: chat-ui.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
+
sdk_version: 4.38.1
|
|
|
|
|
6 |
---
|
|
|
|
chat-ui.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import yaml
|
4 |
+
from langchain.chains.llm import LLMChain
|
5 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
6 |
+
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
|
7 |
+
from langchain_community.vectorstores import Chroma
|
8 |
+
from langchain_core.prompts import PromptTemplate
|
9 |
+
|
10 |
+
CONFIG_PATH = os.path.join('config', 'default_config.yaml')
|
11 |
+
|
12 |
+
|
13 |
+
def main():
|
14 |
+
config = load_config()
|
15 |
+
title = "Ask a Johns Hopkins Librarian!"
|
16 |
+
description = """
|
17 |
+
This chat bot is an expert on the <a href=https://jscholarship.library.jhu.edu/handle/1774.2/35703>Edward St. John
|
18 |
+
Real Estate Program - Practicum Projects</a> collection. It will answer any question regarding these papers!
|
19 |
+
<img src="https://jscholarship.library.jhu.edu/assets/j10p/images/libraries.logo.small.horizontal.white.png"
|
20 |
+
width=200px>
|
21 |
+
"""
|
22 |
+
article = """
|
23 |
+
This Retrieval Augmented Retrieval (RAG) chat bot is designed to answer questions only on the
|
24 |
+
<a href="https://jscholarship.library.jhu.edu/handle/1774.2/35703">Edward St. John Real Estate Program - Practicum Projects</a> collection
|
25 |
+
"""
|
26 |
+
|
27 |
+
os.environ["HUGGINGFACEHUB_API_TOKEN"] = config['tokens']['hugging_face']
|
28 |
+
gr.Interface(fn=predict,
|
29 |
+
inputs="text",
|
30 |
+
title=title,
|
31 |
+
description=description,
|
32 |
+
article=article,
|
33 |
+
examples=[
|
34 |
+
["What plans did Baltimore have to transition from an industrial city to tourism?"],
|
35 |
+
["What type of market analysis and feasibility studies were performed for the 9 St. Marys Street project in Annapolis Maryland?"],
|
36 |
+
["What are the feasibility studies of moving Johns Hopkins Medicine admin departments back to the East Baltimore campus?"]
|
37 |
+
],
|
38 |
+
outputs="text").launch(share=False)
|
39 |
+
|
40 |
+
|
41 |
+
def predict(prompt):
|
42 |
+
config = load_config()
|
43 |
+
|
44 |
+
prompt_template = """
|
45 |
+
Answer the question based only on the following context:
|
46 |
+
|
47 |
+
{context}
|
48 |
+
|
49 |
+
---
|
50 |
+
|
51 |
+
Answer the question based on the above context: {question}
|
52 |
+
"""
|
53 |
+
|
54 |
+
hf_embed_func = HuggingFaceEmbeddings(
|
55 |
+
model_name="all-MiniLM-L6-v2",
|
56 |
+
model_kwargs={'device': 'cpu'},
|
57 |
+
encode_kwargs={'normalize_embeddings': False},
|
58 |
+
cache_folder=config['models']['model_cache_path']
|
59 |
+
)
|
60 |
+
db = Chroma(persist_directory=config['chroma_db']['chroma_path'],
|
61 |
+
embedding_function=hf_embed_func,
|
62 |
+
collection_name="jscholar_rag")
|
63 |
+
|
64 |
+
results = db.similarity_search_with_relevance_scores(prompt, k=7)
|
65 |
+
|
66 |
+
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
|
67 |
+
|
68 |
+
llm = HuggingFaceEndpoint(
|
69 |
+
repo_id="HuggingFaceH4/zephyr-7b-beta",
|
70 |
+
task="text-generation",
|
71 |
+
top_k=30,
|
72 |
+
temperature=0.1,
|
73 |
+
repetition_penalty=1.03,
|
74 |
+
max_new_tokens=512,
|
75 |
+
)
|
76 |
+
prompt_template_filled = PromptTemplate(
|
77 |
+
input_variables=[context_text, prompt], template=prompt_template
|
78 |
+
)
|
79 |
+
chat_model = LLMChain(llm=llm, prompt=prompt_template_filled)
|
80 |
+
response_text = chat_model.invoke({'question': prompt, 'context': context_text})
|
81 |
+
formatted_response = f"{response_text.get('text')}"
|
82 |
+
return formatted_response
|
83 |
+
|
84 |
+
|
85 |
+
def load_config():
|
86 |
+
with open(CONFIG_PATH, 'r') as file:
|
87 |
+
loaded_data = yaml.safe_load(file)
|
88 |
+
|
89 |
+
return loaded_data
|
90 |
+
|
91 |
+
|
92 |
+
if __name__ == "__main__":
|
93 |
+
main()
|
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/data_level0.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79962aaa138d440f9bf87ad8d22caf0520a21298473cf867fffc33dfbecb0124
|
3 |
+
size 232964000
|
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/header.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d8e96a61d978f213efbfae84ab2dc0a67fb3cb08162b50eb1ac269e532ce320d
|
3 |
+
size 100
|
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/index_metadata.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9cddee6aa3f6cd2497c61e78e9cac2a943753223e5ab2a22c2a47a864bcf600d
|
3 |
+
size 8502757
|
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/length.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0bf567f0d48a89d8ac48d52e5de92e086b5fa699ebfe3e7e18f7d33927cbddb2
|
3 |
+
size 556000
|
chroma/7d2cc428-ccc5-4d3f-88be-2bd54cbd6b88/link_lists.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b557aa42d5a8a941fc09cbf993bb8539ef2bfaa4761e94934c383f2b00986aa0
|
3 |
+
size 1194112
|
chroma/chroma.sqlite3
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e49dc61d09537b88cc8b221cd447a74281fcd4fbf82a8926e4959019841acea
|
3 |
+
size 893464576
|
chroma_mgmt_queries.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import chromadb
|
2 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
3 |
+
|
4 |
+
CHROMA_PATH = "chroma"
|
5 |
+
|
6 |
+
|
7 |
+
def main():
|
8 |
+
client = chromadb.PersistentClient(path=CHROMA_PATH)
|
9 |
+
collection = client.get_collection(name="jscholar_rag")
|
10 |
+
print(f"Total Embeddings: {collection.count()}")
|
11 |
+
print(collection.peek())
|
12 |
+
|
13 |
+
# Retrieve all documents
|
14 |
+
documents = collection.get()
|
15 |
+
|
16 |
+
pdfs = documents['metadatas']
|
17 |
+
pdfs_source = [item['source'] for item in pdfs]
|
18 |
+
|
19 |
+
prefix_to_remove = "C:\\Users\\tsande16\\PycharmProjects\\jscholar-rag\\raw-data\\add_data\\"
|
20 |
+
|
21 |
+
# Remove the prefix from each path in the list
|
22 |
+
pdf_names = sorted(list(set([path.replace(prefix_to_remove, '') for path in pdfs_source if path.startswith(prefix_to_remove)])))
|
23 |
+
|
24 |
+
# Sort PDFs by their name in alphabetical order
|
25 |
+
# pdfs_sorted = sorted(pdfs, key=lambda x: x['metadatas'])
|
26 |
+
|
27 |
+
# Print sorted PDF names
|
28 |
+
print("List of PDFs in the database (ordered by name):")
|
29 |
+
for pdf in pdf_names:
|
30 |
+
print(pdf)
|
31 |
+
|
32 |
+
|
33 |
+
if __name__ == "__main__":
|
34 |
+
main()
|
config/default_config.yaml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
chroma_db:
|
2 |
+
chroma_path: "chroma"
|
3 |
+
|
4 |
+
data_load:
|
5 |
+
data_load_path: "C:\\Users\\tsande16\\PycharmProjects\\jscholar-rag\\raw-data\\edward-john-real-estate-program"
|
6 |
+
|
7 |
+
models:
|
8 |
+
model_cache_path: "model_cache"
|
create_vector_db.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import chromadb
|
3 |
+
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
|
4 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
+
from langchain.schema import Document
|
6 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
7 |
+
from langchain.vectorstores.chroma import Chroma
|
8 |
+
import os
|
9 |
+
import shutil
|
10 |
+
import yaml
|
11 |
+
|
12 |
+
CONFIG_PATH = os.path.join('config', 'default_config.yaml')
|
13 |
+
|
14 |
+
|
15 |
+
def main():
|
16 |
+
config = load_config()
|
17 |
+
parser = argparse.ArgumentParser()
|
18 |
+
parser.add_argument("--action",
|
19 |
+
dest="db_action",
|
20 |
+
help="Values are: (C)reate New or (A)dd to Existing",
|
21 |
+
required=True,
|
22 |
+
choices=['c', 'a'])
|
23 |
+
parser.add_argument("--add-path",
|
24 |
+
dest="add_path",
|
25 |
+
help="Path to the documents you wish to add to the vector DB")
|
26 |
+
|
27 |
+
args = parser.parse_args()
|
28 |
+
if args.db_action == 'c':
|
29 |
+
generate_data_store(config['data_load']['data_load_path'])
|
30 |
+
elif args.db_action == 'a':
|
31 |
+
add_documents_to_data_store(args.add_path)
|
32 |
+
|
33 |
+
|
34 |
+
def add_documents_to_data_store(data_path):
|
35 |
+
file_paths = traverse_directory(data_path)
|
36 |
+
for file in file_paths:
|
37 |
+
documents = load_documents(file)
|
38 |
+
chunks = split_text(documents)
|
39 |
+
add_to_chroma(chunks)
|
40 |
+
|
41 |
+
|
42 |
+
def generate_data_store(data_path):
|
43 |
+
documents = load_documents(data_path)
|
44 |
+
chunks = split_text(documents)
|
45 |
+
save_to_chroma(chunks)
|
46 |
+
|
47 |
+
|
48 |
+
def load_documents(document_data_path):
|
49 |
+
# loader = DirectoryLoader(document_data_path, silent_errors=True, glob="*.pdf")
|
50 |
+
loader = UnstructuredFileLoader(document_data_path)
|
51 |
+
documents = loader.load()
|
52 |
+
return documents
|
53 |
+
|
54 |
+
|
55 |
+
def traverse_directory(directory):
|
56 |
+
file_paths = []
|
57 |
+
for root, dirs, files in os.walk(directory):
|
58 |
+
for file in files:
|
59 |
+
file_path = os.path.join(root, file)
|
60 |
+
file_paths.append(file_path)
|
61 |
+
return file_paths
|
62 |
+
|
63 |
+
|
64 |
+
def split_text(documents: list[Document]):
|
65 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
66 |
+
chunk_size=500,
|
67 |
+
chunk_overlap=50,
|
68 |
+
length_function=len,
|
69 |
+
add_start_index=True,
|
70 |
+
)
|
71 |
+
chunks = text_splitter.split_documents(documents)
|
72 |
+
print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
|
73 |
+
|
74 |
+
document = chunks[10]
|
75 |
+
print(document.page_content)
|
76 |
+
print(document.metadata)
|
77 |
+
|
78 |
+
return chunks
|
79 |
+
|
80 |
+
|
81 |
+
def save_to_chroma(chunks: list[Document]):
|
82 |
+
config = load_config()
|
83 |
+
# Bert Sentence Transformer Embeddings
|
84 |
+
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
85 |
+
hf_embed = HuggingFaceEmbeddings(
|
86 |
+
model_name="all-MiniLM-L6-v2",
|
87 |
+
model_kwargs={'device': 'cpu'},
|
88 |
+
encode_kwargs={'normalize_embeddings': False},
|
89 |
+
cache_folder=config['models']['model_cache_path']
|
90 |
+
)
|
91 |
+
|
92 |
+
# Create a new DB from the documents.
|
93 |
+
db = Chroma.from_documents(
|
94 |
+
documents=chunks,
|
95 |
+
embedding=hf_embed,
|
96 |
+
collection_name="jscholar_rag",
|
97 |
+
persist_directory=config['chroma_db']['chroma_path']
|
98 |
+
)
|
99 |
+
db.persist()
|
100 |
+
|
101 |
+
print(f"Saved {len(chunks)} chunks to {config['chroma_db']['chroma_path']}.")
|
102 |
+
|
103 |
+
# test the database:
|
104 |
+
query = "Why is Maryland appealing for solar users?"
|
105 |
+
docs = db.similarity_search(query)
|
106 |
+
|
107 |
+
# print results
|
108 |
+
print(docs[0].page_content)
|
109 |
+
|
110 |
+
|
111 |
+
def add_to_chroma(chunks: list[Document]):
|
112 |
+
config = load_config()
|
113 |
+
hf_embed = HuggingFaceEmbeddings(
|
114 |
+
model_name="all-MiniLM-L6-v2",
|
115 |
+
model_kwargs={'device': 'cpu'},
|
116 |
+
encode_kwargs={'normalize_embeddings': False},
|
117 |
+
cache_folder=config['models']['model_cache_path']
|
118 |
+
)
|
119 |
+
|
120 |
+
db = Chroma(
|
121 |
+
embedding_function=hf_embed, collection_name="jscholar_rag", persist_directory=config['chroma_db']['chroma_path']
|
122 |
+
)
|
123 |
+
db.add_documents(documents=chunks)
|
124 |
+
db.persist()
|
125 |
+
|
126 |
+
print(f"Added {len(chunks)} new chunks to {config['chroma_db']['chroma_path']}.")
|
127 |
+
|
128 |
+
|
129 |
+
def load_config():
|
130 |
+
with open(CONFIG_PATH, 'r') as file:
|
131 |
+
loaded_data = yaml.safe_load(file)
|
132 |
+
|
133 |
+
return loaded_data
|
134 |
+
|
135 |
+
|
136 |
+
def setup_tokens():
|
137 |
+
config = load_config()
|
138 |
+
os.environ["HF_TOKEN"] = config['tokens']['hugging_face']
|
139 |
+
|
140 |
+
|
141 |
+
if __name__ == "__main__":
|
142 |
+
main()
|
fine_tune_model.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
from transformers import AutoTokenizer, DataCollatorWithPadding
|
3 |
+
|
4 |
+
raw_datasets = load_dataset("glue", "mrpc")
|
5 |
+
checkpoint = "bert-base-uncased"
|
6 |
+
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
7 |
+
|
8 |
+
|
9 |
+
def tokenize_function(example):
|
10 |
+
return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
|
11 |
+
|
12 |
+
|
13 |
+
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
|
14 |
+
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/8b3219a92973c328a8e22fadcfa821b5dc75636a/added_tokens.json
ADDED
File without changes
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/added_tokens.json
ADDED
File without changes
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/refs/main
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
8b3219a92973c328a8e22fadcfa821b5dc75636a
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 384,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false
|
7 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/README.md
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: en
|
3 |
+
license: apache-2.0
|
4 |
+
library_name: sentence-transformers
|
5 |
+
tags:
|
6 |
+
- sentence-transformers
|
7 |
+
- feature-extraction
|
8 |
+
- sentence-similarity
|
9 |
+
- transformers
|
10 |
+
datasets:
|
11 |
+
- s2orc
|
12 |
+
- flax-sentence-embeddings/stackexchange_xml
|
13 |
+
- ms_marco
|
14 |
+
- gooaq
|
15 |
+
- yahoo_answers_topics
|
16 |
+
- code_search_net
|
17 |
+
- search_qa
|
18 |
+
- eli5
|
19 |
+
- snli
|
20 |
+
- multi_nli
|
21 |
+
- wikihow
|
22 |
+
- natural_questions
|
23 |
+
- trivia_qa
|
24 |
+
- embedding-data/sentence-compression
|
25 |
+
- embedding-data/flickr30k-captions
|
26 |
+
- embedding-data/altlex
|
27 |
+
- embedding-data/simple-wiki
|
28 |
+
- embedding-data/QQP
|
29 |
+
- embedding-data/SPECTER
|
30 |
+
- embedding-data/PAQ_pairs
|
31 |
+
- embedding-data/WikiAnswers
|
32 |
+
pipeline_tag: sentence-similarity
|
33 |
+
---
|
34 |
+
|
35 |
+
|
36 |
+
# all-MiniLM-L6-v2
|
37 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
38 |
+
|
39 |
+
## Usage (Sentence-Transformers)
|
40 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
41 |
+
|
42 |
+
```
|
43 |
+
pip install -U sentence-transformers
|
44 |
+
```
|
45 |
+
|
46 |
+
Then you can use the model like this:
|
47 |
+
```python
|
48 |
+
from sentence_transformers import SentenceTransformer
|
49 |
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
50 |
+
|
51 |
+
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
52 |
+
embeddings = model.encode(sentences)
|
53 |
+
print(embeddings)
|
54 |
+
```
|
55 |
+
|
56 |
+
## Usage (HuggingFace Transformers)
|
57 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
58 |
+
|
59 |
+
```python
|
60 |
+
from transformers import AutoTokenizer, AutoModel
|
61 |
+
import torch
|
62 |
+
import torch.nn.functional as F
|
63 |
+
|
64 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
65 |
+
def mean_pooling(model_output, attention_mask):
|
66 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
67 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
68 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
69 |
+
|
70 |
+
|
71 |
+
# Sentences we want sentence embeddings for
|
72 |
+
sentences = ['This is an example sentence', 'Each sentence is converted']
|
73 |
+
|
74 |
+
# Load model from HuggingFace Hub
|
75 |
+
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
76 |
+
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
77 |
+
|
78 |
+
# Tokenize sentences
|
79 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
80 |
+
|
81 |
+
# Compute token embeddings
|
82 |
+
with torch.no_grad():
|
83 |
+
model_output = model(**encoded_input)
|
84 |
+
|
85 |
+
# Perform pooling
|
86 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
87 |
+
|
88 |
+
# Normalize embeddings
|
89 |
+
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
90 |
+
|
91 |
+
print("Sentence embeddings:")
|
92 |
+
print(sentence_embeddings)
|
93 |
+
```
|
94 |
+
|
95 |
+
## Evaluation Results
|
96 |
+
|
97 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L6-v2)
|
98 |
+
|
99 |
+
------
|
100 |
+
|
101 |
+
## Background
|
102 |
+
|
103 |
+
The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
|
104 |
+
contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
|
105 |
+
1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
|
106 |
+
|
107 |
+
We developed this model during the
|
108 |
+
[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
|
109 |
+
organized by Hugging Face. We developed this model as part of the project:
|
110 |
+
[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
|
111 |
+
|
112 |
+
## Intended uses
|
113 |
+
|
114 |
+
Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
|
115 |
+
the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
|
116 |
+
|
117 |
+
By default, input text longer than 256 word pieces is truncated.
|
118 |
+
|
119 |
+
|
120 |
+
## Training procedure
|
121 |
+
|
122 |
+
### Pre-training
|
123 |
+
|
124 |
+
We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
|
125 |
+
|
126 |
+
### Fine-tuning
|
127 |
+
|
128 |
+
We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
|
129 |
+
We then apply the cross entropy loss by comparing with true pairs.
|
130 |
+
|
131 |
+
#### Hyper parameters
|
132 |
+
|
133 |
+
We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
|
134 |
+
We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
|
135 |
+
a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
|
136 |
+
|
137 |
+
#### Training data
|
138 |
+
|
139 |
+
We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
|
140 |
+
We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
|
141 |
+
|
142 |
+
|
143 |
+
| Dataset | Paper | Number of training tuples |
|
144 |
+
|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
|
145 |
+
| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
|
146 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
|
147 |
+
| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
|
148 |
+
| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
|
149 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
|
150 |
+
| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
|
151 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
|
152 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
|
153 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
|
154 |
+
| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
|
155 |
+
| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
|
156 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
|
157 |
+
| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
|
158 |
+
| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
|
159 |
+
| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
|
160 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
|
161 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
|
162 |
+
| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
|
163 |
+
| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
|
164 |
+
| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
|
165 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
|
166 |
+
| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
|
167 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
|
168 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
|
169 |
+
| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
|
170 |
+
| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
|
171 |
+
| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
|
172 |
+
| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
|
173 |
+
| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
|
174 |
+
| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
|
175 |
+
| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
|
176 |
+
| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
|
177 |
+
| **Total** | | **1,170,060,424** |
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 384,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 1536,
|
13 |
+
"layer_norm_eps": 1e-12,
|
14 |
+
"max_position_embeddings": 512,
|
15 |
+
"model_type": "bert",
|
16 |
+
"num_attention_heads": 12,
|
17 |
+
"num_hidden_layers": 6,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"transformers_version": "4.8.2",
|
21 |
+
"type_vocab_size": 2,
|
22 |
+
"use_cache": true,
|
23 |
+
"vocab_size": 30522
|
24 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "2.0.0",
|
4 |
+
"transformers": "4.6.1",
|
5 |
+
"pytorch": "1.8.1"
|
6 |
+
}
|
7 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
|
3 |
+
size 90868376
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Normalize",
|
18 |
+
"type": "sentence_transformers.models.Normalize"
|
19 |
+
}
|
20 |
+
]
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 256,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/8b3219a92973c328a8e22fadcfa821b5dc75636a/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/1_Pooling/config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 384,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false
|
7 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/README.md
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language: en
|
3 |
+
license: apache-2.0
|
4 |
+
library_name: sentence-transformers
|
5 |
+
tags:
|
6 |
+
- sentence-transformers
|
7 |
+
- feature-extraction
|
8 |
+
- sentence-similarity
|
9 |
+
- transformers
|
10 |
+
datasets:
|
11 |
+
- s2orc
|
12 |
+
- flax-sentence-embeddings/stackexchange_xml
|
13 |
+
- ms_marco
|
14 |
+
- gooaq
|
15 |
+
- yahoo_answers_topics
|
16 |
+
- code_search_net
|
17 |
+
- search_qa
|
18 |
+
- eli5
|
19 |
+
- snli
|
20 |
+
- multi_nli
|
21 |
+
- wikihow
|
22 |
+
- natural_questions
|
23 |
+
- trivia_qa
|
24 |
+
- embedding-data/sentence-compression
|
25 |
+
- embedding-data/flickr30k-captions
|
26 |
+
- embedding-data/altlex
|
27 |
+
- embedding-data/simple-wiki
|
28 |
+
- embedding-data/QQP
|
29 |
+
- embedding-data/SPECTER
|
30 |
+
- embedding-data/PAQ_pairs
|
31 |
+
- embedding-data/WikiAnswers
|
32 |
+
pipeline_tag: sentence-similarity
|
33 |
+
---
|
34 |
+
|
35 |
+
|
36 |
+
# all-MiniLM-L6-v2
|
37 |
+
This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
|
38 |
+
|
39 |
+
## Usage (Sentence-Transformers)
|
40 |
+
Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
|
41 |
+
|
42 |
+
```
|
43 |
+
pip install -U sentence-transformers
|
44 |
+
```
|
45 |
+
|
46 |
+
Then you can use the model like this:
|
47 |
+
```python
|
48 |
+
from sentence_transformers import SentenceTransformer
|
49 |
+
sentences = ["This is an example sentence", "Each sentence is converted"]
|
50 |
+
|
51 |
+
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
52 |
+
embeddings = model.encode(sentences)
|
53 |
+
print(embeddings)
|
54 |
+
```
|
55 |
+
|
56 |
+
## Usage (HuggingFace Transformers)
|
57 |
+
Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
|
58 |
+
|
59 |
+
```python
|
60 |
+
from transformers import AutoTokenizer, AutoModel
|
61 |
+
import torch
|
62 |
+
import torch.nn.functional as F
|
63 |
+
|
64 |
+
#Mean Pooling - Take attention mask into account for correct averaging
|
65 |
+
def mean_pooling(model_output, attention_mask):
|
66 |
+
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
67 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
|
68 |
+
return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
|
69 |
+
|
70 |
+
|
71 |
+
# Sentences we want sentence embeddings for
|
72 |
+
sentences = ['This is an example sentence', 'Each sentence is converted']
|
73 |
+
|
74 |
+
# Load model from HuggingFace Hub
|
75 |
+
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
76 |
+
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
|
77 |
+
|
78 |
+
# Tokenize sentences
|
79 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
80 |
+
|
81 |
+
# Compute token embeddings
|
82 |
+
with torch.no_grad():
|
83 |
+
model_output = model(**encoded_input)
|
84 |
+
|
85 |
+
# Perform pooling
|
86 |
+
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
87 |
+
|
88 |
+
# Normalize embeddings
|
89 |
+
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
90 |
+
|
91 |
+
print("Sentence embeddings:")
|
92 |
+
print(sentence_embeddings)
|
93 |
+
```
|
94 |
+
|
95 |
+
## Evaluation Results
|
96 |
+
|
97 |
+
For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=sentence-transformers/all-MiniLM-L6-v2)
|
98 |
+
|
99 |
+
------
|
100 |
+
|
101 |
+
## Background
|
102 |
+
|
103 |
+
The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
|
104 |
+
contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
|
105 |
+
1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
|
106 |
+
|
107 |
+
We developed this model during the
|
108 |
+
[Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
|
109 |
+
organized by Hugging Face. We developed this model as part of the project:
|
110 |
+
[Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
|
111 |
+
|
112 |
+
## Intended uses
|
113 |
+
|
114 |
+
Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
|
115 |
+
the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
|
116 |
+
|
117 |
+
By default, input text longer than 256 word pieces is truncated.
|
118 |
+
|
119 |
+
|
120 |
+
## Training procedure
|
121 |
+
|
122 |
+
### Pre-training
|
123 |
+
|
124 |
+
We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
|
125 |
+
|
126 |
+
### Fine-tuning
|
127 |
+
|
128 |
+
We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
|
129 |
+
We then apply the cross entropy loss by comparing with true pairs.
|
130 |
+
|
131 |
+
#### Hyper parameters
|
132 |
+
|
133 |
+
We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
|
134 |
+
We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
|
135 |
+
a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
|
136 |
+
|
137 |
+
#### Training data
|
138 |
+
|
139 |
+
We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
|
140 |
+
We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
|
141 |
+
|
142 |
+
|
143 |
+
| Dataset | Paper | Number of training tuples |
|
144 |
+
|--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
|
145 |
+
| [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
|
146 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
|
147 |
+
| [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
|
148 |
+
| [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
|
149 |
+
| [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
|
150 |
+
| [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
|
151 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
|
152 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
|
153 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
|
154 |
+
| [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
|
155 |
+
| [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
|
156 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
|
157 |
+
| [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
|
158 |
+
| [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
|
159 |
+
| [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
|
160 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
|
161 |
+
| [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
|
162 |
+
| [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
|
163 |
+
| [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
|
164 |
+
| [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
|
165 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
|
166 |
+
| AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
|
167 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
|
168 |
+
| [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
|
169 |
+
| [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
|
170 |
+
| [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
|
171 |
+
| [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
|
172 |
+
| [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
|
173 |
+
| [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
|
174 |
+
| [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
|
175 |
+
| [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
|
176 |
+
| [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
|
177 |
+
| **Total** | | **1,170,060,424** |
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"BertModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"gradient_checkpointing": false,
|
8 |
+
"hidden_act": "gelu",
|
9 |
+
"hidden_dropout_prob": 0.1,
|
10 |
+
"hidden_size": 384,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 1536,
|
13 |
+
"layer_norm_eps": 1e-12,
|
14 |
+
"max_position_embeddings": 512,
|
15 |
+
"model_type": "bert",
|
16 |
+
"num_attention_heads": 12,
|
17 |
+
"num_hidden_layers": 6,
|
18 |
+
"pad_token_id": 0,
|
19 |
+
"position_embedding_type": "absolute",
|
20 |
+
"transformers_version": "4.8.2",
|
21 |
+
"type_vocab_size": 2,
|
22 |
+
"use_cache": true,
|
23 |
+
"vocab_size": 30522
|
24 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "2.0.0",
|
4 |
+
"transformers": "4.6.1",
|
5 |
+
"pytorch": "1.8.1"
|
6 |
+
}
|
7 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
|
3 |
+
size 90868376
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/modules.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"idx": 2,
|
16 |
+
"name": "2",
|
17 |
+
"path": "2_Normalize",
|
18 |
+
"type": "sentence_transformers.models.Normalize"
|
19 |
+
}
|
20 |
+
]
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 256,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
|
model_cache/models--sentence-transformers--all-MiniLM-L6-v2/snapshots/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
query.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
from dataclasses import dataclass
|
4 |
+
|
5 |
+
import chromadb
|
6 |
+
import yaml
|
7 |
+
from langchain.chains.llm import LLMChain
|
8 |
+
from langchain.vectorstores.chroma import Chroma
|
9 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
10 |
+
from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
|
11 |
+
from langchain_core.prompts import PromptTemplate
|
12 |
+
|
13 |
+
CONFIG_PATH = os.path.join('config', 'default_config.yaml')
|
14 |
+
CHROMA_PATH = "chroma"
|
15 |
+
MODEL_CACHE = "model_cache"
|
16 |
+
PROMPT_TEMPLATE = """
|
17 |
+
Answer the question based only on the following context:
|
18 |
+
|
19 |
+
{context}
|
20 |
+
|
21 |
+
---
|
22 |
+
|
23 |
+
Answer the question based on the above context: {question}
|
24 |
+
"""
|
25 |
+
|
26 |
+
|
27 |
+
def main():
|
28 |
+
# Create CLI.
|
29 |
+
parser = argparse.ArgumentParser()
|
30 |
+
parser.add_argument("query_text", type=str, help="The query text.")
|
31 |
+
args = parser.parse_args()
|
32 |
+
query_text = args.query_text
|
33 |
+
|
34 |
+
# Prepare the DB.
|
35 |
+
hf_embed_func = HuggingFaceEmbeddings(
|
36 |
+
model_name="all-MiniLM-L6-v2",
|
37 |
+
model_kwargs={'device': 'cpu'},
|
38 |
+
encode_kwargs={'normalize_embeddings': False},
|
39 |
+
cache_folder=MODEL_CACHE
|
40 |
+
)
|
41 |
+
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=hf_embed_func, collection_name="jscholar_rag")
|
42 |
+
|
43 |
+
client = chromadb.PersistentClient(path=CHROMA_PATH)
|
44 |
+
collection = client.get_collection(name="jscholar_rag")
|
45 |
+
print(f"Total Embeddings: {collection.count()}")
|
46 |
+
print(collection.peek())
|
47 |
+
|
48 |
+
# Search the DB.
|
49 |
+
results = db.similarity_search_with_relevance_scores(query_text, k=5)
|
50 |
+
# results = db.similarity_search(query_text)
|
51 |
+
if len(results) == 0 or results[0][1] < 0.1:
|
52 |
+
print(f"Unable to find matching results.")
|
53 |
+
return
|
54 |
+
|
55 |
+
context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
|
56 |
+
# prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
|
57 |
+
prompt = PromptTemplate(
|
58 |
+
input_variables=[context_text, query_text], template=PROMPT_TEMPLATE
|
59 |
+
)
|
60 |
+
#prompt = prompt_template.format(context=context_text, question=query_text)
|
61 |
+
|
62 |
+
llm = HuggingFaceEndpoint(
|
63 |
+
repo_id="HuggingFaceH4/zephyr-7b-beta",
|
64 |
+
task="text-generation",
|
65 |
+
top_k=30,
|
66 |
+
temperature=0.1,
|
67 |
+
repetition_penalty=1.03,
|
68 |
+
max_new_tokens=512,
|
69 |
+
)
|
70 |
+
chat_model = LLMChain(prompt=prompt, llm=llm)
|
71 |
+
|
72 |
+
response_text = chat_model.invoke({'question': query_text, 'context': context_text})
|
73 |
+
|
74 |
+
sources = [doc.metadata.get("source", None) for doc, _score in results]
|
75 |
+
formatted_response = f"{response_text.get('text')}"
|
76 |
+
formatted_sources = f"Citations: {sources}"
|
77 |
+
print(formatted_response)
|
78 |
+
print(formatted_sources)
|
79 |
+
|
80 |
+
|
81 |
+
def load_config():
|
82 |
+
with open(CONFIG_PATH, 'r') as file:
|
83 |
+
loaded_data = yaml.safe_load(file)
|
84 |
+
|
85 |
+
return loaded_data
|
86 |
+
|
87 |
+
|
88 |
+
if __name__ == "__main__":
|
89 |
+
main()
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
langchain
|
2 |
+
langchain-community
|
3 |
+
unstructured[pdf]
|
4 |
+
unstructured
|
5 |
+
chromadb
|
6 |
+
openai
|
7 |
+
tiktoken
|
test.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdf2image import convert_from_path
|
2 |
+
|
3 |
+
pdf = "C:\\Users\\tsande16\\PycharmProjects\\jscholar-rag\\raw-data\\edward-john-real-estate-program\\Abernathy_LibertyCenterVA_2002_Mueller.pdf"
|
4 |
+
|
5 |
+
images = convert_from_path(pdf, 500,poppler_path=r'C:\Program Files (x86)\poppler-24.02.0\Library\bin')
|
6 |
+
for i, image in enumerate(images):
|
7 |
+
fname = 'image'+str(i)+'.png'
|
8 |
+
image.save(fname, "PNG")
|