Spaces:
Sleeping
Sleeping
FAISS vector db for HF spaces compatibility
Browse files
- fast_app.py +9 -12
- ingest.py +5 -7
- requirements.txt +7 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin +0 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin +0 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin +0 -3
- stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin +0 -0
- stores/czech_512/chroma.sqlite3 +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin +0 -3
- stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin +0 -3
- stores/english_512/chroma.sqlite3 +0 -3
fast_app.py
CHANGED
@@ -7,20 +7,15 @@ from fastapi.templating import Jinja2Templates
|
|
7 |
from fastapi.staticfiles import StaticFiles
|
8 |
from fastapi.encoders import jsonable_encoder
|
9 |
|
10 |
-
from
|
11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
|
13 |
from langchain.chains import RetrievalQA
|
14 |
-
|
15 |
-
TextLoader,
|
16 |
-
PyPDFLoader,
|
17 |
-
DirectoryLoader,
|
18 |
-
UnstructuredFileLoader,
|
19 |
-
)
|
20 |
-
from langchain.document_loaders.csv_loader import CSVLoader
|
21 |
from langchain.llms import OpenAI
|
22 |
from langchain import PromptTemplate
|
23 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
|
|
|
24 |
|
25 |
from ingest import Ingest
|
26 |
|
@@ -31,7 +26,7 @@ from ingest import Ingest
|
|
31 |
# if huggingface_token is None:
|
32 |
# raise ValueError("Hugging Face token is not set in environment variables.")
|
33 |
|
34 |
-
openai_api_key =
|
35 |
if openai_api_key is None:
|
36 |
raise ValueError("OAI token is not set in environment variables.")
|
37 |
|
@@ -39,8 +34,8 @@ if openai_api_key is None:
|
|
39 |
app = FastAPI()
|
40 |
templates = Jinja2Templates(directory="templates")
|
41 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
42 |
-
english_embedding_model="text-embedding-3-large"
|
43 |
-
czech_embedding_model="Seznam/simcse-dist-mpnet-paracrawl-cs-en"
|
44 |
|
45 |
czech_store = "stores/czech_512"
|
46 |
english_store = "stores/english_512"
|
@@ -55,6 +50,7 @@ ingestor = Ingest(
|
|
55 |
english_embedding_model=english_embedding_model,
|
56 |
)
|
57 |
|
|
|
58 |
def prompt_en():
|
59 |
prompt_template_en = """You are electrical engineer and you answer users ###Question.
|
60 |
|
@@ -75,6 +71,7 @@ def prompt_en():
|
|
75 |
print("\n Prompt ready... \n\n")
|
76 |
return prompt_en
|
77 |
|
|
|
78 |
def prompt_cz():
|
79 |
prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
|
80 |
|
@@ -144,7 +141,7 @@ async def get_response(query: str = Form(...), language: str = Form(...)):
|
|
144 |
model=embedding_model,
|
145 |
)
|
146 |
|
147 |
-
vectordb =
|
148 |
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
|
149 |
|
150 |
chain_type_kwargs = {"prompt": prompt}
|
|
|
7 |
from fastapi.staticfiles import StaticFiles
|
8 |
from fastapi.encoders import jsonable_encoder
|
9 |
|
10 |
+
from langchain_community.vectorstores import FAISS
|
11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
|
13 |
from langchain.chains import RetrievalQA
|
14 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
from langchain.llms import OpenAI
|
16 |
from langchain import PromptTemplate
|
17 |
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
|
18 |
+
import chainlit as cl
|
19 |
|
20 |
from ingest import Ingest
|
21 |
|
|
|
26 |
# if huggingface_token is None:
|
27 |
# raise ValueError("Hugging Face token is not set in environment variables.")
|
28 |
|
29 |
+
openai_api_key = os.environ.get("OPENAI_API_KEY")  # read the secret from the environment (needs `import os`); never commit API keys
|
30 |
if openai_api_key is None:
|
31 |
raise ValueError("OAI token is not set in environment variables.")
|
32 |
|
|
|
34 |
app = FastAPI()
|
35 |
templates = Jinja2Templates(directory="templates")
|
36 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
37 |
+
english_embedding_model = "text-embedding-3-large"
|
38 |
+
czech_embedding_model = "Seznam/simcse-dist-mpnet-paracrawl-cs-en"
|
39 |
|
40 |
czech_store = "stores/czech_512"
|
41 |
english_store = "stores/english_512"
|
|
|
50 |
english_embedding_model=english_embedding_model,
|
51 |
)
|
52 |
|
53 |
+
|
54 |
def prompt_en():
|
55 |
prompt_template_en = """You are electrical engineer and you answer users ###Question.
|
56 |
|
|
|
71 |
print("\n Prompt ready... \n\n")
|
72 |
return prompt_en
|
73 |
|
74 |
+
|
75 |
def prompt_cz():
|
76 |
prompt_template_cz = """Jste elektroinženýr a odpovídáte uživatelům na ###Otázku.
|
77 |
|
|
|
141 |
model=embedding_model,
|
142 |
)
|
143 |
|
144 |
+
vectordb = FAISS.load_local(persist_directory, embedding)
|
145 |
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
|
146 |
|
147 |
chain_type_kwargs = {"prompt": prompt}
|
ingest.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from
|
2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
|
4 |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
@@ -53,12 +53,11 @@ class Ingest:
|
|
53 |
)
|
54 |
texts = text_splitter.split_documents(documents)
|
55 |
|
56 |
-
vectordb =
|
57 |
documents=texts,
|
58 |
embedding=embedding,
|
59 |
-
persist_directory=self.english_store,
|
60 |
-
collection_metadata={"hnsw:space": "cosine"},
|
61 |
)
|
|
|
62 |
|
63 |
print("\n English vector Store Created.......\n\n")
|
64 |
|
@@ -84,12 +83,11 @@ class Ingest:
|
|
84 |
)
|
85 |
|
86 |
texts = text_splitter.split_documents(documents)
|
87 |
-
vectordb =
|
88 |
documents=texts,
|
89 |
embedding=embedding,
|
90 |
-
persist_directory=self.czech_store,
|
91 |
-
collection_metadata={"hnsw:space": "cosine"},
|
92 |
)
|
|
|
93 |
|
94 |
print("\n Czech vector Store Created.......\n\n")
|
95 |
|
|
|
1 |
+
from langchain_community.vectorstores import FAISS
|
2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
|
4 |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
|
|
|
53 |
)
|
54 |
texts = text_splitter.split_documents(documents)
|
55 |
|
56 |
+
vectordb = FAISS.from_documents(
|
57 |
documents=texts,
|
58 |
embedding=embedding,
|
|
|
|
|
59 |
)
|
60 |
+
vectordb.save_local(self.english_store)
|
61 |
|
62 |
print("\n English vector Store Created.......\n\n")
|
63 |
|
|
|
83 |
)
|
84 |
|
85 |
texts = text_splitter.split_documents(documents)
|
86 |
+
vectordb = FAISS.from_documents(
|
87 |
documents=texts,
|
88 |
embedding=embedding,
|
|
|
|
|
89 |
)
|
90 |
+
vectordb.save_local(self.czech_store)
|
91 |
|
92 |
print("\n Czech vector Store Created.......\n\n")
|
93 |
|
requirements.txt
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
-
|
|
|
|
|
2 |
fastapi
|
|
|
|
|
3 |
uvicorn
|
4 |
python-multipart
|
5 |
ctransformers
|
@@ -9,8 +13,8 @@ sentence_transformers
|
|
9 |
chromadb
|
10 |
pytesseract
|
11 |
fitz
|
12 |
-
libpff-python
|
13 |
openai
|
14 |
tiktoken
|
15 |
frontend
|
16 |
-
|
|
|
1 |
+
|
2 |
+
langchain-community==0.0.19
|
3 |
+
langchain==0.1.6
|
4 |
fastapi
|
5 |
+
faiss-cpu
|
6 |
+
pypdf
|
7 |
uvicorn
|
8 |
python-multipart
|
9 |
ctransformers
|
|
|
13 |
chromadb
|
14 |
pytesseract
|
15 |
fitz
|
16 |
+
#libpff-python
|
17 |
openai
|
18 |
tiktoken
|
19 |
frontend
|
20 |
+
chainlit
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/data_level0.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:5f8157971983f837eca48b97187f0e8a435eb21270cd49d831db21678670bc4a
|
3 |
-
size 1164000
|
|
|
|
|
|
|
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/header.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:9a3499aedbeb5c8ea26813ed567be6748293334099aa733c4d8cf0c4ec0ee6e3
|
3 |
-
size 100
|
|
|
|
|
|
|
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/length.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:612e017796cdd9eef6ba562cbe8c02e16b8c07f3fbac9f1254934f02e2261084
|
3 |
-
size 4000
|
|
|
|
|
|
|
|
stores/czech_512/9b9472a9-9f91-4b34-880b-b7752517675a/link_lists.bin
DELETED
File without changes
|
stores/czech_512/chroma.sqlite3
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2187862ccdfdb78565366853a939dc50038908171936c8584d69a09b55aa4e7c
|
3 |
-
size 1929216
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/data_level0.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:6f812eacc9c05db367748cf1e0576bdcd28e0b3eaf09d5f3095a1b0e03f71cc8
|
3 |
-
size 12428000
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/header.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:9882e5d786d4ca5fba4a783054685cf6e05b1637aaf586e43ec0e933e30e961d
|
3 |
-
size 100
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/index_metadata.pickle
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d49c7e9538b2cfc154773a96a1fcdbf4a4247c3b510bb68d2aa6f2b24e902fca
|
3 |
-
size 55974
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/length.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:bd6e73e535a8843ce30d35a4ba88436bcb5687583474e276a3b1f8689c1477bd
|
3 |
-
size 4000
|
|
|
|
|
|
|
|
stores/english_512/3af5c10a-ea06-4cbe-beaf-8497680ad526/link_lists.bin
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:fe35f087195e70122f597edc9b62da9d3ce370b40307b5556ebbe4e185fb46d4
|
3 |
-
size 8624
|
|
|
|
|
|
|
|
stores/english_512/chroma.sqlite3
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:369ede691a1330113d353e1a425a7cd24ad9d76ee61ee542adab1f12a6887146
|
3 |
-
size 26963968
|
|
|
|
|
|
|
|